Update version number and copyright year.
[exim.git] / src / src / parse.c
1 /* $Cambridge: exim/src/src/parse.c,v 1.11 2007/01/08 10:50:18 ph10 Exp $ */
2
3 /*************************************************
4 * Exim - an Internet mail transport agent *
5 *************************************************/
6
7 /* Copyright (c) University of Cambridge 1995 - 2007 */
8 /* See the file NOTICE for conditions of use and distribution. */
9
10 /* Functions for parsing addresses */
11
12
13 #include "exim.h"
14
15
16 static uschar *last_comment_position;
17
18
19
20 /* In stand-alone mode, provide a replacement for deliver_make_addr()
21 and rewrite_address[_qualify]() so as to avoid having to drag in too much
22 redundant apparatus. */
23
24 #ifdef STAND_ALONE
25
26 address_item *deliver_make_addr(uschar *address, BOOL copy)
27 {
28 address_item *addr = store_get(sizeof(address_item));
29 addr->next = NULL;
30 addr->parent = NULL;
31 addr->address = address;
32 return addr;
33 }
34
35 uschar *rewrite_address(uschar *recipient, BOOL dummy1, BOOL dummy2, rewrite_rule
36 *dummy3, int dummy4)
37 {
38 return recipient;
39 }
40
41 uschar *rewrite_address_qualify(uschar *recipient, BOOL dummy1)
42 {
43 return recipient;
44 }
45
46 #endif
47
48
49
50
51 /*************************************************
52 * Find the end of an address *
53 *************************************************/
54
55 /* Scan over a string looking for the termination of an address at a comma,
56 or end of the string. It's the source-routed addresses which cause much pain
57 here. Although Exim ignores source routes, it must recognize such addresses, so
58 we cannot get rid of this logic.
59
60 Argument:
61 s pointer to the start of an address
62 nl_ends if TRUE, '\n' terminates an address
63
64 Returns: pointer past the end of the address
65 (i.e. points to null or comma)
66 */
67
68 uschar *
69 parse_find_address_end(uschar *s, BOOL nl_ends)
70 {
71 BOOL source_routing = *s == '@';
72 int no_term = source_routing? 1 : 0;
73
74 while (*s != 0 && (*s != ',' || no_term > 0) && (*s != '\n' || !nl_ends))
75 {
76 /* Skip single quoted characters. Strictly these should not occur outside
77 quoted strings in RFC 822 addresses, but they can in RFC 821 addresses. Pity
78 about the lack of consistency, isn't it? */
79
80 if (*s == '\\' && s[1] != 0) s += 2;
81
82 /* Skip quoted items that are not inside brackets. Note that
83 quoted pairs are allowed inside quoted strings. */
84
85 else if (*s == '\"')
86 {
87 while (*(++s) != 0 && (*s != '\n' || !nl_ends))
88 {
89 if (*s == '\\' && s[1] != 0) s++;
90 else if (*s == '\"') { s++; break; }
91 }
92 }
93
94 /* Skip comments, which may include nested brackets, but quotes
95 are not recognized inside comments, though quoted pairs are. */
96
97 else if (*s == '(')
98 {
99 int level = 1;
100 while (*(++s) != 0 && (*s != '\n' || !nl_ends))
101 {
102 if (*s == '\\' && s[1] != 0) s++;
103 else if (*s == '(') level++;
104 else if (*s == ')' && --level <= 0) { s++; break; }
105 }
106 }
107
108 /* Non-special character; just advance. Passing the colon in a source
109 routed address means that any subsequent comma or colon may terminate unless
110 inside angle brackets. */
111
112 else
113 {
114 if (*s == '<')
115 {
116 source_routing = s[1] == '@';
117 no_term = source_routing? 2 : 1;
118 }
119 else if (*s == '>') no_term--;
120 else if (source_routing && *s == ':') no_term--;
121 s++;
122 }
123 }
124
125 return s;
126 }
127
128
129
130 /*************************************************
131 * Find last @ in an address *
132 *************************************************/
133
134 /* This function is used when we have something that may not be qualified. If we
135 know it's qualified, searching for the rightmost '@' is sufficient. Here we
136 have to be a bit more clever than just a plain search, in order to handle
137 unqualified local parts like "thing@thong" correctly. Since quotes may not
138 legally be part of a domain name, we can give up on hitting the first quote
139 when searching from the right. Now that the parsing also permits the RFC 821
140 form of address, where quoted-pairs are allowed in unquoted local parts, we
141 must take care to handle that too.
142
143 Argument: pointer to an address, possibly unqualified
144 Returns: pointer to the last @ in an address, or NULL if none
145 */
146
147 uschar *
148 parse_find_at(uschar *s)
149 {
150 uschar *t = s + Ustrlen(s);
151 while (--t >= s)
152 {
153 if (*t == '@')
154 {
155 int backslash_count = 0;
156 uschar *tt = t - 1;
157 while (tt > s && *tt-- == '\\') backslash_count++;
158 if ((backslash_count & 1) == 0) return t;
159 }
160 else if (*t == '\"') return NULL;
161 }
162 return NULL;
163 }
164
165
166
167
168 /***************************************************************************
169 * In all the functions below that read a particular object type from *
170 * the input, return the new value of the pointer s (the first argument), *
171 * and put the object into the store pointed to by t (the second argument), *
172 * adding a terminating zero. If no object is found, t will point to zero *
173 * on return. *
174 ***************************************************************************/
175
176
177 /*************************************************
178 * Skip white space and comment *
179 *************************************************/
180
181 /* Algorithm:
182 (1) Skip spaces.
183 (2) If uschar not '(', return.
184 (3) Skip till matching ')', not counting any characters
185 escaped with '\'.
186 (4) Move past ')' and goto (1).
187
188 The start of the last potential comment position is remembered to
189 make it possible to ignore comments at the end of compound items.
190
191 Argument: current character pointer
192 Returns: new character pointer
193 */
194
195 static uschar *
196 skip_comment(uschar *s)
197 {
198 last_comment_position = s;
199 while (*s)
200 {
201 int c, level;
202 while (isspace(*s)) s++;
203 if (*s != '(') break;
204 level = 1;
205 while((c = *(++s)) != 0)
206 {
207 if (c == '(') level++;
208 else if (c == ')') { if (--level <= 0) { s++; break; } }
209 else if (c == '\\' && s[1] != 0) s++;
210 }
211 }
212 return s;
213 }
214
215
216
217 /*************************************************
218 * Read a domain *
219 *************************************************/
220
221 /* A domain is a sequence of subdomains, separated by dots. See comments below
222 for detailed syntax of the subdomains.
223
224 If allow_domain_literals is TRUE, a "domain" may also be an IP address enclosed
225 in []. Make sure the output is set to the null string if there is a syntax
226 error as well as if there is no domain at all.
227
228 Arguments:
229 s current character pointer
230 t where to put the domain
231 errorptr put error message here on failure (*t will be 0 on exit)
232
233 Returns: new character pointer
234 */
235
236 static uschar *
237 read_domain(uschar *s, uschar *t, uschar **errorptr)
238 {
239 uschar *tt = t;
240 s = skip_comment(s);
241
242 /* Handle domain literals if permitted. An RFC 822 domain literal may contain
243 any character except [ ] \, including linear white space, and may contain
244 quoted characters. However, RFC 821 restricts literals to being dot-separated
245 3-digit numbers, and we make the obvious extension for IPv6. Go for a sequence
246 of digits, dots, hex digits, and colons here; later this will be checked for
247 being a syntactically valid IP address if it ever gets to a router.
248
249 Allow both the formal IPv6 form, with IPV6: at the start, and the informal form
250 without it, and accept IPV4: as well, 'cause someone will use it sooner or
251 later. */
252
253 if (*s == '[')
254 {
255 *t++ = *s++;
256
257 if (strncmpic(s, US"IPv6:", 5) == 0 || strncmpic(s, US"IPv4:", 5) == 0)
258 {
259 memcpy(t, s, 5);
260 t += 5;
261 s += 5;
262 }
263 while (*s == '.' || *s == ':' || isxdigit(*s)) *t++ = *s++;
264
265 if (*s == ']') *t++ = *s++; else
266 {
267 *errorptr = US"malformed domain literal";
268 *tt = 0;
269 }
270
271 if (!allow_domain_literals)
272 {
273 *errorptr = US"domain literals not allowed";
274 *tt = 0;
275 }
276 *t = 0;
277 return skip_comment(s);
278 }
279
280 /* Handle a proper domain, which is a sequence of dot-separated atoms. Remove
281 trailing dots if strip_trailing_dot is set. A subdomain is an atom.
282
283 An atom is a sequence of any characters except specials, space, and controls.
284 The specials are ( ) < > @ , ; : \ " . [ and ]. This is the rule for RFC 822
285 and its successor (RFC 2822). However, RFC 821 and its successor (RFC 2821) is
286 tighter, allowing only letters, digits, and hyphens, not starting with a
287 hyphen.
288
289 There used to be a global flag that got set when checking addresses that came
290 in over SMTP and which should therefore should be checked according to the
291 stricter rule. However, it seems silly to make the distinction, because I don't
292 suppose anybody ever uses local domains that are 822-compliant and not
293 821-compliant. Furthermore, Exim now has additional data on the spool file line
294 after an address (after "one_time" processing), and it makes use of a #
295 character to delimit it. When I wrote that code, I forgot about this 822-domain
296 stuff, and assumed # could never appear in a domain.
297
298 So the old code is now cut out for Release 4.11 onwards, on 09-Aug-02. In a few
299 years, when we are sure this isn't actually causing trouble, throw it away.
300
301 March 2003: the story continues: There is a camp that is arguing for the use of
302 UTF-8 in domain names as the way to internationalization, and other MTAs
303 support this. Therefore, we now have a flag that permits the use of characters
304 with values greater than 127, encoded in UTF-8, in subdomains, so that Exim can
305 be used experimentally in this way. */
306
307 for (;;)
308 {
309 uschar *tsave = t;
310
311 /*********************
312 if (rfc821_domains)
313 {
314 if (*s != '-') while (isalnum(*s) || *s == '-') *t++ = *s++;
315 }
316 else
317 while (!mac_iscntrl_or_special(*s)) *t++ = *s++;
318 *********************/
319
320 if (*s != '-')
321 {
322 /* Only letters, digits, and hyphens */
323
324 if (!allow_utf8_domains)
325 {
326 while (isalnum(*s) || *s == '-') *t++ = *s++;
327 }
328
329 /* Permit legal UTF-8 characters to be included */
330
331 else for(;;)
332 {
333 int i, d;
334 if (isalnum(*s) || *s == '-') /* legal ascii characters */
335 {
336 *t++ = *s++;
337 continue;
338 }
339 if ((*s & 0xc0) != 0xc0) break; /* not start of UTF-8 character */
340 d = *s << 2;
341 for (i = 1; i < 6; i++) /* i is the number of additional bytes */
342 {
343 if ((d & 0x80) == 0) break;
344 d <<= 1;
345 }
346 if (i == 6) goto BAD_UTF8; /* invalid UTF-8 */
347 *t++ = *s++; /* leading UTF-8 byte */
348 while (i-- > 0) /* copy and check remainder */
349 {
350 if ((*s & 0xc0) != 0x80)
351 {
352 BAD_UTF8:
353 *errorptr = US"invalid UTF-8 byte sequence";
354 *tt = 0;
355 return s;
356 }
357 *t++ = *s++;
358 }
359 } /* End of loop for UTF-8 character */
360 } /* End of subdomain */
361
362 s = skip_comment(s);
363 *t = 0;
364
365 if (t == tsave) /* empty component */
366 {
367 if (strip_trailing_dot && t > tt && *s != '.') t[-1] = 0; else
368 {
369 *errorptr = US"domain missing or malformed";
370 *tt = 0;
371 }
372 return s;
373 }
374
375 if (*s != '.') break;
376 *t++ = *s++;
377 s = skip_comment(s);
378 }
379
380 return s;
381 }
382
383
384
385 /*************************************************
386 * Read a local-part *
387 *************************************************/
388
389 /* A local-part is a sequence of words, separated by periods. A null word
390 between dots is not strictly allowed but apparently many mailers permit it,
391 so, sigh, better be compatible. Even accept a trailing dot...
392
393 A <word> is either a quoted string, or an <atom>, which is a sequence
394 of any characters except specials, space, and controls. The specials are
395 ( ) < > @ , ; : \ " . [ and ]. In RFC 822, a single quoted character, (a
396 quoted-pair) is not allowed in a word. However, in RFC 821, it is permitted in
397 the local part of an address. Rather than have separate parsing functions for
398 the different cases, take the liberal attitude always. At least one MUA is
399 happy to recognize this case; I don't know how many other programs do.
400
401 Arguments:
402 s current character pointer
403 t where to put the local part
404 error where to point error text
405 allow_null TRUE if an empty local part is not an error
406
407 Returns: new character pointer
408 */
409
410 static uschar *
411 read_local_part(uschar *s, uschar *t, uschar **error, BOOL allow_null)
412 {
413 uschar *tt = t;
414 *error = NULL;
415 for (;;)
416 {
417 int c;
418 uschar *tsave = t;
419 s = skip_comment(s);
420
421 /* Handle a quoted string */
422
423 if (*s == '\"')
424 {
425 *t++ = '\"';
426 while ((c = *(++s)) != 0 && c != '\"')
427 {
428 *t++ = c;
429 if (c == '\\' && s[1] != 0) *t++ = *(++s);
430 }
431 if (c == '\"')
432 {
433 s++;
434 *t++ = '\"';
435 }
436 else
437 {
438 *error = US"unmatched doublequote in local part";
439 return s;
440 }
441 }
442
443 /* Handle an atom, but allow quoted pairs within it. */
444
445 else while (!mac_iscntrl_or_special(*s) || *s == '\\')
446 {
447 c = *t++ = *s++;
448 if (c == '\\' && *s != 0) *t++ = *s++;
449 }
450
451 /* Terminate the word and skip subsequent comment */
452
453 *t = 0;
454 s = skip_comment(s);
455
456 /* If we have read a null component at this point, give an error unless it is
457 terminated by a dot - an extension to RFC 822 - or if it is the first
458 component of the local part and an empty local part is permitted, in which
459 case just return normally. */
460
461 if (t == tsave && *s != '.')
462 {
463 if (t == tt && !allow_null)
464 *error = US"missing or malformed local part";
465 return s;
466 }
467
468 /* Anything other than a dot terminates the local part. Treat multiple dots
469 as a single dot, as this seems to be a common extension. */
470
471 if (*s != '.') break;
472 do { *t++ = *s++; } while (*s == '.');
473 }
474
475 return s;
476 }
477
478
479 /*************************************************
480 * Read route part of route-addr *
481 *************************************************/
482
483 /* The pointer is at the initial "@" on entry. Return it following the
484 terminating colon. Exim no longer supports the use of source routes, but it is
485 required to accept the syntax.
486
487 Arguments:
488 s current character pointer
489 t where to put the route
490 errorptr where to put an error message
491
492 Returns: new character pointer
493 */
494
495 static uschar *
496 read_route(uschar *s, uschar *t, uschar **errorptr)
497 {
498 BOOL commas = FALSE;
499 *errorptr = NULL;
500
501 while (*s == '@')
502 {
503 *t++ = '@';
504 s = read_domain(s+1, t, errorptr);
505 if (*t == 0) return s;
506 t += Ustrlen((const uschar *)t);
507 if (*s != ',') break;
508 *t++ = *s++;
509 commas = TRUE;
510 s = skip_comment(s);
511 }
512
513 if (*s == ':') *t++ = *s++;
514
515 /* If there is no colon, and there were no commas, the most likely error
516 is in fact a missing local part in the address rather than a missing colon
517 after the route. */
518
519 else *errorptr = commas?
520 US"colon expected after route list" :
521 US"no local part";
522
523 /* Terminate the route and return */
524
525 *t = 0;
526 return skip_comment(s);
527 }
528
529
530
531 /*************************************************
532 * Read addr-spec *
533 *************************************************/
534
535 /* Addr-spec is local-part@domain. We make the domain optional -
536 the expected terminator for the whole thing is passed to check this.
537 This function is called only when we know we have a route-addr.
538
539 Arguments:
540 s current character pointer
541 t where to put the addr-spec
542 term expected terminator (0 or >)
543 errorptr where to put an error message
544 domainptr set to point to the start of the domain
545
546 Returns: new character pointer
547 */
548
549 static uschar *
550 read_addr_spec(uschar *s, uschar *t, int term, uschar **errorptr,
551 uschar **domainptr)
552 {
553 s = read_local_part(s, t, errorptr, FALSE);
554 if (*errorptr == NULL)
555 {
556 if (*s != term)
557 {
558 if (*s != '@')
559 *errorptr = string_sprintf("\"@\" or \".\" expected after \"%s\"", t);
560 else
561 {
562 t += Ustrlen((const uschar *)t);
563 *t++ = *s++;
564 *domainptr = t;
565 s = read_domain(s, t, errorptr);
566 }
567 }
568 }
569 return s;
570 }
571
572
573
574 /*************************************************
575 * Extract operative address *
576 *************************************************/
577
578 /* This function extracts an operative address from a full RFC822 mailbox and
579 returns it in a piece of dynamic store. We take the easy way and get a piece
580 of store the same size as the input, and then copy into it whatever is
581 necessary. If we cannot find a valid address (syntax error), return NULL, and
582 point the error pointer to the reason. The arguments "start" and "end" are used
583 to return the offsets of the first and one past the last characters in the
584 original mailbox of the address that has been extracted, to aid in re-writing.
585 The argument "domain" is set to point to the first character after "@" in the
586 final part of the returned address, or zero if there is no @.
587
588 Exim no longer supports the use of source routed addresses (those of the form
589 @domain,...:route_addr). It recognizes the syntax, but collapses such addresses
590 down to their final components. Formerly, collapse_source_routes had to be set
591 to achieve this effect. RFC 1123 allows collapsing with MAY, while the revision
592 of RFC 821 had increased this to SHOULD, so I've gone for it, because it makes
593 a lot of code elsewhere in Exim much simpler.
594
595 There are some special fudges here for handling RFC 822 group address notation
596 which may appear in certain headers. If the flag parse_allow_group is set
597 TRUE and parse_found_group is FALSE when this function is called, an address
598 which is the start of a group (i.e. preceded by a phrase and a colon) is
599 recognized; the phrase is ignored and the flag parse_found_group is set. If
600 this flag is TRUE at the end of an address, and if an extraneous semicolon is
601 found, it is ignored and the flag is cleared.
602
603 This logic is used only when scanning through addresses in headers, either to
604 fulfil the -t option, or for rewriting, or for checking header syntax. Because
605 the group "state" has to be remembered between multiple calls of this function,
606 the variables parse_{allow,found}_group are global. It is important to ensure
607 that they are reset to FALSE at the end of scanning a header's list of
608 addresses.
609
610 Arguments:
611 mailbox points to the RFC822 mailbox
612 errorptr where to point an error message
613 start set to start offset in mailbox
614 end set to end offset in mailbox
615 domain set to domain offset in result, or 0 if no domain present
616 allow_null allow <> if TRUE
617
618 Returns: points to the extracted address, or NULL on error
619 */
620
621 #define FAILED(s) { *errorptr = s; goto PARSE_FAILED; }
622
623 uschar *
624 parse_extract_address(uschar *mailbox, uschar **errorptr, int *start, int *end,
625 int *domain, BOOL allow_null)
626 {
627 uschar *yield = store_get(Ustrlen(mailbox) + 1);
628 uschar *startptr, *endptr;
629 uschar *s = (uschar *)mailbox;
630 uschar *t = (uschar *)yield;
631
632 *domain = 0;
633
634 /* At the start of the string we expect either an addr-spec or a phrase
635 preceding a <route-addr>. If groups are allowed, we might also find a phrase
636 preceding a colon and an address. If we find an initial word followed by
637 a dot, strict interpretation of the RFC would cause it to be taken
638 as the start of an addr-spec. However, many mailers break the rules
639 and use addresses of the form "a.n.other <ano@somewhere>" and so we
640 allow this case. */
641
642 RESTART: /* Come back here after passing a group name */
643
644 s = skip_comment(s);
645 startptr = s; /* In case addr-spec */
646 s = read_local_part(s, t, errorptr, TRUE); /* Dot separated words */
647 if (*errorptr != NULL) goto PARSE_FAILED;
648
649 /* If the terminator is neither < nor @ then the format of the address
650 must either be a bare local-part (we are now at the end), or a phrase
651 followed by a route-addr (more words must follow). */
652
653 if (*s != '@' && *s != '<')
654 {
655 if (*s == 0 || *s == ';')
656 {
657 if (*t == 0) FAILED(US"empty address");
658 endptr = last_comment_position;
659 goto PARSE_SUCCEEDED; /* Bare local part */
660 }
661
662 /* Expect phrase route-addr, or phrase : if groups permitted, but allow
663 dots in the phrase; complete the loop only when '<' or ':' is encountered -
664 end of string will produce a null local_part and therefore fail. We don't
665 need to keep updating t, as the phrase isn't to be kept. */
666
667 while (*s != '<' && (!parse_allow_group || *s != ':'))
668 {
669 s = read_local_part(s, t, errorptr, FALSE);
670 if (*errorptr != NULL)
671 {
672 *errorptr = string_sprintf("%s (expected word or \"<\")", *errorptr);
673 goto PARSE_FAILED;
674 }
675 }
676
677 if (*s == ':')
678 {
679 parse_found_group = TRUE;
680 parse_allow_group = FALSE;
681 s++;
682 goto RESTART;
683 }
684
685 /* Assert *s == '<' */
686 }
687
688 /* At this point the next character is either '@' or '<'. If it is '@', only a
689 single local-part has previously been read. An angle bracket signifies the
690 start of an <addr-spec>. Throw away anything we have saved so far before
691 processing it. Note that this is "if" rather than "else if" because it's also
692 used after reading a preceding phrase.
693
694 There are a lot of broken sendmails out there that put additional pairs of <>
695 round <route-addr>s. If strip_excess_angle_brackets is set, allow any number of
696 them, as long as they match. */
697
698 if (*s == '<')
699 {
700 uschar *domainptr = yield;
701 BOOL source_routed = FALSE;
702 int bracket_count = 1;
703
704 s++;
705 if (strip_excess_angle_brackets)
706 while (*s == '<') { bracket_count++; s++; }
707
708 t = yield;
709 startptr = s;
710 s = skip_comment(s);
711
712 /* Read an optional series of routes, each of which is a domain. They
713 are separated by commas and terminated by a colon. However, we totally ignore
714 such routes (RFC 1123 says we MAY, and the revision of RFC 821 says we
715 SHOULD). */
716
717 if (*s == '@')
718 {
719 s = read_route(s, t, errorptr);
720 if (*errorptr != NULL) goto PARSE_FAILED;
721 *t = 0; /* Ensure route is ignored - probably overkill */
722 source_routed = TRUE;
723 }
724
725 /* Now an addr-spec, terminated by '>'. If there is no preceding route,
726 we must allow an empty addr-spec if allow_null is TRUE, to permit the
727 address "<>" in some circumstances. A source-routed address MUST have
728 a domain in the final part. */
729
730 if (allow_null && !source_routed && *s == '>')
731 {
732 *t = 0;
733 *errorptr = NULL;
734 }
735 else
736 {
737 s = read_addr_spec(s, t, '>', errorptr, &domainptr);
738 if (*errorptr != NULL) goto PARSE_FAILED;
739 *domain = domainptr - yield;
740 if (source_routed && *domain == 0)
741 FAILED(US"domain missing in source-routed address");
742 }
743
744 endptr = s;
745 if (*errorptr != NULL) goto PARSE_FAILED;
746 while (bracket_count-- > 0) if (*s++ != '>')
747 {
748 *errorptr = (s[-1] == 0)? US"'>' missing at end of address" :
749 string_sprintf("malformed address: %.32s may not follow %.*s",
750 s-1, s - (uschar *)mailbox - 1, mailbox);
751 goto PARSE_FAILED;
752 }
753
754 s = skip_comment(s);
755 }
756
757 /* Hitting '@' after the first local-part means we have definitely got an
758 addr-spec, on a strict reading of the RFC, and the rest of the string
759 should be the domain. However, for flexibility we allow for a route-address
760 not enclosed in <> as well, which is indicated by an empty first local
761 part preceding '@'. The source routing is, however, ignored. */
762
763 else if (*t == 0)
764 {
765 uschar *domainptr = yield;
766 s = read_route(s, t, errorptr);
767 if (*errorptr != NULL) goto PARSE_FAILED;
768 *t = 0; /* Ensure route is ignored - probably overkill */
769 s = read_addr_spec(s, t, 0, errorptr, &domainptr);
770 if (*errorptr != NULL) goto PARSE_FAILED;
771 *domain = domainptr - yield;
772 endptr = last_comment_position;
773 if (*domain == 0) FAILED(US"domain missing in source-routed address");
774 }
775
776 /* This is the strict case of local-part@domain. */
777
778 else
779 {
780 t += Ustrlen((const uschar *)t);
781 *t++ = *s++;
782 *domain = t - yield;
783 s = read_domain(s, t, errorptr);
784 if (*t == 0) goto PARSE_FAILED;
785 endptr = last_comment_position;
786 }
787
788 /* Use goto to get here from the bare local part case. Arrive by falling
789 through for other cases. Endptr may have been moved over whitespace, so
790 move it back past white space if necessary. */
791
792 PARSE_SUCCEEDED:
793 if (*s != 0)
794 {
795 if (parse_found_group && *s == ';')
796 {
797 parse_found_group = FALSE;
798 parse_allow_group = TRUE;
799 }
800 else
801 {
802 *errorptr = string_sprintf("malformed address: %.32s may not follow %.*s",
803 s, s - (uschar *)mailbox, mailbox);
804 goto PARSE_FAILED;
805 }
806 }
807 *start = startptr - (uschar *)mailbox; /* Return offsets */
808 while (isspace(endptr[-1])) endptr--;
809 *end = endptr - (uschar *)mailbox;
810
811 /* Although this code has no limitation on the length of address extracted,
812 other parts of Exim may have limits, and in any case, RFC 2821 limits local
813 parts to 64 and domains to 255, so we do a check here, giving an error if the
814 address is ridiculously long. */
815
816 if (*end - *start > ADDRESS_MAXLENGTH)
817 {
818 *errorptr = string_sprintf("address is ridiculously long: %.64s...", yield);
819 return NULL;
820 }
821
822 return (uschar *)yield;
823
824 /* Use goto (via the macro FAILED) to get to here from a variety of places.
825 We might have an empty address in a group - the caller can choose to ignore
826 this. We must, however, keep the flags correct. */
827
828 PARSE_FAILED:
829 if (parse_found_group && *s == ';')
830 {
831 parse_found_group = FALSE;
832 parse_allow_group = TRUE;
833 }
834 return NULL;
835 }
836
837 #undef FAILED
838
839
840
841 /*************************************************
842 * Quote according to RFC 2047 *
843 *************************************************/
844
845 /* This function is used for quoting text in headers according to RFC 2047.
846 If the only characters that strictly need quoting are spaces, we return the
847 original string, unmodified. If a quoted string is too long for the buffer, it
848 is truncated. (This shouldn't happen: this is normally handling short strings.)
849
850 Hmmph. As always, things get perverted for other uses. This function was
851 originally for the "phrase" part of addresses. Now it is being used for much
852 longer texts in ACLs and via the ${rfc2047: expansion item. This means we have
853 to check for overlong "encoded-word"s and split them. November 2004.
854
855 Arguments:
856 string the string to quote - already checked to contain non-printing
857 chars
858 len the length of the string
859 charset the name of the character set; NULL => iso-8859-1
860 buffer the buffer to put the answer in
861 buffer_size the size of the buffer
862 fold if TRUE, a newline is inserted before the separating space when
863 more than one encoded-word is generated
864
865 Returns: pointer to the original string, if no quoting needed, or
866 pointer to buffer containing the quoted string, or
867 a pointer to "String too long" if the buffer can't even hold
868 the introduction
869 */
870
871 uschar *
872 parse_quote_2047(uschar *string, int len, uschar *charset, uschar *buffer,
873 int buffer_size, BOOL fold)
874 {
875 uschar *s = string;
876 uschar *p, *t;
877 int hlen;
878 BOOL coded = FALSE;
879
880 if (charset == NULL) charset = US"iso-8859-1";
881
882 /* We don't expect this to fail! */
883
884 if (!string_format(buffer, buffer_size, "=?%s?Q?", charset))
885 return US"String too long";
886
887 hlen = Ustrlen(buffer);
888 t = buffer + hlen;
889 p = buffer;
890
891 for (; len > 0; len--)
892 {
893 int ch = *s++;
894 if (t > buffer + buffer_size - hlen - 8) break;
895
896 if (t - p > 70)
897 {
898 *t++ = '?';
899 *t++ = '=';
900 if (fold) *t++ = '\n';
901 *t++ = ' ';
902 p = t;
903 Ustrncpy(p, buffer, hlen);
904 t += hlen;
905 }
906
907 if (ch < 33 || ch > 126 ||
908 Ustrchr("?=()<>@,;:\\\".[]_", ch) != NULL)
909 {
910 if (ch == ' ') *t++ = '_'; else
911 {
912 sprintf(CS t, "=%02X", ch);
913 while (*t != 0) t++;
914 coded = TRUE;
915 }
916 }
917 else *t++ = ch;
918 }
919
920 *t++ = '?';
921 *t++ = '=';
922 *t = 0;
923
924 return coded? buffer : string;
925 }
926
927
928
929
930 /*************************************************
931 * Fix up an RFC 822 "phrase" *
932 *************************************************/
933
934 /* This function is called to repair any syntactic defects in the "phrase" part
935 of an RFC822 address. In particular, it is applied to the user's name as read
936 from the passwd file when accepting a local message, and to the data from the
937 -F option.
938
939 If the string contains existing quoted strings or comments containing
940 freestanding quotes, then we just quote those bits that need quoting -
941 otherwise it would get awfully messy and probably not look good. If not, we
942 quote the whole thing if necessary. Thus
943
944 John Q. Smith => "John Q. Smith"
945 John "Jack" Smith => John "Jack" Smith
946 John "Jack" Q. Smith => John "Jack" "Q." Smith
947 John (Jack) Q. Smith => "John (Jack) Q. Smith"
948 John ("Jack") Q. Smith => John ("Jack") "Q." Smith
949 but
950 John (\"Jack\") Q. Smith => "John (\"Jack\") Q. Smith"
951
952 Sheesh! This is tedious code. It is a great pity that the syntax of RFC822 is
953 the way it is...
954
955 August 2000: Additional code added:
956
957 Previously, non-printing characters were turned into question marks, which do
958 not need to be quoted.
959
960 Now, a different tactic is used if there are any non-printing ASCII
961 characters. The encoding method from RFC 2047 is used, assuming iso-8859-1 as
962 the character set.
963
964 We *could* use this for all cases, getting rid of the messy original code,
965 but leave it for now. It would complicate simple cases like "John Q. Smith".
966
967 The result is passed back in the buffer; it is usually going to be added to
968 some other string. In order to be sure there is going to be no overflow,
969 restrict the length of the input to 1/4 of the buffer size - this allows for
970 every single character to be quoted or encoded without overflowing, and that
971 wouldn't happen because of amalgamation. If the phrase is too long, return a
972 fixed string.
973
974 Arguments:
975 phrase an RFC822 phrase
976 len the length of the phrase
977 buffer a buffer to put the result in
978 buffer_size the size of the buffer
979
980 Returns: the fixed RFC822 phrase
981 */
982
983 uschar *
984 parse_fix_phrase(uschar *phrase, int len, uschar *buffer, int buffer_size)
985 {
986 int ch, i;
987 BOOL quoted = FALSE;
988 uschar *s, *t, *end, *yield;
989
990 while (len > 0 && isspace(*phrase)) { phrase++; len--; }
991 if (len > buffer_size/4) return US"Name too long";
992
993 /* See if there are any non-printing characters, and if so, use the RFC 2047
994 encoding for the whole thing. */
995
996 for (i = 0, s = phrase; i < len; i++, s++)
997 if ((*s < 32 && *s != '\t') || *s > 126) break;
998
999 if (i < len) return parse_quote_2047(phrase, len, headers_charset, buffer,
1000 buffer_size, FALSE);
1001
1002 /* No non-printers; use the RFC 822 quoting rules */
1003
1004 s = phrase;
1005 end = s + len;
1006 yield = t = buffer + 1;
1007
1008 while (s < end)
1009 {
1010 ch = *s++;
1011
1012 /* Copy over quoted strings, remembering we encountered one */
1013
1014 if (ch == '\"')
1015 {
1016 *t++ = '\"';
1017 while (s < end && (ch = *s++) != '\"')
1018 {
1019 *t++ = ch;
1020 if (ch == '\\' && s < end) *t++ = *s++;
1021 }
1022 *t++ = '\"';
1023 if (s >= end) break;
1024 quoted = TRUE;
1025 }
1026
1027 /* Copy over comments, noting if they contain freestanding quote
1028 characters */
1029
1030 else if (ch == '(')
1031 {
1032 int level = 1;
1033 *t++ = '(';
1034 while (s < end)
1035 {
1036 ch = *s++;
1037 *t++ = ch;
1038 if (ch == '(') level++;
1039 else if (ch == ')') { if (--level <= 0) break; }
1040 else if (ch == '\\' && s < end) *t++ = *s++ & 127;
1041 else if (ch == '\"') quoted = TRUE;
1042 }
1043 if (ch == 0)
1044 {
1045 while (level--) *t++ = ')';
1046 break;
1047 }
1048 }
1049
1050 /* Handle special characters that need to be quoted */
1051
1052 else if (Ustrchr(")<>@,;:\\.[]", ch) != NULL)
1053 {
1054 /* If hit previous quotes just make one quoted "word" */
1055
1056 if (quoted)
1057 {
1058 uschar *tt = t++;
1059 while (*(--tt) != ' ' && *tt != '\"' && *tt != ')') tt[1] = *tt;
1060 tt[1] = '\"';
1061 *t++ = ch;
1062 while (s < end)
1063 {
1064 ch = *s++;
1065 if (ch == ' ' || ch == '\"') { s--; break; } else *t++ = ch;
1066 }
1067 *t++ = '\"';
1068 }
1069
1070 /* Else quote the whole string so far, and the rest up to any following
1071 quotes. We must treat anything following a backslash as a literal. */
1072
1073 else
1074 {
1075 BOOL escaped = (ch == '\\');
1076 *(--yield) = '\"';
1077 *t++ = ch;
1078
1079 /* Now look for the end or a quote */
1080
1081 while (s < end)
1082 {
1083 ch = *s++;
1084
1085 /* Handle escaped pairs */
1086
1087 if (escaped)
1088 {
1089 *t++ = ch;
1090 escaped = FALSE;
1091 }
1092
1093 else if (ch == '\\')
1094 {
1095 *t++ = ch;
1096 escaped = TRUE;
1097 }
1098
1099 /* If hit subsequent quotes, insert our quote before any trailing
1100 spaces and back up to re-handle the quote in the outer loop. */
1101
1102 else if (ch == '\"')
1103 {
1104 int count = 0;
1105 while (t[-1] == ' ') { t--; count++; }
1106 *t++ = '\"';
1107 while (count-- > 0) *t++ = ' ';
1108 s--;
1109 break;
1110 }
1111
1112 /* If hit a subsequent comment, check it for unescaped quotes,
1113 and if so, end our quote before it. */
1114
1115 else if (ch == '(')
1116 {
1117 uschar *ss = s; /* uschar after '(' */
1118 int level = 1;
1119 while(ss < end)
1120 {
1121 ch = *ss++;
1122 if (ch == '(') level++;
1123 else if (ch == ')') { if (--level <= 0) break; }
1124 else if (ch == '\\' && ss+1 < end) ss++;
1125 else if (ch == '\"') { quoted = TRUE; break; }
1126 }
1127
1128 /* Comment contains unescaped quotes; end our quote before
1129 the start of the comment. */
1130
1131 if (quoted)
1132 {
1133 int count = 0;
1134 while (t[-1] == ' ') { t--; count++; }
1135 *t++ = '\"';
1136 while (count-- > 0) *t++ = ' ';
1137 break;
1138 }
1139
1140 /* Comment does not contain unescaped quotes; include it in
1141 our quote. */
1142
1143 else
1144 {
1145 if (ss >= end) ss--;
1146 *t++ = '(';
1147 Ustrncpy(t, s, ss-s);
1148 t += ss-s;
1149 s = ss;
1150 }
1151 }
1152
1153 /* Not a comment or quote; include this character in our quotes. */
1154
1155 else *t++ = ch;
1156 }
1157 }
1158
1159 /* Add a final quote if we hit the end of the string. */
1160
1161 if (s >= end) *t++ = '\"';
1162 }
1163
1164 /* Non-special character; just copy it over */
1165
1166 else *t++ = ch;
1167 }
1168
1169 *t = 0;
1170 return yield;
1171 }
1172
1173
1174 /*************************************************
1175 * Extract addresses from a list *
1176 *************************************************/
1177
1178 /* This function is called by the redirect router to scan a string containing a
1179 list of addresses separated by commas (with optional white space) or by
1180 newlines, and to generate a chain of address items from them. In other words,
1181 to unpick data from an alias or .forward file.
1182
1183 The SunOS5 documentation for alias files is not very clear on the syntax; it
1184 does not say that either a comma or a newline can be used for separation.
1185 However, that is the way Smail does it, so we follow suit.
1186
1187 If a # character is encountered in a white space position, then characters from
1188 there to the next newline are skipped.
1189
1190 If an unqualified address begins with '\', just skip that character. This gives
1191 compatibility with Sendmail's use of \ to prevent looping. Exim has its own
1192 loop prevention scheme which handles other cases too - see the code in
1193 route_address().
1194
1195 An "address" can be a specification of a file or a pipe; the latter may often
1196 need to be quoted because it may contain spaces, but we don't want to retain
1197 the quotes. Quotes may appear in normal addresses too, and should be retained.
1198 We can distinguish between these cases, because in addresses, quotes are used
1199 only for parts of the address, not the whole thing. Therefore, we remove quotes
1200 from items when they entirely enclose them, but not otherwise.
1201
1202 An "address" can also be of the form :include:pathname to include a list of
1203 addresses contained in the specified file.
1204
1205 Any unqualified addresses are qualified with and rewritten if necessary, via
1206 the rewrite_address() function.
1207
1208 Arguments:
1209 s the list of addresses (typically a complete
1210 .forward file or a list of entries in an alias file)
1211 options option bits for permitting or denying various special cases;
1212 not all bits are relevant here - some are for filter
1213 files; those we use here are:
1214 RDO_DEFER
1215 RDO_FREEZE
1216 RDO_FAIL
1217 RDO_BLACKHOLE
1218 RDO_REWRITE
1219 RDO_INCLUDE
1220 anchor where to hang the chain of newly-created addresses. This
1221 should be initialized to NULL.
1222 error where to return an error text
1223 incoming_domain domain of the incoming address; used to qualify unqualified
1224 local parts preceded by \
1225 directory if NULL, no checks are done on :include: files
1226 otherwise, included file names must start with the given
1227 directory
1228 syntax_errors if not NULL, it carries on after syntax errors in addresses,
1229 building up a list of errors as error blocks chained on
1230 here.
1231
1232 Returns: FF_DELIVERED addresses extracted
1233 FF_NOTDELIVERED no addresses extracted, but no errors
1234 FF_BLACKHOLE :blackhole:
1235 FF_DEFER :defer:
1236 FF_FAIL :fail:
1237 FF_INCLUDEFAIL some problem with :include:; *error set
1238 FF_ERROR other problems; *error is set
1239 */
1240
1241 int
1242 parse_forward_list(uschar *s, int options, address_item **anchor,
1243 uschar **error, uschar *incoming_domain, uschar *directory,
1244 error_block **syntax_errors)
1245 {
1246 int count = 0;
1247
1248 DEBUG(D_route) debug_printf("parse_forward_list: %s\n", s);
1249
1250 for (;;)
1251 {
1252 int len;
1253 int special = 0;
1254 int specopt = 0;
1255 int specbit = 0;
1256 uschar *ss, *nexts;
1257 address_item *addr;
1258 BOOL inquote = FALSE;
1259
1260 for (;;)
1261 {
1262 while (isspace(*s) || *s == ',') s++;
1263 if (*s == '#') { while (*s != 0 && *s != '\n') s++; } else break;
1264 }
1265
1266 /* When we reach the end of the list, we return FF_DELIVERED if any child
1267 addresses have been generated. If nothing has been generated, there are two
1268 possibilities: either the list is really empty, or there were syntax errors
1269 that are being skipped. (If syntax errors are not being skipped, an FF_ERROR
1270 return is generated on hitting a syntax error and we don't get here.) For a
1271 truly empty list we return FF_NOTDELIVERED so that the router can decline.
1272 However, if the list is empty only because syntax errors were skipped, we
1273 return FF_DELIVERED. */
1274
1275 if (*s == 0)
1276 {
1277 return (count > 0 || (syntax_errors != NULL && *syntax_errors != NULL))?
1278 FF_DELIVERED : FF_NOTDELIVERED;
1279
1280 /* This previous code returns FF_ERROR if nothing is generated but a
1281 syntax error has been skipped. I now think it is the wrong approach, but
1282 have left this here just in case, and for the record. */
1283
1284 #ifdef NEVER
1285 if (count > 0) return FF_DELIVERED; /* Something was generated */
1286
1287 if (syntax_errors == NULL || /* Not skipping syntax errors, or */
1288 *syntax_errors == NULL) /* we didn't actually skip any */
1289 return FF_NOTDELIVERED;
1290
1291 *error = string_sprintf("no addresses generated: syntax error in %s: %s",
1292 (*syntax_errors)->text2, (*syntax_errors)->text1);
1293 return FF_ERROR;
1294 #endif
1295
1296 }
1297
1298 /* Find the end of the next address. Quoted strings in addresses may contain
1299 escaped characters; I haven't found a proper specification of .forward or
1300 alias files that mentions the quoting properties, but it seems right to do
1301 the escaping thing in all cases, so use the function that finds the end of an
1302 address. However, don't let a quoted string extend over the end of a line. */
1303
1304 ss = parse_find_address_end(s, TRUE);
1305
1306 /* Remember where we finished, for starting the next one. */
1307
1308 nexts = ss;
1309
1310 /* Remove any trailing spaces; we know there's at least one non-space. */
1311
1312 while (isspace((ss[-1]))) ss--;
1313
1314 /* We now have s->start and ss->end of the next address. Remove quotes
1315 if they completely enclose, remembering the address started with a quote
1316 for handling pipes and files. Another round of removal of leading and
1317 trailing spaces is then required. */
1318
1319 if (*s == '\"' && ss[-1] == '\"')
1320 {
1321 s++;
1322 ss--;
1323 inquote = TRUE;
1324 while (s < ss && isspace(*s)) s++;
1325 while (ss > s && isspace((ss[-1]))) ss--;
1326 }
1327
1328 /* Set up the length of the address. */
1329
1330 len = ss - s;
1331
1332 DEBUG(D_route)
1333 {
1334 int save = s[len];
1335 s[len] = 0;
1336 debug_printf("extract item: %s\n", s);
1337 s[len] = save;
1338 }
1339
1340 /* Handle special addresses if permitted. If the address is :unknown:
1341 ignore it - this is for backward compatibility with old alias files. You
1342 don't need to use it nowadays - just generate an empty string. For :defer:,
1343 :blackhole:, or :fail: we have to set up the error message and give up right
1344 away. */
1345
1346 if (Ustrncmp(s, ":unknown:", len) == 0)
1347 {
1348 s = nexts;
1349 continue;
1350 }
1351
1352 if (Ustrncmp(s, ":defer:", 7) == 0)
1353 { special = FF_DEFER; specopt = RDO_DEFER; } /* specbit is 0 */
1354 else if (Ustrncmp(s, ":blackhole:", 11) == 0)
1355 { special = FF_BLACKHOLE; specopt = specbit = RDO_BLACKHOLE; }
1356 else if (Ustrncmp(s, ":fail:", 6) == 0)
1357 { special = FF_FAIL; specopt = RDO_FAIL; } /* specbit is 0 */
1358
1359 if (special != 0)
1360 {
1361 uschar *ss = Ustrchr(s+1, ':') + 1;
1362 if ((options & specopt) == specbit)
1363 {
1364 *error = string_sprintf("\"%.*s\" is not permitted", len, s);
1365 return FF_ERROR;
1366 }
1367 while (*ss != 0 && isspace(*ss)) ss++;
1368 while (s[len] != 0 && s[len] != '\n') len++;
1369 s[len] = 0;
1370 *error = string_copy(ss);
1371 return special;
1372 }
1373
1374 /* If the address is of the form :include:pathname, read the file, and call
1375 this function recursively to extract the addresses from it. If directory is
1376 NULL, do no checks. Otherwise, insist that the file name starts with the
1377 given directory and is a regular file. */
1378
1379 if (Ustrncmp(s, ":include:", 9) == 0)
1380 {
1381 uschar *filebuf;
1382 uschar filename[256];
1383 uschar *t = s+9;
1384 int flen = len - 9;
1385 int frc;
1386 struct stat statbuf;
1387 address_item *last;
1388 FILE *f;
1389
1390 while (flen > 0 && isspace(*t)) { t++; flen--; }
1391
1392 if (flen <= 0)
1393 {
1394 *error = string_sprintf("file name missing after :include:");
1395 return FF_ERROR;
1396 }
1397
1398 if (flen > 255)
1399 {
1400 *error = string_sprintf("included file name \"%s\" is too long", t);
1401 return FF_ERROR;
1402 }
1403
1404 Ustrncpy(filename, t, flen);
1405 filename[flen] = 0;
1406
1407 /* Insist on absolute path */
1408
1409 if (filename[0]!= '/')
1410 {
1411 *error = string_sprintf("included file \"%s\" is not an absolute path",
1412 filename);
1413 return FF_ERROR;
1414 }
1415
1416 /* Check if include is permitted */
1417
1418 if ((options & RDO_INCLUDE) != 0)
1419 {
1420 *error = US"included files not permitted";
1421 return FF_ERROR;
1422 }
1423
1424 /* Check file name if required */
1425
1426 if (directory != NULL)
1427 {
1428 int len = Ustrlen(directory);
1429 uschar *p = filename + len;
1430
1431 if (Ustrncmp(filename, directory, len) != 0 || *p != '/')
1432 {
1433 *error = string_sprintf("included file %s is not in directory %s",
1434 filename, directory);
1435 return FF_ERROR;
1436 }
1437
1438 /* It is necessary to check that every component inside the directory
1439 is NOT a symbolic link, in order to keep the file inside the directory.
1440 This is mighty tedious. It is also not totally foolproof in that it
1441 leaves the possibility of a race attack, but I don't know how to do
1442 any better. */
1443
1444 while (*p != 0)
1445 {
1446 int temp;
1447 while (*(++p) != 0 && *p != '/');
1448 temp = *p;
1449 *p = 0;
1450 if (Ulstat(filename, &statbuf) != 0)
1451 {
1452 *error = string_sprintf("failed to stat %s (component of included "
1453 "file)", filename);
1454 *p = temp;
1455 return FF_ERROR;
1456 }
1457
1458 *p = temp;
1459
1460 if ((statbuf.st_mode & S_IFMT) == S_IFLNK)
1461 {
1462 *error = string_sprintf("included file %s in the %s directory "
1463 "involves a symbolic link", filename, directory);
1464 return FF_ERROR;
1465 }
1466 }
1467 }
1468
1469 /* Open and stat the file */
1470
1471 if ((f = Ufopen(filename, "rb")) == NULL)
1472 {
1473 *error = string_open_failed(errno, "included file %s", filename);
1474 return FF_INCLUDEFAIL;
1475 }
1476
1477 if (fstat(fileno(f), &statbuf) != 0)
1478 {
1479 *error = string_sprintf("failed to stat included file %s: %s",
1480 filename, strerror(errno));
1481 (void)fclose(f);
1482 return FF_INCLUDEFAIL;
1483 }
1484
1485 /* If directory was checked, double check that we opened a regular file */
1486
1487 if (directory != NULL && (statbuf.st_mode & S_IFMT) != S_IFREG)
1488 {
1489 *error = string_sprintf("included file %s is not a regular file in "
1490 "the %s directory", filename, directory);
1491 return FF_ERROR;
1492 }
1493
1494 /* Get a buffer and read the contents */
1495
1496 if (statbuf.st_size > MAX_INCLUDE_SIZE)
1497 {
1498 *error = string_sprintf("included file %s is too big (max %d)",
1499 filename, MAX_INCLUDE_SIZE);
1500 return FF_ERROR;
1501 }
1502
1503 filebuf = store_get(statbuf.st_size + 1);
1504 if (fread(filebuf, 1, statbuf.st_size, f) != statbuf.st_size)
1505 {
1506 *error = string_sprintf("error while reading included file %s: %s",
1507 filename, strerror(errno));
1508 (void)fclose(f);
1509 return FF_ERROR;
1510 }
1511 filebuf[statbuf.st_size] = 0;
1512 (void)fclose(f);
1513
1514 addr = NULL;
1515 frc = parse_forward_list(filebuf, options, &addr,
1516 error, incoming_domain, directory, syntax_errors);
1517 if (frc != FF_DELIVERED && frc != FF_NOTDELIVERED) return frc;
1518
1519 if (addr != NULL)
1520 {
1521 last = addr;
1522 while (last->next != NULL) { count++; last = last->next; }
1523 last->next = *anchor;
1524 *anchor = addr;
1525 count++;
1526 }
1527 }
1528
1529 /* Else (not :include:) ensure address is syntactically correct and fully
1530 qualified if not a pipe or a file, removing a leading \ if present on an
1531 unqualified address. For pipes and files we must handle quoting. It's
1532 not quite clear exactly what to do for partially quoted things, but the
1533 common case of having the whole thing in quotes is straightforward. If this
1534 was the case, inquote will have been set TRUE above and the quotes removed.
1535
1536 There is a possible ambiguity over addresses whose local parts start with
1537 a vertical bar or a slash, and the latter do in fact occur, thanks to X.400.
1538 Consider a .forward file that contains the line
1539
1540 /X=xxx/Y=xxx/OU=xxx/@some.gate.way
1541
1542 Is this a file or an X.400 address? Does it make any difference if it is in
1543 quotes? On the grounds that file names of this type are rare, Exim treats
1544 something that parses as an RFC 822 address and has a domain as an address
1545 rather than a file or a pipe. This is also how an address such as the above
1546 would be treated if it came in from outside. */
1547
1548 else
1549 {
1550 int start, end, domain;
1551 uschar *recipient = NULL;
1552 int save = s[len];
1553 s[len] = 0;
1554
1555 /* If it starts with \ and the rest of it parses as a valid mail address
1556 without a domain, carry on with that address, but qualify it with the
1557 incoming domain. Otherwise arrange for the address to fall through,
1558 causing an error message on the re-parse. */
1559
1560 if (*s == '\\')
1561 {
1562 recipient =
1563 parse_extract_address(s+1, error, &start, &end, &domain, FALSE);
1564 if (recipient != NULL)
1565 recipient = (domain != 0)? NULL :
1566 string_sprintf("%s@%s", recipient, incoming_domain);
1567 }
1568
1569 /* Try parsing the item as an address. */
1570
1571 if (recipient == NULL) recipient =
1572 parse_extract_address(s, error, &start, &end, &domain, FALSE);
1573
1574 /* If item starts with / or | and is not a valid address, or there
1575 is no domain, treat it as a file or pipe. If it was a quoted item,
1576 remove the quoting occurrences of \ within it. */
1577
1578 if ((*s == '|' || *s == '/') && (recipient == NULL || domain == 0))
1579 {
1580 uschar *t = store_get(Ustrlen(s) + 1);
1581 uschar *p = t;
1582 uschar *q = s;
1583 while (*q != 0)
1584 {
1585 if (inquote)
1586 {
1587 *p++ = (*q == '\\')? *(++q) : *q;
1588 q++;
1589 }
1590 else *p++ = *q++;
1591 }
1592 *p = 0;
1593 addr = deliver_make_addr(t, TRUE);
1594 setflag(addr, af_pfr); /* indicates pipe/file/reply */
1595 if (*s != '|') setflag(addr, af_file); /* indicates file */
1596 }
1597
1598 /* Item must be an address. Complain if not, else qualify, rewrite and set
1599 up the control block. It appears that people are in the habit of using
1600 empty addresses but with comments as a way of putting comments into
1601 alias and forward files. Therefore, ignore the error "empty address".
1602 Mailing lists might want to tolerate syntax errors; there is therefore
1603 an option to do so. */
1604
1605 else
1606 {
1607 if (recipient == NULL)
1608 {
1609 if (Ustrcmp(*error, "empty address") == 0)
1610 {
1611 *error = NULL;
1612 s[len] = save;
1613 s = nexts;
1614 continue;
1615 }
1616
1617 if (syntax_errors != NULL)
1618 {
1619 error_block *e = store_get(sizeof(error_block));
1620 error_block *last = *syntax_errors;
1621 if (last == NULL) *syntax_errors = e; else
1622 {
1623 while (last->next != NULL) last = last->next;
1624 last->next = e;
1625 }
1626 e->next = NULL;
1627 e->text1 = *error;
1628 e->text2 = string_copy(s);
1629 s[len] = save;
1630 s = nexts;
1631 continue;
1632 }
1633 else
1634 {
1635 *error = string_sprintf("%s in \"%s\"", *error, s);
1636 s[len] = save; /* _after_ using it for *error */
1637 return FF_ERROR;
1638 }
1639 }
1640
1641 /* Address was successfully parsed. Rewrite, and then make an address
1642 block. */
1643
1644 recipient = ((options & RDO_REWRITE) != 0)?
1645 rewrite_address(recipient, TRUE, FALSE, global_rewrite_rules,
1646 rewrite_existflags) :
1647 rewrite_address_qualify(recipient, TRUE);
1648 addr = deliver_make_addr(recipient, TRUE); /* TRUE => copy recipient */
1649 }
1650
1651 /* Restore the final character in the original data, and add to the
1652 output chain. */
1653
1654 s[len] = save;
1655 addr->next = *anchor;
1656 *anchor = addr;
1657 count++;
1658 }
1659
1660 /* Advance pointer for the next address */
1661
1662 s = nexts;
1663 }
1664 }
1665
1666
1667
1668 /*************************************************
1669 * Extract a Message-ID *
1670 *************************************************/
1671
1672 /* This function is used to extract message ids from In-Reply-To: and
1673 References: header lines.
1674
1675 Arguments:
1676 str pointer to the start of the message-id
1677 yield put pointer to the message id (in dynamic memory) here
1678 error put error message here on failure
1679
1680 Returns: points after the processed message-id or NULL on error
1681 */
1682
1683 uschar *
1684 parse_message_id(uschar *str, uschar **yield, uschar **error)
1685 {
1686 uschar *domain = NULL;
1687 uschar *id;
1688
1689 str = skip_comment(str);
1690 if (*str != '<')
1691 {
1692 *error = US"Missing '<' before message-id";
1693 return NULL;
1694 }
1695
1696 /* Getting a block the size of the input string will definitely be sufficient
1697 for the answer, but it may also be very long if we are processing a header
1698 line. Therefore, take care to release unwanted store afterwards. */
1699
1700 id = *yield = store_get(Ustrlen(str) + 1);
1701 *id++ = *str++;
1702
1703 str = read_addr_spec(str, id, '>', error, &domain);
1704
1705 if (*error == NULL)
1706 {
1707 if (*str != '>') *error = US"Missing '>' after message-id";
1708 else if (domain == NULL) *error = US"domain missing in message-id";
1709 }
1710
1711 if (*error != NULL)
1712 {
1713 store_reset(*yield);
1714 return NULL;
1715 }
1716
1717 while (*id != 0) id++;
1718 *id++ = *str++;
1719 *id++ = 0;
1720 store_reset(id);
1721
1722 str = skip_comment(str);
1723 return str;
1724 }
1725
1726
1727
1728
1729 /*************************************************
1730 **************************************************
1731 * Stand-alone test program *
1732 **************************************************
1733 *************************************************/
1734
#if defined STAND_ALONE

/* Remove the newline, if there is one, from the end of a line that has just
been read. A final line with no newline is left intact, and an empty string is
not touched. */

static void
chomp_line(uschar *line)
{
int n = Ustrlen(line);
if (n > 0 && line[n-1] == '\n') line[n-1] = 0;
}

/* Show the outcome of one parse_extract_address() call. The start and end
offsets are relative to subject, which must be the string that was parsed. */

static void
print_extract_result(uschar *out, uschar *errmess, uschar *subject, int start,
  int end, int domain)
{
if (out == NULL) printf("*** bad address: %s\n", errmess); else
  {
  uschar extract[1024];
  Ustrncpy(extract, subject+start, end-start);
  extract[end-start] = 0;
  printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract);
  }
}

/* The test program reads several groups of lines from the standard input,
one group for each function being tested. An empty line (or end of file) ends
each group. */

int main(void)
{
int start, end, domain;
uschar buffer[1024];
uschar outbuff[1024];

big_buffer = store_malloc(big_buffer_size);

/* strip_trailing_dot = TRUE; */
allow_domain_literals = TRUE;

printf("Testing parse_fix_phrase\n");

while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
  {
  chomp_line(buffer);
  if (buffer[0] == 0) break;
  printf("%s\n", CS parse_fix_phrase(buffer, Ustrlen(buffer), outbuff,
    sizeof(outbuff)));
  }

printf("Testing parse_extract_address without group syntax and without UTF-8\n");

while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
  {
  uschar *out;
  uschar *errmess = NULL;
  chomp_line(buffer);
  if (buffer[0] == 0) break;
  out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE);
  print_extract_result(out, errmess, buffer, start, end, domain);
  }

printf("Testing parse_extract_address without group syntax but with UTF-8\n");

allow_utf8_domains = TRUE;
while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
  {
  uschar *out;
  uschar *errmess = NULL;
  chomp_line(buffer);
  if (buffer[0] == 0) break;
  out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE);
  print_extract_result(out, errmess, buffer, start, end, domain);
  }
allow_utf8_domains = FALSE;

printf("Testing parse_extract_address with group syntax\n");

parse_allow_group = TRUE;
while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
  {
  uschar *out;
  uschar *errmess = NULL;
  uschar *s;
  chomp_line(buffer);
  if (buffer[0] == 0) break;
  s = buffer;
  while (*s != 0)
    {
    uschar *ss = parse_find_address_end(s, FALSE);
    int terminator = *ss;
    *ss = 0;

    /* Parse the current item, not the whole line: s is where this item
    starts, and the offsets that come back are relative to it. */

    out = parse_extract_address(s, &errmess, &start, &end, &domain, FALSE);
    *ss = terminator;

    print_extract_result(out, errmess, s, start, end, domain);

    s = ss + (terminator? 1:0);
    while (isspace(*s)) s++;
    }
  }

printf("Testing parse_find_at\n");

while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
  {
  uschar *s;
  chomp_line(buffer);
  if (buffer[0] == 0) break;
  s = parse_find_at(buffer);
  if (s == NULL) printf("no @ found\n");
    else printf("offset = %d\n", (int)(s - buffer));  /* %d needs an int */
  }

printf("Testing parse_extract_addresses\n");

while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
  {
  uschar *errmess = NULL;
  int extracted;
  address_item *anchor = NULL;
  chomp_line(buffer);
  if (buffer[0] == 0) break;
  if ((extracted = parse_forward_list(buffer, -1, &anchor,
      &errmess, US"incoming.domain", NULL, NULL)) == FF_DELIVERED)
    {
    while (anchor != NULL)
      {
      address_item *addr = anchor;
      anchor = anchor->next;
      printf("%d %s\n", testflag(addr, af_pfr), addr->address);
      }
    }

  /* Not every non-FF_DELIVERED return sets an error text */

  else printf("Failed: %d %s\n", extracted,
    (errmess != NULL)? errmess : US"");
  }

printf("Testing parse_message_id\n");

while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
  {
  uschar *s, *t, *errmess;
  chomp_line(buffer);
  if (buffer[0] == 0) break;
  s = buffer;
  while (*s != 0)
    {
    errmess = NULL;
    s = parse_message_id(s, &t, &errmess);
    if (errmess != NULL)
      {
      printf("Failed: %s\n", errmess);
      break;
      }
    printf("%s\n", t);
    }
  }

return 0;
}

#endif
1885
1886 /* End of parse.c */