Fix signed/unsigned and UTF errors Fixes: #901
[exim.git] / src / src / parse.c
CommitLineData
5ca6d115 1/* $Cambridge: exim/src/src/parse.c,v 1.14 2009/11/05 19:40:51 nm4 Exp $ */
059ec3d9
PH
2
3/*************************************************
4* Exim - an Internet mail transport agent *
5*************************************************/
6
184e8823 7/* Copyright (c) University of Cambridge 1995 - 2007 */
059ec3d9
PH
8/* See the file NOTICE for conditions of use and distribution. */
9
10/* Functions for parsing addresses */
11
12
13#include "exim.h"
14
15
16static uschar *last_comment_position;
17
18
19
20/* In stand-alone mode, provide a replacement for deliver_make_addr()
21and rewrite_address[_qualify]() so as to avoid having to drag in too much
22redundant apparatus. */
23
24#ifdef STAND_ALONE
25
26address_item *deliver_make_addr(uschar *address, BOOL copy)
27{
28address_item *addr = store_get(sizeof(address_item));
29addr->next = NULL;
30addr->parent = NULL;
31addr->address = address;
32return addr;
33}
34
35uschar *rewrite_address(uschar *recipient, BOOL dummy1, BOOL dummy2, rewrite_rule
36 *dummy3, int dummy4)
37{
38return recipient;
39}
40
41uschar *rewrite_address_qualify(uschar *recipient, BOOL dummy1)
42{
43return recipient;
44}
45
46#endif
47
48
49
50
51/*************************************************
52* Find the end of an address *
53*************************************************/
54
55/* Scan over a string looking for the termination of an address at a comma,
56or end of the string. It's the source-routed addresses which cause much pain
57here. Although Exim ignores source routes, it must recognize such addresses, so
58we cannot get rid of this logic.
59
60Argument:
61 s pointer to the start of an address
62 nl_ends if TRUE, '\n' terminates an address
63
64Returns: pointer past the end of the address
65 (i.e. points to null or comma)
66*/
67
68uschar *
69parse_find_address_end(uschar *s, BOOL nl_ends)
70{
71BOOL source_routing = *s == '@';
72int no_term = source_routing? 1 : 0;
73
74while (*s != 0 && (*s != ',' || no_term > 0) && (*s != '\n' || !nl_ends))
75 {
76 /* Skip single quoted characters. Strictly these should not occur outside
77 quoted strings in RFC 822 addresses, but they can in RFC 821 addresses. Pity
78 about the lack of consistency, isn't it? */
79
80 if (*s == '\\' && s[1] != 0) s += 2;
81
82 /* Skip quoted items that are not inside brackets. Note that
83 quoted pairs are allowed inside quoted strings. */
84
85 else if (*s == '\"')
86 {
87 while (*(++s) != 0 && (*s != '\n' || !nl_ends))
88 {
89 if (*s == '\\' && s[1] != 0) s++;
90 else if (*s == '\"') { s++; break; }
91 }
92 }
93
94 /* Skip comments, which may include nested brackets, but quotes
95 are not recognized inside comments, though quoted pairs are. */
96
97 else if (*s == '(')
98 {
99 int level = 1;
100 while (*(++s) != 0 && (*s != '\n' || !nl_ends))
101 {
102 if (*s == '\\' && s[1] != 0) s++;
103 else if (*s == '(') level++;
104 else if (*s == ')' && --level <= 0) { s++; break; }
105 }
106 }
107
108 /* Non-special character; just advance. Passing the colon in a source
109 routed address means that any subsequent comma or colon may terminate unless
110 inside angle brackets. */
111
112 else
113 {
114 if (*s == '<')
115 {
116 source_routing = s[1] == '@';
117 no_term = source_routing? 2 : 1;
118 }
119 else if (*s == '>') no_term--;
120 else if (source_routing && *s == ':') no_term--;
121 s++;
122 }
123 }
124
125return s;
126}
127
128
129
130/*************************************************
131* Find last @ in an address *
132*************************************************/
133
134/* This function is used when we have something that may not be qualified. If we
135know it's qualified, searching for the rightmost '@' is sufficient. Here we
136have to be a bit more clever than just a plain search, in order to handle
137unqualified local parts like "thing@thong" correctly. Since quotes may not
138legally be part of a domain name, we can give up on hitting the first quote
139when searching from the right. Now that the parsing also permits the RFC 821
140form of address, where quoted-pairs are allowed in unquoted local parts, we
141must take care to handle that too.
142
143Argument: pointer to an address, possibly unqualified
144Returns: pointer to the last @ in an address, or NULL if none
145*/
146
147uschar *
148parse_find_at(uschar *s)
149{
150uschar *t = s + Ustrlen(s);
151while (--t >= s)
152 {
153 if (*t == '@')
154 {
155 int backslash_count = 0;
156 uschar *tt = t - 1;
157 while (tt > s && *tt-- == '\\') backslash_count++;
158 if ((backslash_count & 1) == 0) return t;
159 }
160 else if (*t == '\"') return NULL;
161 }
162return NULL;
163}
164
165
166
167
168/***************************************************************************
169* In all the functions below that read a particular object type from *
170* the input, return the new value of the pointer s (the first argument), *
171* and put the object into the store pointed to by t (the second argument), *
172* adding a terminating zero. If no object is found, t will point to zero *
173* on return. *
174***************************************************************************/
175
176
177/*************************************************
178* Skip white space and comment *
179*************************************************/
180
181/* Algorithm:
182 (1) Skip spaces.
183 (2) If uschar not '(', return.
184 (3) Skip till matching ')', not counting any characters
185 escaped with '\'.
186 (4) Move past ')' and goto (1).
187
188The start of the last potential comment position is remembered to
189make it possible to ignore comments at the end of compound items.
190
191Argument: current character pointer
192Returns: new character pointer
193*/
194
195static uschar *
196skip_comment(uschar *s)
197{
198last_comment_position = s;
199while (*s)
200 {
201 int c, level;
202 while (isspace(*s)) s++;
203 if (*s != '(') break;
204 level = 1;
205 while((c = *(++s)) != 0)
206 {
207 if (c == '(') level++;
208 else if (c == ')') { if (--level <= 0) { s++; break; } }
209 else if (c == '\\' && s[1] != 0) s++;
210 }
211 }
212return s;
213}
214
215
216
217/*************************************************
218* Read a domain *
219*************************************************/
220
221/* A domain is a sequence of subdomains, separated by dots. See comments below
222for detailed syntax of the subdomains.
223
224If allow_domain_literals is TRUE, a "domain" may also be an IP address enclosed
225in []. Make sure the output is set to the null string if there is a syntax
226error as well as if there is no domain at all.
227
228Arguments:
229 s current character pointer
230 t where to put the domain
231 errorptr put error message here on failure (*t will be 0 on exit)
232
233Returns: new character pointer
234*/
235
236static uschar *
237read_domain(uschar *s, uschar *t, uschar **errorptr)
238{
239uschar *tt = t;
240s = skip_comment(s);
241
242/* Handle domain literals if permitted. An RFC 822 domain literal may contain
243any character except [ ] \, including linear white space, and may contain
244quoted characters. However, RFC 821 restricts literals to being dot-separated
2453-digit numbers, and we make the obvious extension for IPv6. Go for a sequence
309bd837
PH
246of digits, dots, hex digits, and colons here; later this will be checked for
247being a syntactically valid IP address if it ever gets to a router.
059ec3d9 248
309bd837
PH
249Allow both the formal IPv6 form, with IPV6: at the start, and the informal form
250without it, and accept IPV4: as well, 'cause someone will use it sooner or
251later. */
059ec3d9
PH
252
253if (*s == '[')
254 {
255 *t++ = *s++;
256
059ec3d9
PH
257 if (strncmpic(s, US"IPv6:", 5) == 0 || strncmpic(s, US"IPv4:", 5) == 0)
258 {
259 memcpy(t, s, 5);
260 t += 5;
261 s += 5;
262 }
263 while (*s == '.' || *s == ':' || isxdigit(*s)) *t++ = *s++;
264
059ec3d9
PH
265 if (*s == ']') *t++ = *s++; else
266 {
267 *errorptr = US"malformed domain literal";
268 *tt = 0;
269 }
270
271 if (!allow_domain_literals)
272 {
273 *errorptr = US"domain literals not allowed";
274 *tt = 0;
275 }
276 *t = 0;
277 return skip_comment(s);
278 }
279
280/* Handle a proper domain, which is a sequence of dot-separated atoms. Remove
281trailing dots if strip_trailing_dot is set. A subdomain is an atom.
282
283An atom is a sequence of any characters except specials, space, and controls.
284The specials are ( ) < > @ , ; : \ " . [ and ]. This is the rule for RFC 822
285and its successor (RFC 2822). However, RFC 821 and its successor (RFC 2821) is
286tighter, allowing only letters, digits, and hyphens, not starting with a
287hyphen.
288
289There used to be a global flag that got set when checking addresses that came
290in over SMTP and which should therefore should be checked according to the
291stricter rule. However, it seems silly to make the distinction, because I don't
292suppose anybody ever uses local domains that are 822-compliant and not
293821-compliant. Furthermore, Exim now has additional data on the spool file line
294after an address (after "one_time" processing), and it makes use of a #
295character to delimit it. When I wrote that code, I forgot about this 822-domain
296stuff, and assumed # could never appear in a domain.
297
298So the old code is now cut out for Release 4.11 onwards, on 09-Aug-02. In a few
299years, when we are sure this isn't actually causing trouble, throw it away.
300
301March 2003: the story continues: There is a camp that is arguing for the use of
302UTF-8 in domain names as the way to internationalization, and other MTAs
303support this. Therefore, we now have a flag that permits the use of characters
304with values greater than 127, encoded in UTF-8, in subdomains, so that Exim can
305be used experimentally in this way. */
306
307for (;;)
308 {
309 uschar *tsave = t;
310
311/*********************
312 if (rfc821_domains)
313 {
314 if (*s != '-') while (isalnum(*s) || *s == '-') *t++ = *s++;
315 }
316 else
317 while (!mac_iscntrl_or_special(*s)) *t++ = *s++;
318*********************/
319
320 if (*s != '-')
321 {
322 /* Only letters, digits, and hyphens */
323
324 if (!allow_utf8_domains)
325 {
326 while (isalnum(*s) || *s == '-') *t++ = *s++;
327 }
328
329 /* Permit legal UTF-8 characters to be included */
330
331 else for(;;)
332 {
333 int i, d;
334 if (isalnum(*s) || *s == '-') /* legal ascii characters */
335 {
336 *t++ = *s++;
337 continue;
338 }
339 if ((*s & 0xc0) != 0xc0) break; /* not start of UTF-8 character */
340 d = *s << 2;
341 for (i = 1; i < 6; i++) /* i is the number of additional bytes */
342 {
343 if ((d & 0x80) == 0) break;
344 d <<= 1;
345 }
346 if (i == 6) goto BAD_UTF8; /* invalid UTF-8 */
347 *t++ = *s++; /* leading UTF-8 byte */
348 while (i-- > 0) /* copy and check remainder */
349 {
350 if ((*s & 0xc0) != 0x80)
351 {
352 BAD_UTF8:
353 *errorptr = US"invalid UTF-8 byte sequence";
354 *tt = 0;
355 return s;
356 }
357 *t++ = *s++;
358 }
359 } /* End of loop for UTF-8 character */
360 } /* End of subdomain */
361
362 s = skip_comment(s);
363 *t = 0;
364
365 if (t == tsave) /* empty component */
366 {
367 if (strip_trailing_dot && t > tt && *s != '.') t[-1] = 0; else
368 {
369 *errorptr = US"domain missing or malformed";
370 *tt = 0;
371 }
372 return s;
373 }
374
375 if (*s != '.') break;
376 *t++ = *s++;
377 s = skip_comment(s);
378 }
379
380return s;
381}
382
383
384
385/*************************************************
386* Read a local-part *
387*************************************************/
388
389/* A local-part is a sequence of words, separated by periods. A null word
390between dots is not strictly allowed but apparently many mailers permit it,
391so, sigh, better be compatible. Even accept a trailing dot...
392
393A <word> is either a quoted string, or an <atom>, which is a sequence
394of any characters except specials, space, and controls. The specials are
395( ) < > @ , ; : \ " . [ and ]. In RFC 822, a single quoted character, (a
396quoted-pair) is not allowed in a word. However, in RFC 821, it is permitted in
397the local part of an address. Rather than have separate parsing functions for
398the different cases, take the liberal attitude always. At least one MUA is
399happy to recognize this case; I don't know how many other programs do.
400
401Arguments:
402 s current character pointer
403 t where to put the local part
404 error where to point error text
405 allow_null TRUE if an empty local part is not an error
406
407Returns: new character pointer
408*/
409
410static uschar *
411read_local_part(uschar *s, uschar *t, uschar **error, BOOL allow_null)
412{
413uschar *tt = t;
414*error = NULL;
415for (;;)
416 {
417 int c;
418 uschar *tsave = t;
419 s = skip_comment(s);
420
421 /* Handle a quoted string */
422
423 if (*s == '\"')
424 {
425 *t++ = '\"';
426 while ((c = *(++s)) != 0 && c != '\"')
427 {
428 *t++ = c;
429 if (c == '\\' && s[1] != 0) *t++ = *(++s);
430 }
431 if (c == '\"')
432 {
433 s++;
434 *t++ = '\"';
435 }
436 else
437 {
438 *error = US"unmatched doublequote in local part";
439 return s;
440 }
441 }
442
443 /* Handle an atom, but allow quoted pairs within it. */
444
445 else while (!mac_iscntrl_or_special(*s) || *s == '\\')
446 {
447 c = *t++ = *s++;
448 if (c == '\\' && *s != 0) *t++ = *s++;
449 }
450
451 /* Terminate the word and skip subsequent comment */
452
453 *t = 0;
454 s = skip_comment(s);
455
456 /* If we have read a null component at this point, give an error unless it is
457 terminated by a dot - an extension to RFC 822 - or if it is the first
458 component of the local part and an empty local part is permitted, in which
459 case just return normally. */
460
461 if (t == tsave && *s != '.')
462 {
463 if (t == tt && !allow_null)
464 *error = US"missing or malformed local part";
465 return s;
466 }
467
468 /* Anything other than a dot terminates the local part. Treat multiple dots
469 as a single dot, as this seems to be a common extension. */
470
471 if (*s != '.') break;
472 do { *t++ = *s++; } while (*s == '.');
473 }
474
475return s;
476}
477
478
479/*************************************************
480* Read route part of route-addr *
481*************************************************/
482
483/* The pointer is at the initial "@" on entry. Return it following the
484terminating colon. Exim no longer supports the use of source routes, but it is
485required to accept the syntax.
486
487Arguments:
488 s current character pointer
489 t where to put the route
490 errorptr where to put an error message
491
492Returns: new character pointer
493*/
494
495static uschar *
496read_route(uschar *s, uschar *t, uschar **errorptr)
497{
498BOOL commas = FALSE;
499*errorptr = NULL;
500
501while (*s == '@')
502 {
503 *t++ = '@';
504 s = read_domain(s+1, t, errorptr);
505 if (*t == 0) return s;
506 t += Ustrlen((const uschar *)t);
507 if (*s != ',') break;
508 *t++ = *s++;
509 commas = TRUE;
510 s = skip_comment(s);
511 }
512
513if (*s == ':') *t++ = *s++;
514
515/* If there is no colon, and there were no commas, the most likely error
516is in fact a missing local part in the address rather than a missing colon
517after the route. */
518
519else *errorptr = commas?
520 US"colon expected after route list" :
521 US"no local part";
522
523/* Terminate the route and return */
524
525*t = 0;
526return skip_comment(s);
527}
528
529
530
531/*************************************************
532* Read addr-spec *
533*************************************************/
534
535/* Addr-spec is local-part@domain. We make the domain optional -
536the expected terminator for the whole thing is passed to check this.
537This function is called only when we know we have a route-addr.
538
539Arguments:
540 s current character pointer
541 t where to put the addr-spec
542 term expected terminator (0 or >)
543 errorptr where to put an error message
544 domainptr set to point to the start of the domain
545
546Returns: new character pointer
547*/
548
549static uschar *
550read_addr_spec(uschar *s, uschar *t, int term, uschar **errorptr,
551 uschar **domainptr)
552{
553s = read_local_part(s, t, errorptr, FALSE);
554if (*errorptr == NULL)
555 {
556 if (*s != term)
557 {
558 if (*s != '@')
559 *errorptr = string_sprintf("\"@\" or \".\" expected after \"%s\"", t);
560 else
561 {
562 t += Ustrlen((const uschar *)t);
563 *t++ = *s++;
564 *domainptr = t;
565 s = read_domain(s, t, errorptr);
566 }
567 }
568 }
569return s;
570}
571
572
573
574/*************************************************
575* Extract operative address *
576*************************************************/
577
578/* This function extracts an operative address from a full RFC822 mailbox and
579returns it in a piece of dynamic store. We take the easy way and get a piece
580of store the same size as the input, and then copy into it whatever is
581necessary. If we cannot find a valid address (syntax error), return NULL, and
582point the error pointer to the reason. The arguments "start" and "end" are used
583to return the offsets of the first and one past the last characters in the
584original mailbox of the address that has been extracted, to aid in re-writing.
585The argument "domain" is set to point to the first character after "@" in the
586final part of the returned address, or zero if there is no @.
587
588Exim no longer supports the use of source routed addresses (those of the form
589@domain,...:route_addr). It recognizes the syntax, but collapses such addresses
590down to their final components. Formerly, collapse_source_routes had to be set
591to achieve this effect. RFC 1123 allows collapsing with MAY, while the revision
592of RFC 821 had increased this to SHOULD, so I've gone for it, because it makes
593a lot of code elsewhere in Exim much simpler.
594
595There are some special fudges here for handling RFC 822 group address notation
596which may appear in certain headers. If the flag parse_allow_group is set
597TRUE and parse_found_group is FALSE when this function is called, an address
598which is the start of a group (i.e. preceded by a phrase and a colon) is
599recognized; the phrase is ignored and the flag parse_found_group is set. If
1eccaa59
PH
600this flag is TRUE at the end of an address, and if an extraneous semicolon is
601found, it is ignored and the flag is cleared.
602
603This logic is used only when scanning through addresses in headers, either to
604fulfil the -t option, or for rewriting, or for checking header syntax. Because
605the group "state" has to be remembered between multiple calls of this function,
606the variables parse_{allow,found}_group are global. It is important to ensure
607that they are reset to FALSE at the end of scanning a header's list of
608addresses.
059ec3d9
PH
609
610Arguments:
611 mailbox points to the RFC822 mailbox
612 errorptr where to point an error message
613 start set to start offset in mailbox
614 end set to end offset in mailbox
615 domain set to domain offset in result, or 0 if no domain present
616 allow_null allow <> if TRUE
617
618Returns: points to the extracted address, or NULL on error
619*/
620
621#define FAILED(s) { *errorptr = s; goto PARSE_FAILED; }
622
623uschar *
624parse_extract_address(uschar *mailbox, uschar **errorptr, int *start, int *end,
625 int *domain, BOOL allow_null)
626{
627uschar *yield = store_get(Ustrlen(mailbox) + 1);
628uschar *startptr, *endptr;
629uschar *s = (uschar *)mailbox;
630uschar *t = (uschar *)yield;
631
632*domain = 0;
633
634/* At the start of the string we expect either an addr-spec or a phrase
635preceding a <route-addr>. If groups are allowed, we might also find a phrase
636preceding a colon and an address. If we find an initial word followed by
637a dot, strict interpretation of the RFC would cause it to be taken
638as the start of an addr-spec. However, many mailers break the rules
639and use addresses of the form "a.n.other <ano@somewhere>" and so we
640allow this case. */
641
642RESTART: /* Come back here after passing a group name */
643
644s = skip_comment(s);
645startptr = s; /* In case addr-spec */
646s = read_local_part(s, t, errorptr, TRUE); /* Dot separated words */
647if (*errorptr != NULL) goto PARSE_FAILED;
648
649/* If the terminator is neither < nor @ then the format of the address
650must either be a bare local-part (we are now at the end), or a phrase
651followed by a route-addr (more words must follow). */
652
653if (*s != '@' && *s != '<')
654 {
655 if (*s == 0 || *s == ';')
656 {
657 if (*t == 0) FAILED(US"empty address");
658 endptr = last_comment_position;
659 goto PARSE_SUCCEEDED; /* Bare local part */
660 }
661
662 /* Expect phrase route-addr, or phrase : if groups permitted, but allow
663 dots in the phrase; complete the loop only when '<' or ':' is encountered -
664 end of string will produce a null local_part and therefore fail. We don't
665 need to keep updating t, as the phrase isn't to be kept. */
666
667 while (*s != '<' && (!parse_allow_group || *s != ':'))
668 {
669 s = read_local_part(s, t, errorptr, FALSE);
670 if (*errorptr != NULL)
671 {
672 *errorptr = string_sprintf("%s (expected word or \"<\")", *errorptr);
673 goto PARSE_FAILED;
674 }
675 }
676
677 if (*s == ':')
678 {
679 parse_found_group = TRUE;
680 parse_allow_group = FALSE;
681 s++;
682 goto RESTART;
683 }
684
685 /* Assert *s == '<' */
686 }
687
688/* At this point the next character is either '@' or '<'. If it is '@', only a
689single local-part has previously been read. An angle bracket signifies the
690start of an <addr-spec>. Throw away anything we have saved so far before
691processing it. Note that this is "if" rather than "else if" because it's also
692used after reading a preceding phrase.
693
694There are a lot of broken sendmails out there that put additional pairs of <>
695round <route-addr>s. If strip_excess_angle_brackets is set, allow any number of
696them, as long as they match. */
697
698if (*s == '<')
699 {
700 uschar *domainptr = yield;
701 BOOL source_routed = FALSE;
702 int bracket_count = 1;
703
704 s++;
705 if (strip_excess_angle_brackets)
706 while (*s == '<') { bracket_count++; s++; }
707
708 t = yield;
709 startptr = s;
710 s = skip_comment(s);
711
712 /* Read an optional series of routes, each of which is a domain. They
713 are separated by commas and terminated by a colon. However, we totally ignore
714 such routes (RFC 1123 says we MAY, and the revision of RFC 821 says we
715 SHOULD). */
716
717 if (*s == '@')
718 {
719 s = read_route(s, t, errorptr);
720 if (*errorptr != NULL) goto PARSE_FAILED;
721 *t = 0; /* Ensure route is ignored - probably overkill */
722 source_routed = TRUE;
723 }
724
725 /* Now an addr-spec, terminated by '>'. If there is no preceding route,
726 we must allow an empty addr-spec if allow_null is TRUE, to permit the
727 address "<>" in some circumstances. A source-routed address MUST have
728 a domain in the final part. */
729
730 if (allow_null && !source_routed && *s == '>')
731 {
732 *t = 0;
733 *errorptr = NULL;
734 }
735 else
736 {
737 s = read_addr_spec(s, t, '>', errorptr, &domainptr);
738 if (*errorptr != NULL) goto PARSE_FAILED;
739 *domain = domainptr - yield;
740 if (source_routed && *domain == 0)
741 FAILED(US"domain missing in source-routed address");
742 }
743
744 endptr = s;
745 if (*errorptr != NULL) goto PARSE_FAILED;
746 while (bracket_count-- > 0) if (*s++ != '>')
747 {
748 *errorptr = (s[-1] == 0)? US"'>' missing at end of address" :
749 string_sprintf("malformed address: %.32s may not follow %.*s",
750 s-1, s - (uschar *)mailbox - 1, mailbox);
751 goto PARSE_FAILED;
752 }
753
754 s = skip_comment(s);
755 }
756
757/* Hitting '@' after the first local-part means we have definitely got an
758addr-spec, on a strict reading of the RFC, and the rest of the string
759should be the domain. However, for flexibility we allow for a route-address
760not enclosed in <> as well, which is indicated by an empty first local
761part preceding '@'. The source routing is, however, ignored. */
762
763else if (*t == 0)
764 {
765 uschar *domainptr = yield;
766 s = read_route(s, t, errorptr);
767 if (*errorptr != NULL) goto PARSE_FAILED;
768 *t = 0; /* Ensure route is ignored - probably overkill */
769 s = read_addr_spec(s, t, 0, errorptr, &domainptr);
770 if (*errorptr != NULL) goto PARSE_FAILED;
771 *domain = domainptr - yield;
772 endptr = last_comment_position;
773 if (*domain == 0) FAILED(US"domain missing in source-routed address");
774 }
775
776/* This is the strict case of local-part@domain. */
777
778else
779 {
780 t += Ustrlen((const uschar *)t);
781 *t++ = *s++;
782 *domain = t - yield;
783 s = read_domain(s, t, errorptr);
784 if (*t == 0) goto PARSE_FAILED;
785 endptr = last_comment_position;
786 }
787
788/* Use goto to get here from the bare local part case. Arrive by falling
789through for other cases. Endptr may have been moved over whitespace, so
790move it back past white space if necessary. */
791
792PARSE_SUCCEEDED:
793if (*s != 0)
794 {
795 if (parse_found_group && *s == ';')
796 {
797 parse_found_group = FALSE;
798 parse_allow_group = TRUE;
799 }
800 else
801 {
802 *errorptr = string_sprintf("malformed address: %.32s may not follow %.*s",
803 s, s - (uschar *)mailbox, mailbox);
804 goto PARSE_FAILED;
805 }
806 }
807*start = startptr - (uschar *)mailbox; /* Return offsets */
808while (isspace(endptr[-1])) endptr--;
809*end = endptr - (uschar *)mailbox;
810
811/* Although this code has no limitation on the length of address extracted,
812other parts of Exim may have limits, and in any case, RFC 2821 limits local
813parts to 64 and domains to 255, so we do a check here, giving an error if the
814address is ridiculously long. */
815
816if (*end - *start > ADDRESS_MAXLENGTH)
817 {
818 *errorptr = string_sprintf("address is ridiculously long: %.64s...", yield);
819 return NULL;
820 }
821
822return (uschar *)yield;
823
824/* Use goto (via the macro FAILED) to get to here from a variety of places.
825We might have an empty address in a group - the caller can choose to ignore
826this. We must, however, keep the flags correct. */
827
828PARSE_FAILED:
829if (parse_found_group && *s == ';')
830 {
831 parse_found_group = FALSE;
832 parse_allow_group = TRUE;
833 }
834return NULL;
835}
836
837#undef FAILED
838
839
840
841/*************************************************
842* Quote according to RFC 2047 *
843*************************************************/
844
845/* This function is used for quoting text in headers according to RFC 2047.
846If the only characters that strictly need quoting are spaces, we return the
847original string, unmodified. If a quoted string is too long for the buffer, it
848is truncated. (This shouldn't happen: this is normally handling short strings.)
849
8e669ac1
PH
850Hmmph. As always, things get perverted for other uses. This function was
851originally for the "phrase" part of addresses. Now it is being used for much
852longer texts in ACLs and via the ${rfc2047: expansion item. This means we have
14702f5b
PH
853to check for overlong "encoded-word"s and split them. November 2004.
854
059ec3d9
PH
855Arguments:
856 string the string to quote - already checked to contain non-printing
857 chars
858 len the length of the string
859 charset the name of the character set; NULL => iso-8859-1
860 buffer the buffer to put the answer in
861 buffer_size the size of the buffer
46218253
PH
862 fold if TRUE, a newline is inserted before the separating space when
863 more than one encoded-word is generated
059ec3d9
PH
864
865Returns: pointer to the original string, if no quoting needed, or
866 pointer to buffer containing the quoted string, or
867 a pointer to "String too long" if the buffer can't even hold
868 the introduction
869*/
870
871uschar *
872parse_quote_2047(uschar *string, int len, uschar *charset, uschar *buffer,
46218253 873 int buffer_size, BOOL fold)
059ec3d9
PH
874{
875uschar *s = string;
14702f5b
PH
876uschar *p, *t;
877int hlen;
059ec3d9
PH
878BOOL coded = FALSE;
879
880if (charset == NULL) charset = US"iso-8859-1";
881
882/* We don't expect this to fail! */
883
884if (!string_format(buffer, buffer_size, "=?%s?Q?", charset))
885 return US"String too long";
886
14702f5b
PH
887hlen = Ustrlen(buffer);
888t = buffer + hlen;
889p = buffer;
890
059ec3d9
PH
891for (; len > 0; len--)
892 {
893 int ch = *s++;
14702f5b 894 if (t > buffer + buffer_size - hlen - 8) break;
8e669ac1 895
14702f5b
PH
896 if (t - p > 70)
897 {
898 *t++ = '?';
899 *t++ = '=';
46218253 900 if (fold) *t++ = '\n';
14702f5b
PH
901 *t++ = ' ';
902 p = t;
903 Ustrncpy(p, buffer, hlen);
904 t += hlen;
8e669ac1
PH
905 }
906
059ec3d9
PH
907 if (ch < 33 || ch > 126 ||
908 Ustrchr("?=()<>@,;:\\\".[]_", ch) != NULL)
909 {
910 if (ch == ' ') *t++ = '_'; else
911 {
912 sprintf(CS t, "=%02X", ch);
913 while (*t != 0) t++;
914 coded = TRUE;
915 }
916 }
917 else *t++ = ch;
918 }
8e669ac1 919
14702f5b 920*t++ = '?';
8e669ac1 921*t++ = '=';
14702f5b 922*t = 0;
8e669ac1 923
059ec3d9
PH
924return coded? buffer : string;
925}
926
927
928
929
930/*************************************************
931* Fix up an RFC 822 "phrase" *
932*************************************************/
933
934/* This function is called to repair any syntactic defects in the "phrase" part
935of an RFC822 address. In particular, it is applied to the user's name as read
936from the passwd file when accepting a local message, and to the data from the
937-F option.
938
939If the string contains existing quoted strings or comments containing
940freestanding quotes, then we just quote those bits that need quoting -
941otherwise it would get awfully messy and probably not look good. If not, we
942quote the whole thing if necessary. Thus
943
944 John Q. Smith => "John Q. Smith"
945 John "Jack" Smith => John "Jack" Smith
946 John "Jack" Q. Smith => John "Jack" "Q." Smith
947 John (Jack) Q. Smith => "John (Jack) Q. Smith"
948 John ("Jack") Q. Smith => John ("Jack") "Q." Smith
949but
950 John (\"Jack\") Q. Smith => "John (\"Jack\") Q. Smith"
951
952Sheesh! This is tedious code. It is a great pity that the syntax of RFC822 is
953the way it is...
954
955August 2000: Additional code added:
956
957 Previously, non-printing characters were turned into question marks, which do
958 not need to be quoted.
959
960 Now, a different tactic is used if there are any non-printing ASCII
961 characters. The encoding method from RFC 2047 is used, assuming iso-8859-1 as
962 the character set.
963
964 We *could* use this for all cases, getting rid of the messy original code,
965 but leave it for now. It would complicate simple cases like "John Q. Smith".
966
967The result is passed back in the buffer; it is usually going to be added to
968some other string. In order to be sure there is going to be no overflow,
969restrict the length of the input to 1/4 of the buffer size - this allows for
970every single character to be quoted or encoded without overflowing, and that
971wouldn't happen because of amalgamation. If the phrase is too long, return a
972fixed string.
973
974Arguments:
975 phrase an RFC822 phrase
976 len the length of the phrase
977 buffer a buffer to put the result in
978 buffer_size the size of the buffer
979
980Returns: the fixed RFC822 phrase
981*/
982
983uschar *
984parse_fix_phrase(uschar *phrase, int len, uschar *buffer, int buffer_size)
985{
986int ch, i;
987BOOL quoted = FALSE;
988uschar *s, *t, *end, *yield;
989
990while (len > 0 && isspace(*phrase)) { phrase++; len--; }
991if (len > buffer_size/4) return US"Name too long";
992
993/* See if there are any non-printing characters, and if so, use the RFC 2047
994encoding for the whole thing. */
995
996for (i = 0, s = phrase; i < len; i++, s++)
997 if ((*s < 32 && *s != '\t') || *s > 126) break;
998
999if (i < len) return parse_quote_2047(phrase, len, headers_charset, buffer,
46218253 1000 buffer_size, FALSE);
059ec3d9
PH
1001
1002/* No non-printers; use the RFC 822 quoting rules */
1003
1004s = phrase;
1005end = s + len;
1006yield = t = buffer + 1;
1007
1008while (s < end)
1009 {
1010 ch = *s++;
1011
1012 /* Copy over quoted strings, remembering we encountered one */
1013
1014 if (ch == '\"')
1015 {
1016 *t++ = '\"';
1017 while (s < end && (ch = *s++) != '\"')
1018 {
1019 *t++ = ch;
1020 if (ch == '\\' && s < end) *t++ = *s++;
1021 }
1022 *t++ = '\"';
1023 if (s >= end) break;
1024 quoted = TRUE;
1025 }
1026
1027 /* Copy over comments, noting if they contain freestanding quote
1028 characters */
1029
1030 else if (ch == '(')
1031 {
1032 int level = 1;
1033 *t++ = '(';
1034 while (s < end)
1035 {
1036 ch = *s++;
1037 *t++ = ch;
1038 if (ch == '(') level++;
1039 else if (ch == ')') { if (--level <= 0) break; }
1040 else if (ch == '\\' && s < end) *t++ = *s++ & 127;
1041 else if (ch == '\"') quoted = TRUE;
1042 }
1043 if (ch == 0)
1044 {
1045 while (level--) *t++ = ')';
1046 break;
1047 }
1048 }
1049
1050 /* Handle special characters that need to be quoted */
1051
1052 else if (Ustrchr(")<>@,;:\\.[]", ch) != NULL)
1053 {
1054 /* If hit previous quotes just make one quoted "word" */
1055
1056 if (quoted)
1057 {
1058 uschar *tt = t++;
1059 while (*(--tt) != ' ' && *tt != '\"' && *tt != ')') tt[1] = *tt;
1060 tt[1] = '\"';
1061 *t++ = ch;
1062 while (s < end)
1063 {
1064 ch = *s++;
1065 if (ch == ' ' || ch == '\"') { s--; break; } else *t++ = ch;
1066 }
1067 *t++ = '\"';
1068 }
1069
1070 /* Else quote the whole string so far, and the rest up to any following
1071 quotes. We must treat anything following a backslash as a literal. */
1072
1073 else
1074 {
1075 BOOL escaped = (ch == '\\');
1076 *(--yield) = '\"';
1077 *t++ = ch;
1078
1079 /* Now look for the end or a quote */
1080
1081 while (s < end)
1082 {
1083 ch = *s++;
1084
1085 /* Handle escaped pairs */
1086
1087 if (escaped)
1088 {
1089 *t++ = ch;
1090 escaped = FALSE;
1091 }
1092
1093 else if (ch == '\\')
1094 {
1095 *t++ = ch;
1096 escaped = TRUE;
1097 }
1098
1099 /* If hit subsequent quotes, insert our quote before any trailing
1100 spaces and back up to re-handle the quote in the outer loop. */
1101
1102 else if (ch == '\"')
1103 {
1104 int count = 0;
1105 while (t[-1] == ' ') { t--; count++; }
1106 *t++ = '\"';
1107 while (count-- > 0) *t++ = ' ';
1108 s--;
1109 break;
1110 }
1111
1112 /* If hit a subsequent comment, check it for unescaped quotes,
1113 and if so, end our quote before it. */
1114
1115 else if (ch == '(')
1116 {
1117 uschar *ss = s; /* uschar after '(' */
1118 int level = 1;
1119 while(ss < end)
1120 {
1121 ch = *ss++;
1122 if (ch == '(') level++;
1123 else if (ch == ')') { if (--level <= 0) break; }
1124 else if (ch == '\\' && ss+1 < end) ss++;
1125 else if (ch == '\"') { quoted = TRUE; break; }
1126 }
1127
1128 /* Comment contains unescaped quotes; end our quote before
1129 the start of the comment. */
1130
1131 if (quoted)
1132 {
1133 int count = 0;
1134 while (t[-1] == ' ') { t--; count++; }
1135 *t++ = '\"';
1136 while (count-- > 0) *t++ = ' ';
1137 break;
1138 }
1139
1140 /* Comment does not contain unescaped quotes; include it in
1141 our quote. */
1142
1143 else
1144 {
1145 if (ss >= end) ss--;
1146 *t++ = '(';
1147 Ustrncpy(t, s, ss-s);
1148 t += ss-s;
1149 s = ss;
1150 }
1151 }
1152
1153 /* Not a comment or quote; include this character in our quotes. */
1154
1155 else *t++ = ch;
1156 }
1157 }
1158
1159 /* Add a final quote if we hit the end of the string. */
1160
1161 if (s >= end) *t++ = '\"';
1162 }
1163
1164 /* Non-special character; just copy it over */
1165
1166 else *t++ = ch;
1167 }
1168
1169*t = 0;
1170return yield;
1171}
1172
1173
1174/*************************************************
1175* Extract addresses from a list *
1176*************************************************/
1177
1178/* This function is called by the redirect router to scan a string containing a
1179list of addresses separated by commas (with optional white space) or by
1180newlines, and to generate a chain of address items from them. In other words,
1181to unpick data from an alias or .forward file.
1182
1183The SunOS5 documentation for alias files is not very clear on the syntax; it
1184does not say that either a comma or a newline can be used for separation.
1185However, that is the way Smail does it, so we follow suit.
1186
1187If a # character is encountered in a white space position, then characters from
1188there to the next newline are skipped.
1189
1190If an unqualified address begins with '\', just skip that character. This gives
1191compatibility with Sendmail's use of \ to prevent looping. Exim has its own
1192loop prevention scheme which handles other cases too - see the code in
1193route_address().
1194
1195An "address" can be a specification of a file or a pipe; the latter may often
1196need to be quoted because it may contain spaces, but we don't want to retain
1197the quotes. Quotes may appear in normal addresses too, and should be retained.
1198We can distinguish between these cases, because in addresses, quotes are used
1199only for parts of the address, not the whole thing. Therefore, we remove quotes
1200from items when they entirely enclose them, but not otherwise.
1201
1202An "address" can also be of the form :include:pathname to include a list of
1203addresses contained in the specified file.
1204
1205Any unqualified addresses are qualified with and rewritten if necessary, via
1206the rewrite_address() function.
1207
1208Arguments:
1209 s the list of addresses (typically a complete
1210 .forward file or a list of entries in an alias file)
1211 options option bits for permitting or denying various special cases;
1212 not all bits are relevant here - some are for filter
1213 files; those we use here are:
1214 RDO_DEFER
1215 RDO_FREEZE
1216 RDO_FAIL
1217 RDO_BLACKHOLE
1218 RDO_REWRITE
1219 RDO_INCLUDE
1220 anchor where to hang the chain of newly-created addresses. This
1221 should be initialized to NULL.
1222 error where to return an error text
1223 incoming domain domain of the incoming address; used to qualify unqualified
1224 local parts preceded by \
1225 directory if NULL, no checks are done on :include: files
1226 otherwise, included file names must start with the given
1227 directory
1228 syntax_errors if not NULL, it carries on after syntax errors in addresses,
1229 building up a list of errors as error blocks chained on
1230 here.
1231
1232Returns: FF_DELIVERED addresses extracted
1233 FF_NOTDELIVERED no addresses extracted, but no errors
1234 FF_BLACKHOLE :blackhole:
1235 FF_DEFER :defer:
1236 FF_FAIL :fail:
1237 FF_INCLUDEFAIL some problem with :include:; *error set
1238 FF_ERROR other problems; *error is set
1239*/
1240
1241int
1242parse_forward_list(uschar *s, int options, address_item **anchor,
1243 uschar **error, uschar *incoming_domain, uschar *directory,
1244 error_block **syntax_errors)
1245{
1246int count = 0;
1247
1248DEBUG(D_route) debug_printf("parse_forward_list: %s\n", s);
1249
1250for (;;)
1251 {
1252 int len;
1253 int special = 0;
1254 int specopt = 0;
1255 int specbit = 0;
1256 uschar *ss, *nexts;
1257 address_item *addr;
1258 BOOL inquote = FALSE;
1259
1260 for (;;)
1261 {
1262 while (isspace(*s) || *s == ',') s++;
1263 if (*s == '#') { while (*s != 0 && *s != '\n') s++; } else break;
1264 }
1265
1266 /* When we reach the end of the list, we return FF_DELIVERED if any child
1267 addresses have been generated. If nothing has been generated, there are two
1268 possibilities: either the list is really empty, or there were syntax errors
1269 that are being skipped. (If syntax errors are not being skipped, an FF_ERROR
1270 return is generated on hitting a syntax error and we don't get here.) For a
1271 truly empty list we return FF_NOTDELIVERED so that the router can decline.
1272 However, if the list is empty only because syntax errors were skipped, we
1273 return FF_DELIVERED. */
1274
1275 if (*s == 0)
1276 {
1277 return (count > 0 || (syntax_errors != NULL && *syntax_errors != NULL))?
1278 FF_DELIVERED : FF_NOTDELIVERED;
1279
1280 /* This previous code returns FF_ERROR if nothing is generated but a
1281 syntax error has been skipped. I now think it is the wrong approach, but
1282 have left this here just in case, and for the record. */
1283
1284 #ifdef NEVER
1285 if (count > 0) return FF_DELIVERED; /* Something was generated */
1286
1287 if (syntax_errors == NULL || /* Not skipping syntax errors, or */
1288 *syntax_errors == NULL) /* we didn't actually skip any */
1289 return FF_NOTDELIVERED;
1290
1291 *error = string_sprintf("no addresses generated: syntax error in %s: %s",
1292 (*syntax_errors)->text2, (*syntax_errors)->text1);
1293 return FF_ERROR;
1294 #endif
1295
1296 }
1297
1298 /* Find the end of the next address. Quoted strings in addresses may contain
1299 escaped characters; I haven't found a proper specification of .forward or
1300 alias files that mentions the quoting properties, but it seems right to do
1301 the escaping thing in all cases, so use the function that finds the end of an
1302 address. However, don't let a quoted string extend over the end of a line. */
1303
1304 ss = parse_find_address_end(s, TRUE);
1305
1306 /* Remember where we finished, for starting the next one. */
1307
1308 nexts = ss;
1309
1310 /* Remove any trailing spaces; we know there's at least one non-space. */
1311
1312 while (isspace((ss[-1]))) ss--;
1313
1314 /* We now have s->start and ss->end of the next address. Remove quotes
1315 if they completely enclose, remembering the address started with a quote
1316 for handling pipes and files. Another round of removal of leading and
1317 trailing spaces is then required. */
1318
1319 if (*s == '\"' && ss[-1] == '\"')
1320 {
1321 s++;
1322 ss--;
1323 inquote = TRUE;
1324 while (s < ss && isspace(*s)) s++;
1325 while (ss > s && isspace((ss[-1]))) ss--;
1326 }
1327
1328 /* Set up the length of the address. */
1329
1330 len = ss - s;
1331
1332 DEBUG(D_route)
1333 {
1334 int save = s[len];
1335 s[len] = 0;
1336 debug_printf("extract item: %s\n", s);
1337 s[len] = save;
1338 }
1339
1340 /* Handle special addresses if permitted. If the address is :unknown:
1341 ignore it - this is for backward compatibility with old alias files. You
1342 don't need to use it nowadays - just generate an empty string. For :defer:,
1343 :blackhole:, or :fail: we have to set up the error message and give up right
1344 away. */
1345
1346 if (Ustrncmp(s, ":unknown:", len) == 0)
1347 {
1348 s = nexts;
1349 continue;
1350 }
1351
1352 if (Ustrncmp(s, ":defer:", 7) == 0)
1353 { special = FF_DEFER; specopt = RDO_DEFER; } /* specbit is 0 */
1354 else if (Ustrncmp(s, ":blackhole:", 11) == 0)
1355 { special = FF_BLACKHOLE; specopt = specbit = RDO_BLACKHOLE; }
1356 else if (Ustrncmp(s, ":fail:", 6) == 0)
1357 { special = FF_FAIL; specopt = RDO_FAIL; } /* specbit is 0 */
1358
1359 if (special != 0)
1360 {
1361 uschar *ss = Ustrchr(s+1, ':') + 1;
1362 if ((options & specopt) == specbit)
1363 {
1364 *error = string_sprintf("\"%.*s\" is not permitted", len, s);
1365 return FF_ERROR;
1366 }
1367 while (*ss != 0 && isspace(*ss)) ss++;
1368 while (s[len] != 0 && s[len] != '\n') len++;
1369 s[len] = 0;
1370 *error = string_copy(ss);
1371 return special;
1372 }
1373
1374 /* If the address is of the form :include:pathname, read the file, and call
1375 this function recursively to extract the addresses from it. If directory is
1376 NULL, do no checks. Otherwise, insist that the file name starts with the
1377 given directory and is a regular file. */
1378
1379 if (Ustrncmp(s, ":include:", 9) == 0)
1380 {
1381 uschar *filebuf;
1382 uschar filename[256];
1383 uschar *t = s+9;
1384 int flen = len - 9;
1385 int frc;
1386 struct stat statbuf;
1387 address_item *last;
1388 FILE *f;
1389
1390 while (flen > 0 && isspace(*t)) { t++; flen--; }
1391
1392 if (flen <= 0)
1393 {
1394 *error = string_sprintf("file name missing after :include:");
1395 return FF_ERROR;
1396 }
1397
1398 if (flen > 255)
1399 {
1400 *error = string_sprintf("included file name \"%s\" is too long", t);
1401 return FF_ERROR;
1402 }
1403
1404 Ustrncpy(filename, t, flen);
1405 filename[flen] = 0;
1406
1407 /* Insist on absolute path */
1408
1409 if (filename[0]!= '/')
1410 {
1411 *error = string_sprintf("included file \"%s\" is not an absolute path",
1412 filename);
1413 return FF_ERROR;
1414 }
1415
1416 /* Check if include is permitted */
1417
1418 if ((options & RDO_INCLUDE) != 0)
1419 {
1420 *error = US"included files not permitted";
1421 return FF_ERROR;
1422 }
1423
1424 /* Check file name if required */
1425
1426 if (directory != NULL)
1427 {
1428 int len = Ustrlen(directory);
1429 uschar *p = filename + len;
1430
1431 if (Ustrncmp(filename, directory, len) != 0 || *p != '/')
1432 {
1433 *error = string_sprintf("included file %s is not in directory %s",
1434 filename, directory);
1435 return FF_ERROR;
1436 }
1437
1438 /* It is necessary to check that every component inside the directory
1439 is NOT a symbolic link, in order to keep the file inside the directory.
1440 This is mighty tedious. It is also not totally foolproof in that it
1441 leaves the possibility of a race attack, but I don't know how to do
1442 any better. */
1443
1444 while (*p != 0)
1445 {
1446 int temp;
1447 while (*(++p) != 0 && *p != '/');
1448 temp = *p;
1449 *p = 0;
1450 if (Ulstat(filename, &statbuf) != 0)
1451 {
1452 *error = string_sprintf("failed to stat %s (component of included "
1453 "file)", filename);
1454 *p = temp;
1455 return FF_ERROR;
1456 }
1457
1458 *p = temp;
1459
1460 if ((statbuf.st_mode & S_IFMT) == S_IFLNK)
1461 {
1462 *error = string_sprintf("included file %s in the %s directory "
1463 "involves a symbolic link", filename, directory);
1464 return FF_ERROR;
1465 }
1466 }
1467 }
1468
1469 /* Open and stat the file */
1470
1471 if ((f = Ufopen(filename, "rb")) == NULL)
1472 {
1473 *error = string_open_failed(errno, "included file %s", filename);
1474 return FF_INCLUDEFAIL;
1475 }
1476
1477 if (fstat(fileno(f), &statbuf) != 0)
1478 {
1479 *error = string_sprintf("failed to stat included file %s: %s",
1480 filename, strerror(errno));
f1e894f3 1481 (void)fclose(f);
059ec3d9
PH
1482 return FF_INCLUDEFAIL;
1483 }
1484
1485 /* If directory was checked, double check that we opened a regular file */
1486
1487 if (directory != NULL && (statbuf.st_mode & S_IFMT) != S_IFREG)
1488 {
1489 *error = string_sprintf("included file %s is not a regular file in "
1490 "the %s directory", filename, directory);
1491 return FF_ERROR;
1492 }
1493
1494 /* Get a buffer and read the contents */
1495
1496 if (statbuf.st_size > MAX_INCLUDE_SIZE)
1497 {
1498 *error = string_sprintf("included file %s is too big (max %d)",
1499 filename, MAX_INCLUDE_SIZE);
1500 return FF_ERROR;
1501 }
1502
1503 filebuf = store_get(statbuf.st_size + 1);
1504 if (fread(filebuf, 1, statbuf.st_size, f) != statbuf.st_size)
1505 {
1506 *error = string_sprintf("error while reading included file %s: %s",
1507 filename, strerror(errno));
f1e894f3 1508 (void)fclose(f);
059ec3d9
PH
1509 return FF_ERROR;
1510 }
1511 filebuf[statbuf.st_size] = 0;
f1e894f3 1512 (void)fclose(f);
059ec3d9
PH
1513
1514 addr = NULL;
1515 frc = parse_forward_list(filebuf, options, &addr,
1516 error, incoming_domain, directory, syntax_errors);
1517 if (frc != FF_DELIVERED && frc != FF_NOTDELIVERED) return frc;
1518
1519 if (addr != NULL)
1520 {
1521 last = addr;
1522 while (last->next != NULL) { count++; last = last->next; }
1523 last->next = *anchor;
1524 *anchor = addr;
1525 count++;
1526 }
1527 }
1528
1529 /* Else (not :include:) ensure address is syntactically correct and fully
1530 qualified if not a pipe or a file, removing a leading \ if present on an
1531 unqualified address. For pipes and files we must handle quoting. It's
1532 not quite clear exactly what to do for partially quoted things, but the
1533 common case of having the whole thing in quotes is straightforward. If this
1534 was the case, inquote will have been set TRUE above and the quotes removed.
1535
1536 There is a possible ambiguity over addresses whose local parts start with
1537 a vertical bar or a slash, and the latter do in fact occur, thanks to X.400.
1538 Consider a .forward file that contains the line
1539
1540 /X=xxx/Y=xxx/OU=xxx/@some.gate.way
1541
1542 Is this a file or an X.400 address? Does it make any difference if it is in
1543 quotes? On the grounds that file names of this type are rare, Exim treats
1544 something that parses as an RFC 822 address and has a domain as an address
1545 rather than a file or a pipe. This is also how an address such as the above
1546 would be treated if it came in from outside. */
1547
1548 else
1549 {
1550 int start, end, domain;
1551 uschar *recipient = NULL;
1552 int save = s[len];
1553 s[len] = 0;
1554
1555 /* If it starts with \ and the rest of it parses as a valid mail address
1556 without a domain, carry on with that address, but qualify it with the
1557 incoming domain. Otherwise arrange for the address to fall through,
1558 causing an error message on the re-parse. */
1559
1560 if (*s == '\\')
1561 {
1562 recipient =
1563 parse_extract_address(s+1, error, &start, &end, &domain, FALSE);
1564 if (recipient != NULL)
1565 recipient = (domain != 0)? NULL :
1566 string_sprintf("%s@%s", recipient, incoming_domain);
1567 }
1568
1569 /* Try parsing the item as an address. */
1570
1571 if (recipient == NULL) recipient =
1572 parse_extract_address(s, error, &start, &end, &domain, FALSE);
1573
1574 /* If item starts with / or | and is not a valid address, or there
1575 is no domain, treat it as a file or pipe. If it was a quoted item,
1576 remove the quoting occurrences of \ within it. */
1577
1578 if ((*s == '|' || *s == '/') && (recipient == NULL || domain == 0))
1579 {
1580 uschar *t = store_get(Ustrlen(s) + 1);
1581 uschar *p = t;
1582 uschar *q = s;
1583 while (*q != 0)
1584 {
1585 if (inquote)
1586 {
1587 *p++ = (*q == '\\')? *(++q) : *q;
1588 q++;
1589 }
1590 else *p++ = *q++;
1591 }
1592 *p = 0;
1593 addr = deliver_make_addr(t, TRUE);
1594 setflag(addr, af_pfr); /* indicates pipe/file/reply */
1595 if (*s != '|') setflag(addr, af_file); /* indicates file */
1596 }
1597
1598 /* Item must be an address. Complain if not, else qualify, rewrite and set
1599 up the control block. It appears that people are in the habit of using
1600 empty addresses but with comments as a way of putting comments into
1601 alias and forward files. Therefore, ignore the error "empty address".
1602 Mailing lists might want to tolerate syntax errors; there is therefore
1603 an option to do so. */
1604
1605 else
1606 {
1607 if (recipient == NULL)
1608 {
1609 if (Ustrcmp(*error, "empty address") == 0)
1610 {
1611 *error = NULL;
1612 s[len] = save;
1613 s = nexts;
1614 continue;
1615 }
1616
1617 if (syntax_errors != NULL)
1618 {
1619 error_block *e = store_get(sizeof(error_block));
1620 error_block *last = *syntax_errors;
1621 if (last == NULL) *syntax_errors = e; else
1622 {
1623 while (last->next != NULL) last = last->next;
1624 last->next = e;
1625 }
1626 e->next = NULL;
1627 e->text1 = *error;
1628 e->text2 = string_copy(s);
1629 s[len] = save;
1630 s = nexts;
1631 continue;
1632 }
1633 else
1634 {
1635 *error = string_sprintf("%s in \"%s\"", *error, s);
1636 s[len] = save; /* _after_ using it for *error */
1637 return FF_ERROR;
1638 }
1639 }
1640
1641 /* Address was successfully parsed. Rewrite, and then make an address
1642 block. */
1643
1644 recipient = ((options & RDO_REWRITE) != 0)?
1645 rewrite_address(recipient, TRUE, FALSE, global_rewrite_rules,
1646 rewrite_existflags) :
1647 rewrite_address_qualify(recipient, TRUE);
1648 addr = deliver_make_addr(recipient, TRUE); /* TRUE => copy recipient */
1649 }
1650
1651 /* Restore the final character in the original data, and add to the
1652 output chain. */
1653
1654 s[len] = save;
1655 addr->next = *anchor;
1656 *anchor = addr;
1657 count++;
1658 }
1659
1660 /* Advance pointer for the next address */
1661
1662 s = nexts;
1663 }
1664}
1665
1666
30dba1e6
PH
1667/*************************************************
1668* Extract a Message-ID *
1669*************************************************/
1670
1671/* This function is used to extract message ids from In-Reply-To: and
1672References: header lines.
1673
1674Arguments:
1675 str pointer to the start of the message-id
1676 yield put pointer to the message id (in dynamic memory) here
1677 error put error message here on failure
1678
1679Returns: points after the processed message-id or NULL on error
1680*/
1681
1682uschar *
1683parse_message_id(uschar *str, uschar **yield, uschar **error)
1684{
1685uschar *domain = NULL;
1686uschar *id;
1687
1688str = skip_comment(str);
1689if (*str != '<')
1690 {
1691 *error = US"Missing '<' before message-id";
1692 return NULL;
1693 }
1694
1695/* Getting a block the size of the input string will definitely be sufficient
1696for the answer, but it may also be very long if we are processing a header
1697line. Therefore, take care to release unwanted store afterwards. */
1698
1699id = *yield = store_get(Ustrlen(str) + 1);
1700*id++ = *str++;
1701
1702str = read_addr_spec(str, id, '>', error, &domain);
1703
1704if (*error == NULL)
1705 {
1706 if (*str != '>') *error = US"Missing '>' after message-id";
1707 else if (domain == NULL) *error = US"domain missing in message-id";
1708 }
1709
1710if (*error != NULL)
1711 {
1712 store_reset(*yield);
1713 return NULL;
1714 }
1715
1716while (*id != 0) id++;
1717*id++ = *str++;
1718*id++ = 0;
1719store_reset(id);
1720
1721str = skip_comment(str);
1722return str;
63ac05ee
MH
1723}
1724
1725
1726/*************************************************
1727* Parse a fixed digit number *
1728*************************************************/
1729
1730/* Parse a string containing an ASCII encoded fixed digits number
1731
1732Arguments:
1733 str pointer to the start of the ASCII encoded number
1734 n pointer to the resulting value
1735 digits number of required digits
1736
1737Returns: points after the processed date or NULL on error
1738*/
1739
1740static uschar *
1741parse_number(uschar *str, int *n, int digits)
1742{
1743 *n=0;
1744 while (digits--)
1745 {
1746 if (*str<'0' || *str>'9') return NULL;
1747 *n=10*(*n)+(*str++-'0');
1748 }
1749 return str;
1750}
1751
1752
1753/*************************************************
1754* Parse a RFC 2822 day of week *
1755*************************************************/
1756
1757/* Parse the day of the week from a RFC 2822 date, but do not
1758 decode it, because it is only for humans.
1759
1760Arguments:
1761 str pointer to the start of the day of the week
1762
1763Returns: points after the parsed day or NULL on error
1764*/
1765
1766static uschar *
1767parse_day_of_week(uschar *str)
1768{
1769/*
1770day-of-week = ([FWS] day-name) / obs-day-of-week
1771
1772day-name = "Mon" / "Tue" / "Wed" / "Thu" /
1773 "Fri" / "Sat" / "Sun"
1774
1775obs-day-of-week = [CFWS] day-name [CFWS]
1776*/
1777
5ca6d115 1778static const uschar *day_name[7]={ US"mon", US"tue", US"wed", US"thu", US"fri", US"sat", US"sun" };
63ac05ee
MH
1779int i;
1780uschar day[4];
1781
1782str=skip_comment(str);
1783for (i=0; i<3; ++i)
1784 {
1785 if ((day[i]=tolower(*str))=='\0') return NULL;
1786 ++str;
1787 }
1788day[3]='\0';
5ca6d115 1789for (i=0; i<7; ++i) if (Ustrcmp(day,day_name[i])==0) break;
63ac05ee
MH
1790if (i==7) return NULL;
1791str=skip_comment(str);
1792return str;
1793}
1794
1795
1796/*************************************************
1797* Parse a RFC 2822 date *
1798*************************************************/
1799
1800/* Parse the date part of a RFC 2822 date-time, extracting the
1801 day, month and year.
1802
1803Arguments:
1804 str pointer to the start of the date
1805 d pointer to the resulting day
1806 m pointer to the resulting month
1807 y pointer to the resulting year
1808
1809Returns: points after the processed date or NULL on error
1810*/
1811
1812static uschar *
1813parse_date(uschar *str, int *d, int *m, int *y)
1814{
1815/*
1816date = day month year
1817
1818year = 4*DIGIT / obs-year
1819
1820obs-year = [CFWS] 2*DIGIT [CFWS]
1821
1822month = (FWS month-name FWS) / obs-month
1823
1824month-name = "Jan" / "Feb" / "Mar" / "Apr" /
1825 "May" / "Jun" / "Jul" / "Aug" /
1826 "Sep" / "Oct" / "Nov" / "Dec"
1827
1828obs-month = CFWS month-name CFWS
1829
1830day = ([FWS] 1*2DIGIT) / obs-day
1831
1832obs-day = [CFWS] 1*2DIGIT [CFWS]
1833*/
1834
1835uschar *c,*n;
5ca6d115 1836static const uschar *month_name[]={ US"jan", US"feb", US"mar", US"apr", US"may", US"jun", US"jul", US"aug", US"sep", US"oct", US"nov", US"dec" };
63ac05ee
MH
1837int i;
1838uschar month[4];
1839
1840str=skip_comment(str);
1841if ((str=parse_number(str,d,1))==NULL) return NULL;
1842if (*str>='0' && *str<='9') *d=10*(*d)+(*str++-'0');
1843c=skip_comment(str);
1844if (c==str) return NULL;
1845else str=c;
1846for (i=0; i<3; ++i) if ((month[i]=tolower(*(str+i)))=='\0') return NULL;
1847month[3]='\0';
1848for (i=0; i<12; ++i) if (Ustrcmp(month,month_name[i])==0) break;
1849if (i==12) return NULL;
1850str+=3;
1851*m=i;
1852c=skip_comment(str);
1853if (c==str) return NULL;
1854else str=c;
1855if ((n=parse_number(str,y,4)))
1856 {
1857 str=n;
1858 if (*y<1900) return NULL;
1859 *y=*y-1900;
1860 }
1861else if ((n=parse_number(str,y,2)))
1862 {
1863 str=skip_comment(n);
1864 while (*(str-1)==' ' || *(str-1)=='\t') --str; /* match last FWS later */
1865 if (*y<50) *y+=100;
1866 }
1867else return NULL;
1868return str;
1869}
1870
1871
1872/*************************************************
1873* Parse a RFC 2822 Time *
1874*************************************************/
1875
1876/* Parse the time part of a RFC 2822 date-time, extracting the
1877 hour, minute, second and timezone.
1878
1879Arguments:
1880 str pointer to the start of the time
1881 h pointer to the resulting hour
1882 m pointer to the resulting minute
1883 s pointer to the resulting second
1884 z pointer to the resulting timezone (offset in seconds)
1885
1886Returns: points after the processed time or NULL on error
1887*/
1888
1889static uschar *
1890parse_time(uschar *str, int *h, int *m, int *s, int *z)
1891{
1892/*
1893time = time-of-day FWS zone
1894
1895time-of-day = hour ":" minute [ ":" second ]
1896
1897hour = 2DIGIT / obs-hour
1898
1899obs-hour = [CFWS] 2DIGIT [CFWS]
1900
1901minute = 2DIGIT / obs-minute
1902
1903obs-minute = [CFWS] 2DIGIT [CFWS]
1904
1905second = 2DIGIT / obs-second
1906
1907obs-second = [CFWS] 2DIGIT [CFWS]
1908
1909zone = (( "+" / "-" ) 4DIGIT) / obs-zone
1910
1911obs-zone = "UT" / "GMT" / ; Universal Time
1912 ; North American UT
1913 ; offsets
1914 "EST" / "EDT" / ; Eastern: - 5/ - 4
1915 "CST" / "CDT" / ; Central: - 6/ - 5
1916 "MST" / "MDT" / ; Mountain: - 7/ - 6
1917 "PST" / "PDT" / ; Pacific: - 8/ - 7
1918
1919 %d65-73 / ; Military zones - "A"
1920 %d75-90 / ; through "I" and "K"
1921 %d97-105 / ; through "Z", both
1922 %d107-122 ; upper and lower case
1923*/
1924
1925uschar *c;
1926
1927str=skip_comment(str);
1928if ((str=parse_number(str,h,2))==NULL) return NULL;
1929str=skip_comment(str);
1930if (*str!=':') return NULL;
1931++str;
1932str=skip_comment(str);
1933if ((str=parse_number(str,m,2))==NULL) return NULL;
1934c=skip_comment(str);
1935if (*str==':')
1936 {
1937 ++str;
1938 str=skip_comment(str);
1939 if ((str=parse_number(str,s,2))==NULL) return NULL;
1940 c=skip_comment(str);
1941 }
1942if (c==str) return NULL;
1943else str=c;
1944if (*str=='+' || *str=='-')
1945 {
1946 int neg;
1947
1948 neg=(*str=='-');
1949 ++str;
1950 if ((str=parse_number(str,z,4))==NULL) return NULL;
1951 *z=(*z/100)*3600+(*z%100)*60;
1952 if (neg) *z=-*z;
1953 }
1954else
1955 {
1956 char zone[5];
1957 struct { const char *name; int off; } zone_name[10]=
1958 { {"gmt",0}, {"ut",0}, {"est",-5}, {"edt",-4}, {"cst",-6}, {"cdt",-5}, {"mst",-7}, {"mdt",-6}, {"pst",-8}, {"pdt",-7}};
1959 int i,j;
1960
1961 for (i=0; i<4; ++i)
1962 {
1963 zone[i]=tolower(*(str+i));
1964 if (zone[i]<'a' || zone[i]>'z') break;
1965 }
1966 zone[i]='\0';
1967 for (j=0; j<10 && strcmp(zone,zone_name[j].name); ++j);
1968 /* Besides zones named in the grammar, RFC 2822 says other alphabetic */
1969 /* time zones should be treated as unknown offsets. */
1970 if (j<10)
1971 {
1972 *z=zone_name[j].off*3600;
1973 str+=i;
1974 }
1975 else if (zone[0]<'a' || zone[1]>'z') return 0;
1976 else
1977 {
1978 while ((*str>='a' && *str<='z') || (*str>='A' && *str<='Z')) ++str;
1979 *z=0;
1980 }
1981 }
1982return str;
1983}
1984
1985
1986/*************************************************
1987* Parse a RFC 2822 date-time *
1988*************************************************/
1989
1990/* Parse a RFC 2822 date-time and return it in seconds since the epoch.
1991
1992Arguments:
1993 str pointer to the start of the date-time
1994 t pointer to the parsed time
1995
1996Returns: points after the processed date-time or NULL on error
1997*/
1998
1999uschar *
2000parse_date_time(uschar *str, time_t *t)
2001{
2002/*
2003date-time = [ day-of-week "," ] date FWS time [CFWS]
2004*/
2005
2006struct tm tm;
2007int zone;
2008extern char **environ;
2009char **old_environ;
2010static char gmt0[]="TZ=GMT0";
2011static char *gmt_env[]={ gmt0, (char*)0 };
ed2a4866 2012uschar *try;
63ac05ee 2013
ed2a4866 2014if ((try=parse_day_of_week(str)))
63ac05ee 2015 {
ed2a4866 2016 str=try;
63ac05ee
MH
2017 if (*str!=',') return 0;
2018 ++str;
2019 }
2020if ((str=parse_date(str,&tm.tm_mday,&tm.tm_mon,&tm.tm_year))==NULL) return NULL;
2021if (*str!=' ' && *str!='\t') return NULL;
2022while (*str==' ' || *str=='\t') ++str;
2023if ((str=parse_time(str,&tm.tm_hour,&tm.tm_min,&tm.tm_sec,&zone))==NULL) return NULL;
2024tm.tm_isdst=0;
2025old_environ=environ;
2026environ=gmt_env;
2027*t=mktime(&tm);
2028environ=old_environ;
2029if (*t==-1) return NULL;
2030*t-=zone;
2031str=skip_comment(str);
2032return str;
30dba1e6
PH
2033}
2034
2035
2036
2037
059ec3d9
PH
2038/*************************************************
2039**************************************************
2040* Stand-alone test program *
2041**************************************************
2042*************************************************/
2043
2044#if defined STAND_ALONE
2045int main(void)
2046{
2047int start, end, domain;
2048uschar buffer[1024];
2049uschar outbuff[1024];
2050
2051big_buffer = store_malloc(big_buffer_size);
2052
2053/* strip_trailing_dot = TRUE; */
2054allow_domain_literals = TRUE;
2055
2056printf("Testing parse_fix_phrase\n");
2057
2058while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
2059 {
2060 buffer[Ustrlen(buffer)-1] = 0;
2061 if (buffer[0] == 0) break;
2062 printf("%s\n", CS parse_fix_phrase(buffer, Ustrlen(buffer), outbuff,
2063 sizeof(outbuff)));
2064 }
2065
2066printf("Testing parse_extract_address without group syntax and without UTF-8\n");
2067
2068while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
2069 {
2070 uschar *out;
2071 uschar *errmess;
2072 buffer[Ustrlen(buffer) - 1] = 0;
2073 if (buffer[0] == 0) break;
2074 out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE);
2075 if (out == NULL) printf("*** bad address: %s\n", errmess); else
2076 {
2077 uschar extract[1024];
2078 Ustrncpy(extract, buffer+start, end-start);
2079 extract[end-start] = 0;
2080 printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract);
2081 }
2082 }
2083
2084printf("Testing parse_extract_address without group syntax but with UTF-8\n");
2085
2086allow_utf8_domains = TRUE;
2087while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
2088 {
2089 uschar *out;
2090 uschar *errmess;
2091 buffer[Ustrlen(buffer) - 1] = 0;
2092 if (buffer[0] == 0) break;
2093 out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE);
2094 if (out == NULL) printf("*** bad address: %s\n", errmess); else
2095 {
2096 uschar extract[1024];
2097 Ustrncpy(extract, buffer+start, end-start);
2098 extract[end-start] = 0;
2099 printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract);
2100 }
2101 }
2102allow_utf8_domains = FALSE;
2103
2104printf("Testing parse_extract_address with group syntax\n");
2105
2106parse_allow_group = TRUE;
2107while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
2108 {
2109 uschar *out;
2110 uschar *errmess;
2111 uschar *s;
2112 buffer[Ustrlen(buffer) - 1] = 0;
2113 if (buffer[0] == 0) break;
2114 s = buffer;
2115 while (*s != 0)
2116 {
2117 uschar *ss = parse_find_address_end(s, FALSE);
2118 int terminator = *ss;
2119 *ss = 0;
2120 out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE);
2121 *ss = terminator;
2122
2123 if (out == NULL) printf("*** bad address: %s\n", errmess); else
2124 {
2125 uschar extract[1024];
2126 Ustrncpy(extract, buffer+start, end-start);
2127 extract[end-start] = 0;
2128 printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract);
2129 }
2130
2131 s = ss + (terminator? 1:0);
2132 while (isspace(*s)) s++;
2133 }
2134 }
2135
2136printf("Testing parse_find_at\n");
2137
2138while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
2139 {
2140 uschar *s;
2141 buffer[Ustrlen(buffer)-1] = 0;
2142 if (buffer[0] == 0) break;
2143 s = parse_find_at(buffer);
2144 if (s == NULL) printf("no @ found\n");
2145 else printf("offset = %d\n", s - buffer);
2146 }
2147
2148printf("Testing parse_extract_addresses\n");
2149
2150while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
2151 {
2152 uschar *errmess;
2153 int extracted;
2154 address_item *anchor = NULL;
2155 buffer[Ustrlen(buffer) - 1] = 0;
2156 if (buffer[0] == 0) break;
2157 if ((extracted = parse_forward_list(buffer, -1, &anchor,
2158 &errmess, US"incoming.domain", NULL, NULL)) == FF_DELIVERED)
2159 {
2160 while (anchor != NULL)
2161 {
2162 address_item *addr = anchor;
2163 anchor = anchor->next;
2164 printf("%d %s\n", testflag(addr, af_pfr), addr->address);
2165 }
2166 }
2167 else printf("Failed: %d %s\n", extracted, errmess);
2168 }
2169
30dba1e6
PH
2170printf("Testing parse_message_id\n");
2171
2172while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
2173 {
2174 uschar *s, *t, *errmess;
2175 buffer[Ustrlen(buffer) - 1] = 0;
2176 if (buffer[0] == 0) break;
2177 s = buffer;
2178 while (*s != 0)
2179 {
2180 s = parse_message_id(s, &t, &errmess);
2181 if (errmess != NULL)
2182 {
2183 printf("Failed: %s\n", errmess);
2184 break;
2185 }
2186 printf("%s\n", t);
2187 }
2188 }
2189
059ec3d9
PH
2190return 0;
2191}
2192
2193#endif
2194
2195/* End of parse.c */