src/src/parse.c

   1 /* $Cambridge: exim/src/src/parse.c,v 1.11 2007/01/08 10:50:18 ph10 Exp $ */
   2
   3 /*************************************************
   4 *     Exim - an Internet mail transport agent    *
   5 *************************************************/
   6
   7 /* Copyright (c) University of Cambridge 1995 - 2007 */
   8 /* See the file NOTICE for conditions of use and distribution. */
   9
  10 /* Functions for parsing addresses */
  11
  12
  13 #include "exim.h"
  14
  15
  16 static uschar *last_comment_position;
  17
  18
  19
  20 /* In stand-alone mode, provide a replacement for deliver_make_addr()
  21 and rewrite_address[_qualify]() so as to avoid having to drag in too much
  22 redundant apparatus. */
  23
  24 #ifdef STAND_ALONE
  25
  26 address_item *deliver_make_addr(uschar *address, BOOL copy)
  27 {
  28 address_item *addr = store_get(sizeof(address_item));
  29 addr->next = NULL;
  30 addr->parent = NULL;
  31 addr->address = address;
  32 return addr;
  33 }
  34
  35 uschar *rewrite_address(uschar *recipient, BOOL dummy1, BOOL dummy2, rewrite_rule
  36   *dummy3, int dummy4)
  37 {
  38 return recipient;
  39 }
  40
  41 uschar *rewrite_address_qualify(uschar *recipient, BOOL dummy1)
  42 {
  43 return recipient;
  44 }
  45
  46 #endif
  47
  48
  49
  50
  51 /*************************************************
  52 *             Find the end of an address         *
  53 *************************************************/
  54
  55 /* Scan over a string looking for the termination of an address at a comma,
  56 or end of the string. It's the source-routed addresses which cause much pain
  57 here. Although Exim ignores source routes, it must recognize such addresses, so
  58 we cannot get rid of this logic.
  59
  60 Argument:
  61   s        pointer to the start of an address
  62   nl_ends  if TRUE, '\n' terminates an address
  63
  64 Returns:   pointer past the end of the address
  65            (i.e. points to null or comma)
  66 */
  67
  68 uschar *
  69 parse_find_address_end(uschar *s, BOOL nl_ends)
  70 {
  71 BOOL source_routing = *s == '@';
  72 int no_term = source_routing? 1 : 0;
  73
  74 while (*s != 0 && (*s != ',' || no_term > 0) && (*s != '\n' || !nl_ends))
  75   {
  76   /* Skip single quoted characters. Strictly these should not occur outside
  77   quoted strings in RFC 822 addresses, but they can in RFC 821 addresses. Pity
  78   about the lack of consistency, isn't it? */
  79
  80   if (*s == '\\' && s[1] != 0) s += 2;
  81
  82   /* Skip quoted items that are not inside brackets. Note that
  83   quoted pairs are allowed inside quoted strings. */
  84
  85   else if (*s == '\"')
  86     {
  87     while (*(++s) != 0 && (*s != '\n' || !nl_ends))
  88       {
  89       if (*s == '\\' && s[1] != 0) s++;
  90         else if (*s == '\"') { s++; break; }
  91       }
  92     }
  93
  94   /* Skip comments, which may include nested brackets, but quotes
  95   are not recognized inside comments, though quoted pairs are. */
  96
  97   else if (*s == '(')
  98     {
  99     int level = 1;
 100     while (*(++s) != 0 && (*s != '\n' || !nl_ends))
 101       {
 102       if (*s == '\\' && s[1] != 0) s++;
 103         else if (*s == '(') level++;
 104           else if (*s == ')' && --level <= 0) { s++; break; }
 105       }
 106     }
 107
 108   /* Non-special character; just advance. Passing the colon in a source
 109   routed address means that any subsequent comma or colon may terminate unless
 110   inside angle brackets. */
 111
 112   else
 113     {
 114     if (*s == '<')
 115       {
 116       source_routing = s[1] == '@';
 117       no_term = source_routing? 2 : 1;
 118       }
 119     else if (*s == '>') no_term--;
 120     else if (source_routing && *s == ':') no_term--;
 121     s++;
 122     }
 123   }
 124
 125 return s;
 126 }
 127
 128
 129
 130 /*************************************************
 131 *            Find last @ in an address           *
 132 *************************************************/
 133
  134 /* This function is used when we have something that may not be qualified. If we
 135 know it's qualified, searching for the rightmost '@' is sufficient. Here we
 136 have to be a bit more clever than just a plain search, in order to handle
 137 unqualified local parts like "thing@thong" correctly. Since quotes may not
 138 legally be part of a domain name, we can give up on hitting the first quote
 139 when searching from the right. Now that the parsing also permits the RFC 821
 140 form of address, where quoted-pairs are allowed in unquoted local parts, we
 141 must take care to handle that too.
 142
 143 Argument:  pointer to an address, possibly unqualified
 144 Returns:   pointer to the last @ in an address, or NULL if none
 145 */
 146
 147 uschar *
 148 parse_find_at(uschar *s)
 149 {
 150 uschar *t = s + Ustrlen(s);
 151 while (--t >= s)
 152   {
 153   if (*t == '@')
 154     {
 155     int backslash_count = 0;
 156     uschar *tt = t - 1;
 157     while (tt > s && *tt-- == '\\') backslash_count++;
 158     if ((backslash_count & 1) == 0) return t;
 159     }
 160   else if (*t == '\"') return NULL;
 161   }
 162 return NULL;
 163 }
 164
 165
 166
 167
 168 /***************************************************************************
 169 * In all the functions below that read a particular object type from       *
 170 * the input, return the new value of the pointer s (the first argument),   *
 171 * and put the object into the store pointed to by t (the second argument), *
 172 * adding a terminating zero. If no object is found, t will point to zero   *
 173 * on return.                                                               *
 174 ***************************************************************************/
 175
 176
 177 /*************************************************
 178 *          Skip white space and comment          *
 179 *************************************************/
 180
 181 /* Algorithm:
 182   (1) Skip spaces.
 183   (2) If uschar not '(', return.
 184   (3) Skip till matching ')', not counting any characters
 185       escaped with '\'.
 186   (4) Move past ')' and goto (1).
 187
 188 The start of the last potential comment position is remembered to
 189 make it possible to ignore comments at the end of compound items.
 190
 191 Argument: current character pointer
  192 Returns:  new character pointer
 193 */
 194
 195 static uschar *
 196 skip_comment(uschar *s)
 197 {
 198 last_comment_position = s;
 199 while (*s)
 200   {
 201   int c, level;
 202   while (isspace(*s)) s++;
 203   if (*s != '(') break;
 204   level = 1;
 205   while((c = *(++s)) != 0)
 206     {
 207     if (c == '(') level++;
 208     else if (c == ')') { if (--level <= 0) { s++; break; } }
 209     else if (c == '\\' && s[1] != 0) s++;
 210     }
 211   }
 212 return s;
 213 }
 214
 215
 216
 217 /*************************************************
 218 *             Read a domain                      *
 219 *************************************************/
 220
 221 /* A domain is a sequence of subdomains, separated by dots. See comments below
 222 for detailed syntax of the subdomains.
 223
 224 If allow_domain_literals is TRUE, a "domain" may also be an IP address enclosed
 225 in []. Make sure the output is set to the null string if there is a syntax
 226 error as well as if there is no domain at all.
 227
 228 Arguments:
 229   s          current character pointer
 230   t          where to put the domain
 231   errorptr   put error message here on failure (*t will be 0 on exit)
 232
 233 Returns:     new character pointer
 234 */
 235
 236 static uschar *
 237 read_domain(uschar *s, uschar *t, uschar **errorptr)
 238 {
 239 uschar *tt = t;
 240 s = skip_comment(s);
 241
 242 /* Handle domain literals if permitted. An RFC 822 domain literal may contain
 243 any character except [ ] \, including linear white space, and may contain
 244 quoted characters. However, RFC 821 restricts literals to being dot-separated
 245 3-digit numbers, and we make the obvious extension for IPv6. Go for a sequence
 246 of digits, dots, hex digits, and colons here; later this will be checked for
 247 being a syntactically valid IP address if it ever gets to a router.
 248
 249 Allow both the formal IPv6 form, with IPV6: at the start, and the informal form
 250 without it, and accept IPV4: as well, 'cause someone will use it sooner or
 251 later. */
 252
 253 if (*s == '[')
 254   {
 255   *t++ = *s++;
 256
 257   if (strncmpic(s, US"IPv6:", 5) == 0 || strncmpic(s, US"IPv4:", 5) == 0)
 258     {
 259     memcpy(t, s, 5);
 260     t += 5;
 261     s += 5;
 262     }
 263   while (*s == '.' || *s == ':' || isxdigit(*s)) *t++ = *s++;
 264
 265   if (*s == ']') *t++ = *s++; else
 266     {
 267     *errorptr = US"malformed domain literal";
 268     *tt = 0;
 269     }
 270
 271   if (!allow_domain_literals)
 272     {
 273     *errorptr = US"domain literals not allowed";
 274     *tt = 0;
 275     }
 276   *t = 0;
 277   return skip_comment(s);
 278   }
 279
 280 /* Handle a proper domain, which is a sequence of dot-separated atoms. Remove
 281 trailing dots if strip_trailing_dot is set. A subdomain is an atom.
 282
 283 An atom is a sequence of any characters except specials, space, and controls.
 284 The specials are ( ) < > @ , ; : \ " . [ and ]. This is the rule for RFC 822
 285 and its successor (RFC 2822). However, RFC 821 and its successor (RFC 2821) is
 286 tighter, allowing only letters, digits, and hyphens, not starting with a
 287 hyphen.
 288
 289 There used to be a global flag that got set when checking addresses that came
 290 in over SMTP and which should therefore should be checked according to the
 291 stricter rule. However, it seems silly to make the distinction, because I don't
 292 suppose anybody ever uses local domains that are 822-compliant and not
 293 821-compliant. Furthermore, Exim now has additional data on the spool file line
 294 after an address (after "one_time" processing), and it makes use of a #
 295 character to delimit it. When I wrote that code, I forgot about this 822-domain
 296 stuff, and assumed # could never appear in a domain.
 297
 298 So the old code is now cut out for Release 4.11 onwards, on 09-Aug-02. In a few
 299 years, when we are sure this isn't actually causing trouble, throw it away.
 300
 301 March 2003: the story continues: There is a camp that is arguing for the use of
 302 UTF-8 in domain names as the way to internationalization, and other MTAs
 303 support this. Therefore, we now have a flag that permits the use of characters
 304 with values greater than 127, encoded in UTF-8, in subdomains, so that Exim can
 305 be used experimentally in this way. */
 306
 307 for (;;)
 308   {
 309   uschar *tsave = t;
 310
 311 /*********************
 312   if (rfc821_domains)
 313     {
 314     if (*s != '-') while (isalnum(*s) || *s == '-') *t++ = *s++;
 315     }
 316   else
 317     while (!mac_iscntrl_or_special(*s)) *t++ = *s++;
 318 *********************/
 319
 320   if (*s != '-')
 321     {
 322     /* Only letters, digits, and hyphens */
 323
 324     if (!allow_utf8_domains)
 325       {
 326       while (isalnum(*s) || *s == '-') *t++ = *s++;
 327       }
 328
 329     /* Permit legal UTF-8 characters to be included */
 330
 331     else for(;;)
 332       {
 333       int i, d;
 334       if (isalnum(*s) || *s == '-')    /* legal ascii characters */
 335         {
 336         *t++ = *s++;
 337         continue;
 338         }
 339       if ((*s & 0xc0) != 0xc0) break;  /* not start of UTF-8 character */
 340       d = *s << 2;
 341       for (i = 1; i < 6; i++)          /* i is the number of additional bytes */
 342         {
 343         if ((d & 0x80) == 0) break;
 344         d <<= 1;
 345         }
 346       if (i == 6) goto BAD_UTF8;       /* invalid UTF-8 */
 347       *t++ = *s++;                     /* leading UTF-8 byte */
 348       while (i-- > 0)                  /* copy and check remainder */
 349         {
 350         if ((*s & 0xc0) != 0x80)
 351           {
 352           BAD_UTF8:
 353           *errorptr = US"invalid UTF-8 byte sequence";
 354           *tt = 0;
 355           return s;
 356           }
 357         *t++ = *s++;
 358         }
 359       }    /* End of loop for UTF-8 character */
 360     }      /* End of subdomain */
 361
 362   s = skip_comment(s);
 363   *t = 0;
 364
 365   if (t == tsave)   /* empty component */
 366     {
 367     if (strip_trailing_dot && t > tt && *s != '.') t[-1] = 0; else
 368       {
 369       *errorptr = US"domain missing or malformed";
 370       *tt = 0;
 371       }
 372     return s;
 373     }
 374
 375   if (*s != '.') break;
 376   *t++ = *s++;
 377   s = skip_comment(s);
 378   }
 379
 380 return s;
 381 }
 382
 383
 384
 385 /*************************************************
 386 *            Read a local-part                   *
 387 *************************************************/
 388
 389 /* A local-part is a sequence of words, separated by periods. A null word
 390 between dots is not strictly allowed but apparently many mailers permit it,
 391 so, sigh, better be compatible. Even accept a trailing dot...
 392
 393 A <word> is either a quoted string, or an <atom>, which is a sequence
 394 of any characters except specials, space, and controls. The specials are
 395 ( ) < > @ , ; : \ " . [ and ]. In RFC 822, a single quoted character, (a
 396 quoted-pair) is not allowed in a word. However, in RFC 821, it is permitted in
 397 the local part of an address. Rather than have separate parsing functions for
 398 the different cases, take the liberal attitude always. At least one MUA is
 399 happy to recognize this case; I don't know how many other programs do.
 400
 401 Arguments:
 402   s           current character pointer
 403   t           where to put the local part
 404   error       where to point error text
 405   allow_null  TRUE if an empty local part is not an error
 406
 407 Returns:   new character pointer
 408 */
 409
 410 static uschar *
 411 read_local_part(uschar *s, uschar *t, uschar **error, BOOL allow_null)
 412 {
 413 uschar *tt = t;
 414 *error = NULL;
 415 for (;;)
 416   {
 417   int c;
 418   uschar *tsave = t;
 419   s = skip_comment(s);
 420
 421   /* Handle a quoted string */
 422
 423   if (*s == '\"')
 424     {
 425     *t++ = '\"';
 426     while ((c = *(++s)) != 0 && c != '\"')
 427       {
 428       *t++ = c;
 429       if (c == '\\' && s[1] != 0) *t++ = *(++s);
 430       }
 431     if (c == '\"')
 432       {
 433       s++;
 434       *t++ = '\"';
 435       }
 436     else
 437       {
 438       *error = US"unmatched doublequote in local part";
 439       return s;
 440       }
 441     }
 442
 443   /* Handle an atom, but allow quoted pairs within it. */
 444
 445   else while (!mac_iscntrl_or_special(*s) || *s == '\\')
 446     {
 447     c = *t++ = *s++;
 448     if (c == '\\' && *s != 0) *t++ = *s++;
 449     }
 450
 451   /* Terminate the word and skip subsequent comment */
 452
 453   *t = 0;
 454   s = skip_comment(s);
 455
 456   /* If we have read a null component at this point, give an error unless it is
 457   terminated by a dot - an extension to RFC 822 - or if it is the first
 458   component of the local part and an empty local part is permitted, in which
 459   case just return normally. */
 460
 461   if (t == tsave && *s != '.')
 462     {
 463     if (t == tt && !allow_null)
 464       *error = US"missing or malformed local part";
 465     return s;
 466     }
 467
 468   /* Anything other than a dot terminates the local part. Treat multiple dots
 469   as a single dot, as this seems to be a common extension. */
 470
 471   if (*s != '.') break;
 472   do { *t++ = *s++; } while (*s == '.');
 473   }
 474
 475 return s;
 476 }
 477
 478
 479 /*************************************************
 480 *            Read route part of route-addr       *
 481 *************************************************/
 482
 483 /* The pointer is at the initial "@" on entry. Return it following the
 484 terminating colon. Exim no longer supports the use of source routes, but it is
 485 required to accept the syntax.
 486
 487 Arguments:
 488   s          current character pointer
 489   t          where to put the route
 490   errorptr   where to put an error message
 491
 492 Returns:     new character pointer
 493 */
 494
 495 static uschar *
 496 read_route(uschar *s, uschar *t, uschar **errorptr)
 497 {
 498 BOOL commas = FALSE;
 499 *errorptr = NULL;
 500
 501 while (*s == '@')
 502   {
 503   *t++ = '@';
 504   s = read_domain(s+1, t, errorptr);
 505   if (*t == 0) return s;
 506   t += Ustrlen((const uschar *)t);
 507   if (*s != ',') break;
 508   *t++ = *s++;
 509   commas = TRUE;
 510   s = skip_comment(s);
 511   }
 512
 513 if (*s == ':') *t++ = *s++;
 514
 515 /* If there is no colon, and there were no commas, the most likely error
 516 is in fact a missing local part in the address rather than a missing colon
 517 after the route. */
 518
 519 else *errorptr = commas?
 520   US"colon expected after route list" :
 521   US"no local part";
 522
 523 /* Terminate the route and return */
 524
 525 *t = 0;
 526 return skip_comment(s);
 527 }
 528
 529
 530
 531 /*************************************************
 532 *                Read addr-spec                  *
 533 *************************************************/
 534
 535 /* Addr-spec is local-part@domain. We make the domain optional -
 536 the expected terminator for the whole thing is passed to check this.
 537 This function is called only when we know we have a route-addr.
 538
 539 Arguments:
 540   s          current character pointer
 541   t          where to put the addr-spec
 542   term       expected terminator (0 or >)
 543   errorptr   where to put an error message
 544   domainptr  set to point to the start of the domain
 545
 546 Returns:     new character pointer
 547 */
 548
 549 static uschar *
 550 read_addr_spec(uschar *s, uschar *t, int term, uschar **errorptr,
 551   uschar **domainptr)
 552 {
 553 s = read_local_part(s, t, errorptr, FALSE);
 554 if (*errorptr == NULL)
 555   {
 556   if (*s != term)
 557     {
 558     if (*s != '@')
 559       *errorptr = string_sprintf("\"@\" or \".\" expected after \"%s\"", t);
 560     else
 561       {
 562       t += Ustrlen((const uschar *)t);
 563       *t++ = *s++;
 564       *domainptr = t;
 565       s = read_domain(s, t, errorptr);
 566       }
 567     }
 568   }
 569 return s;
 570 }
 571
 572
 573
 574 /*************************************************
 575 *         Extract operative address              *
 576 *************************************************/
 577
 578 /* This function extracts an operative address from a full RFC822 mailbox and
 579 returns it in a piece of dynamic store. We take the easy way and get a piece
 580 of store the same size as the input, and then copy into it whatever is
 581 necessary. If we cannot find a valid address (syntax error), return NULL, and
 582 point the error pointer to the reason. The arguments "start" and "end" are used
 583 to return the offsets of the first and one past the last characters in the
 584 original mailbox of the address that has been extracted, to aid in re-writing.
 585 The argument "domain" is set to point to the first character after "@" in the
 586 final part of the returned address, or zero if there is no @.
 587
 588 Exim no longer supports the use of source routed addresses (those of the form
 589 @domain,...:route_addr). It recognizes the syntax, but collapses such addresses
 590 down to their final components. Formerly, collapse_source_routes had to be set
 591 to achieve this effect. RFC 1123 allows collapsing with MAY, while the revision
 592 of RFC 821 had increased this to SHOULD, so I've gone for it, because it makes
 593 a lot of code elsewhere in Exim much simpler.
 594
 595 There are some special fudges here for handling RFC 822 group address notation
 596 which may appear in certain headers. If the flag parse_allow_group is set
 597 TRUE and parse_found_group is FALSE when this function is called, an address
 598 which is the start of a group (i.e. preceded by a phrase and a colon) is
 599 recognized; the phrase is ignored and the flag parse_found_group is set. If
 600 this flag is TRUE at the end of an address, and if an extraneous semicolon is
 601 found, it is ignored and the flag is cleared.
 602
 603 This logic is used only when scanning through addresses in headers, either to
 604 fulfil the -t option, or for rewriting, or for checking header syntax. Because
 605 the group "state" has to be remembered between multiple calls of this function,
 606 the variables parse_{allow,found}_group are global. It is important to ensure
 607 that they are reset to FALSE at the end of scanning a header's list of
 608 addresses.
 609
 610 Arguments:
 611   mailbox     points to the RFC822 mailbox
 612   errorptr    where to point an error message
 613   start       set to start offset in mailbox
 614   end         set to end offset in mailbox
 615   domain      set to domain offset in result, or 0 if no domain present
 616   allow_null  allow <> if TRUE
 617
 618 Returns:      points to the extracted address, or NULL on error
 619 */
 620
 621 #define FAILED(s) { *errorptr = s; goto PARSE_FAILED; }
 622
 623 uschar *
 624 parse_extract_address(uschar *mailbox, uschar **errorptr, int *start, int *end,
 625   int *domain, BOOL allow_null)
 626 {
 627 uschar *yield = store_get(Ustrlen(mailbox) + 1);
 628 uschar *startptr, *endptr;
 629 uschar *s = (uschar *)mailbox;
 630 uschar *t = (uschar *)yield;
 631
 632 *domain = 0;
 633
 634 /* At the start of the string we expect either an addr-spec or a phrase
 635 preceding a <route-addr>. If groups are allowed, we might also find a phrase
 636 preceding a colon and an address. If we find an initial word followed by
 637 a dot, strict interpretation of the RFC would cause it to be taken
 638 as the start of an addr-spec. However, many mailers break the rules
 639 and use addresses of the form "a.n.other <ano@somewhere>" and so we
 640 allow this case. */
 641
 642 RESTART:   /* Come back here after passing a group name */
 643
 644 s = skip_comment(s);
 645 startptr = s;                                 /* In case addr-spec */
 646 s = read_local_part(s, t, errorptr, TRUE);    /* Dot separated words */
 647 if (*errorptr != NULL) goto PARSE_FAILED;
 648
 649 /* If the terminator is neither < nor @ then the format of the address
 650 must either be a bare local-part (we are now at the end), or a phrase
 651 followed by a route-addr (more words must follow). */
 652
 653 if (*s != '@' && *s != '<')
 654   {
 655   if (*s == 0 || *s == ';')
 656     {
 657     if (*t == 0) FAILED(US"empty address");
 658     endptr = last_comment_position;
 659     goto PARSE_SUCCEEDED;              /* Bare local part */
 660     }
 661
 662   /* Expect phrase route-addr, or phrase : if groups permitted, but allow
 663   dots in the phrase; complete the loop only when '<' or ':' is encountered -
 664   end of string will produce a null local_part and therefore fail. We don't
 665   need to keep updating t, as the phrase isn't to be kept. */
 666
 667   while (*s != '<' && (!parse_allow_group || *s != ':'))
 668     {
 669     s = read_local_part(s, t, errorptr, FALSE);
 670     if (*errorptr != NULL)
 671       {
 672       *errorptr = string_sprintf("%s (expected word or \"<\")", *errorptr);
 673       goto PARSE_FAILED;
 674       }
 675     }
 676
 677   if (*s == ':')
 678     {
 679     parse_found_group = TRUE;
 680     parse_allow_group = FALSE;
 681     s++;
 682     goto RESTART;
 683     }
 684
 685   /* Assert *s == '<' */
 686   }
 687
 688 /* At this point the next character is either '@' or '<'. If it is '@', only a
 689 single local-part has previously been read. An angle bracket signifies the
 690 start of an <addr-spec>. Throw away anything we have saved so far before
 691 processing it. Note that this is "if" rather than "else if" because it's also
 692 used after reading a preceding phrase.
 693
 694 There are a lot of broken sendmails out there that put additional pairs of <>
 695 round <route-addr>s. If strip_excess_angle_brackets is set, allow any number of
 696 them, as long as they match. */
 697
 698 if (*s == '<')
 699   {
 700   uschar *domainptr = yield;
 701   BOOL source_routed = FALSE;
 702   int bracket_count = 1;
 703
 704   s++;
 705   if (strip_excess_angle_brackets)
 706     while (*s == '<') { bracket_count++; s++; }
 707
 708   t = yield;
 709   startptr = s;
 710   s = skip_comment(s);
 711
 712   /* Read an optional series of routes, each of which is a domain. They
 713   are separated by commas and terminated by a colon. However, we totally ignore
 714   such routes (RFC 1123 says we MAY, and the revision of RFC 821 says we
 715   SHOULD). */
 716
 717   if (*s == '@')
 718     {
 719     s = read_route(s, t, errorptr);
 720     if (*errorptr != NULL) goto PARSE_FAILED;
 721     *t = 0;                  /* Ensure route is ignored - probably overkill */
 722     source_routed = TRUE;
 723     }
 724
 725   /* Now an addr-spec, terminated by '>'. If there is no preceding route,
 726   we must allow an empty addr-spec if allow_null is TRUE, to permit the
 727   address "<>" in some circumstances. A source-routed address MUST have
 728   a domain in the final part. */
 729
 730   if (allow_null && !source_routed && *s == '>')
 731     {
 732     *t = 0;
 733     *errorptr = NULL;
 734     }
 735   else
 736     {
 737     s = read_addr_spec(s, t, '>', errorptr, &domainptr);
 738     if (*errorptr != NULL) goto PARSE_FAILED;
 739     *domain = domainptr - yield;
 740     if (source_routed && *domain == 0)
 741       FAILED(US"domain missing in source-routed address");
 742     }
 743
 744   endptr = s;
 745   if (*errorptr != NULL) goto PARSE_FAILED;
 746   while (bracket_count-- > 0) if (*s++ != '>')
 747     {
 748     *errorptr = (s[-1] == 0)? US"'>' missing at end of address" :
 749       string_sprintf("malformed address: %.32s may not follow %.*s",
 750         s-1, s - (uschar *)mailbox - 1, mailbox);
 751     goto PARSE_FAILED;
 752     }
 753
 754   s = skip_comment(s);
 755   }
 756
 757 /* Hitting '@' after the first local-part means we have definitely got an
 758 addr-spec, on a strict reading of the RFC, and the rest of the string
 759 should be the domain. However, for flexibility we allow for a route-address
 760 not enclosed in <> as well, which is indicated by an empty first local
 761 part preceding '@'. The source routing is, however, ignored. */
 762
 763 else if (*t == 0)
 764   {
 765   uschar *domainptr = yield;
 766   s = read_route(s, t, errorptr);
 767   if (*errorptr != NULL) goto PARSE_FAILED;
 768   *t = 0;         /* Ensure route is ignored - probably overkill */
 769   s = read_addr_spec(s, t, 0, errorptr, &domainptr);
 770   if (*errorptr != NULL) goto PARSE_FAILED;
 771   *domain = domainptr - yield;
 772   endptr = last_comment_position;
 773   if (*domain == 0) FAILED(US"domain missing in source-routed address");
 774   }
 775
 776 /* This is the strict case of local-part@domain. */
 777
 778 else
 779   {
 780   t += Ustrlen((const uschar *)t);
 781   *t++ = *s++;
 782   *domain = t - yield;
 783   s = read_domain(s, t, errorptr);
 784   if (*t == 0) goto PARSE_FAILED;
 785   endptr = last_comment_position;
 786   }
 787
 788 /* Use goto to get here from the bare local part case. Arrive by falling
 789 through for other cases. Endptr may have been moved over whitespace, so
 790 move it back past white space if necessary. */
 791
 792 PARSE_SUCCEEDED:
 793 if (*s != 0)
 794   {
 795   if (parse_found_group && *s == ';')
 796     {
 797     parse_found_group = FALSE;
 798     parse_allow_group = TRUE;
 799     }
 800   else
 801     {
 802     *errorptr = string_sprintf("malformed address: %.32s may not follow %.*s",
 803       s, s - (uschar *)mailbox, mailbox);
 804     goto PARSE_FAILED;
 805     }
 806   }
 807 *start = startptr - (uschar *)mailbox;      /* Return offsets */
 808 while (isspace(endptr[-1])) endptr--;
 809 *end = endptr - (uschar *)mailbox;
 810
 811 /* Although this code has no limitation on the length of address extracted,
 812 other parts of Exim may have limits, and in any case, RFC 2821 limits local
 813 parts to 64 and domains to 255, so we do a check here, giving an error if the
 814 address is ridiculously long. */
 815
 816 if (*end - *start > ADDRESS_MAXLENGTH)
 817   {
 818   *errorptr = string_sprintf("address is ridiculously long: %.64s...", yield);
 819   return NULL;
 820   }
 821
 822 return (uschar *)yield;
 823
 824 /* Use goto (via the macro FAILED) to get to here from a variety of places.
 825 We might have an empty address in a group - the caller can choose to ignore
 826 this. We must, however, keep the flags correct. */
 827
 828 PARSE_FAILED:
 829 if (parse_found_group && *s == ';')
 830   {
 831   parse_found_group = FALSE;
 832   parse_allow_group = TRUE;
 833   }
 834 return NULL;
 835 }
 836
 837 #undef FAILED
 838
 839
 840
 841 /*************************************************
 842 *        Quote according to RFC 2047             *
 843 *************************************************/
 844
 845 /* This function is used for quoting text in headers according to RFC 2047.
 846 If the only characters that strictly need quoting are spaces, we return the
 847 original string, unmodified. If a quoted string is too long for the buffer, it
 848 is truncated. (This shouldn't happen: this is normally handling short strings.)
 849
 850 Hmmph. As always, things get perverted for other uses. This function was
 851 originally for the "phrase" part of addresses. Now it is being used for much
 852 longer texts in ACLs and via the ${rfc2047: expansion item. This means we have
 853 to check for overlong "encoded-word"s and split them. November 2004.
 854
 855 Arguments:
 856   string       the string to quote - already checked to contain non-printing
 857                  chars
 858   len          the length of the string
 859   charset      the name of the character set; NULL => iso-8859-1
 860   buffer       the buffer to put the answer in
 861   buffer_size  the size of the buffer
 862   fold         if TRUE, a newline is inserted before the separating space when
 863                  more than one encoded-word is generated
 864
 865 Returns:       pointer to the original string, if no quoting needed, or
 866                pointer to buffer containing the quoted string, or
 867                a pointer to "String too long" if the buffer can't even hold
 868                the introduction
 869 */
 870
 871 uschar *
 872 parse_quote_2047(uschar *string, int len, uschar *charset, uschar *buffer,
 873   int buffer_size, BOOL fold)
 874 {
 875 uschar *s = string;
 876 uschar *p, *t;
 877 int hlen;
 878 BOOL coded = FALSE;
 879
 880 if (charset == NULL) charset = US"iso-8859-1";
 881
 882 /* We don't expect this to fail! */
 883
 884 if (!string_format(buffer, buffer_size, "=?%s?Q?", charset))
 885   return US"String too long";
 886
 887 hlen = Ustrlen(buffer);

     /* The closing "?=" and the terminating zero are written after the
     introduction even when nothing at all is encoded, so they must fit too. */

     if (hlen + 3 > buffer_size) return US"String too long";

     /* t is the output pointer; p is the start of the current encoded-word, so
     t - p is the length of that word so far. */

 888 t = buffer + hlen;
 889 p = buffer;
 890
 891 for (; len > 0; len--)
 892   {
 893   int ch = *s++;

       /* Stop, silently truncating the result, unless the worst case still
       fits: a word split ("?=", newline, space, and a fresh introduction, i.e.
       4 + hlen bytes), one "=XX" triplet (3 bytes), and the final "?=" plus
       terminating zero (3 bytes). A margin of only hlen + 8 is two bytes short
       of that and lets the terminator land beyond the end of the buffer. */

 894   if (t > buffer + buffer_size - hlen - 10) break;
 895

       /* Once the current encoded-word has passed 70 characters, close it and
       start another one by copying the introduction from the front of the
       buffer. */

 896   if (t - p > 70)
 897     {
 898     *t++ = '?';
 899     *t++ = '=';
 900     if (fold) *t++ = '\n';
 901     *t++ = ' ';
 902     p = t;
 903     Ustrncpy(p, buffer, hlen);
 904     t += hlen;
 905     }
 906

       /* Characters outside 33-126 and those listed below are not copied
       literally. A space becomes an underscore; anything else becomes "=XX" in
       upper case hex. Only the "=XX" form sets "coded", so a string that needs
       nothing but underscores is handed back unchanged at the end. */

 907   if (ch < 33 || ch > 126 ||
 908       Ustrchr("?=()<>@,;:\\\".[]_", ch) != NULL)
 909     {
 910     if (ch == ' ') *t++ = '_'; else
 911       {
 912       sprintf(CS t, "=%02X", ch);
 913       while (*t != 0) t++;
 914       coded = TRUE;
 915       }
 916     }
 917   else *t++ = ch;
 918   }
 919

     /* Close the final encoded-word and terminate the string. */

 920 *t++ = '?';
 921 *t++ = '=';
 922 *t = 0;
 923
 924 return coded? buffer : string;
 925 }
 926
 927
 928
 929
 930 /*************************************************
 931 *            Fix up an RFC 822 "phrase"          *
 932 *************************************************/
 933
 934 /* This function is called to repair any syntactic defects in the "phrase" part
 935 of an RFC822 address. In particular, it is applied to the user's name as read
 936 from the passwd file when accepting a local message, and to the data from the
 937 -F option.
 938
 939 If the string contains existing quoted strings or comments containing
 940 freestanding quotes, then we just quote those bits that need quoting -
 941 otherwise it would get awfully messy and probably not look good. If not, we
 942 quote the whole thing if necessary. Thus
 943
 944    John Q. Smith            =>  "John Q. Smith"
 945    John "Jack" Smith        =>  John "Jack" Smith
 946    John "Jack" Q. Smith     =>  John "Jack" "Q." Smith
 947    John (Jack) Q. Smith     =>  "John (Jack) Q. Smith"
 948    John ("Jack") Q. Smith   =>  John ("Jack") "Q." Smith
 949 but
 950    John (\"Jack\") Q. Smith =>  "John (\"Jack\") Q. Smith"
 951
 952 Sheesh! This is tedious code. It is a great pity that the syntax of RFC822 is
 953 the way it is...
 954
 955 August 2000: Additional code added:
 956
 957   Previously, non-printing characters were turned into question marks, which do
 958   not need to be quoted.
 959
 960   Now, a different tactic is used if there are any non-printing ASCII
 961   characters. The encoding method from RFC 2047 is used, assuming iso-8859-1 as
 962   the character set.
 963
 964   We *could* use this for all cases, getting rid of the messy original code,
 965   but leave it for now. It would complicate simple cases like "John Q. Smith".
 966
 967 The result is passed back in the buffer; it is usually going to be added to
 968 some other string. In order to be sure there is going to be no overflow,
 969 restrict the length of the input to 1/4 of the buffer size - this allows for
 970 every single character to be quoted or encoded without overflowing, and that
 971 wouldn't happen because of amalgamation. If the phrase is too long, return a
 972 fixed string.
 973
 974 Arguments:
 975   phrase       an RFC822 phrase
 976   len          the length of the phrase
 977   buffer       a buffer to put the result in
 978   buffer_size  the size of the buffer
 979
 980 Returns:       the fixed RFC822 phrase
 981 */
 982
 983 uschar *
 984 parse_fix_phrase(uschar *phrase, int len, uschar *buffer, int buffer_size)
 985 {
     /* Summary: skip leading white space, give up with a fixed string if the
     phrase is longer than a quarter of the buffer, hand the phrase to
     parse_quote_2047() if it contains any byte outside the printing ASCII range
     (tab excepted), and otherwise copy it into the buffer adding RFC 822
     double quotes where special characters require them. */

 986 int ch, i;
 987 BOOL quoted = FALSE;
 988 uschar *s, *t, *end, *yield;
 989
 990 while (len > 0 && isspace(*phrase)) { phrase++; len--; }
 991 if (len > buffer_size/4) return US"Name too long";
 992
 993 /* See if there are any non-printing characters, and if so, use the RFC 2047
 994 encoding for the whole thing. */
 995
 996 for (i = 0, s = phrase; i < len; i++, s++)
 997   if ((*s < 32 && *s != '\t') || *s > 126) break;
 998
 999 if (i < len) return parse_quote_2047(phrase, len, headers_charset, buffer,
1000   buffer_size, FALSE);
1001
1002 /* No non-printers; use the RFC 822 quoting rules */
1003

     /* Output starts at buffer + 1 so that one opening quote can later be
     stored in front of everything copied so far, by decrementing yield. */

1004 s = phrase;
1005 end = s + len;
1006 yield = t = buffer + 1;
1007
1008 while (s < end)
1009   {
1010   ch = *s++;
1011
1012   /* Copy over quoted strings, remembering we encountered one */
1013
1014   if (ch == '\"')
1015     {
1016     *t++ = '\"';
1017     while (s < end && (ch = *s++) != '\"')
1018       {
1019       *t++ = ch;
1020       if (ch == '\\' && s < end) *t++ = *s++;
1021       }
1022     *t++ = '\"';
1023     if (s >= end) break;
1024     quoted = TRUE;
1025     }
1026
1027   /* Copy over comments, noting if they contain freestanding quote
1028   characters */
1029
1030   else if (ch == '(')
1031     {
1032     int level = 1;
1033     *t++ = '(';
1034     while (s < end)
1035       {
1036       ch = *s++;
1037       *t++ = ch;
1038       if (ch == '(') level++;
1039       else if (ch == ')') { if (--level <= 0) break; }
1040       else if (ch == '\\' && s < end) *t++ = *s++ & 127;
1041       else if (ch == '\"') quoted = TRUE;
1042       }

         /* NOTE(review): bytes below 32 other than tab send the whole phrase to
         parse_quote_2047() before this loop is reached, so ch == 0 looks
         impossible here and an unterminated comment appears to be left
         unclosed - confirm whether testing for the end of the input was
         intended. */

1043     if (ch == 0)
1044       {
1045       while (level--) *t++ = ')';
1046       break;
1047       }
1048     }
1049
1050   /* Handle special characters that need to be quoted */
1051
1052   else if (Ustrchr(")<>@,;:\\.[]", ch) != NULL)
1053     {
1054     /* If hit previous quotes just make one quoted "word" */
1055
1056     if (quoted)
1057       {
           /* Move the characters of the current word up by one, working
           backwards until a space, quote or closing parenthesis is found, and
           put an opening quote into the gap. Then copy the rest of the word,
           up to the next space or quote, and close the quote. */

1058       uschar *tt = t++;
1059       while (*(--tt) != ' ' && *tt != '\"' && *tt != ')') tt[1] = *tt;
1060       tt[1] = '\"';
1061       *t++ = ch;
1062       while (s < end)
1063         {
1064         ch = *s++;
1065         if (ch == ' ' || ch == '\"') { s--; break; } else *t++ = ch;
1066         }
1067       *t++ = '\"';
1068       }
1069
1070     /* Else quote the whole string so far, and the rest up to any following
1071     quotes. We must treat anything following a backslash as a literal. */
1072
1073     else
1074       {
1075       BOOL escaped = (ch == '\\');
1076       *(--yield) = '\"';
1077       *t++ = ch;
1078
1079       /* Now look for the end or a quote */
1080
1081       while (s < end)
1082         {
1083         ch = *s++;
1084
1085         /* Handle escaped pairs */
1086
1087         if (escaped)
1088           {
1089           *t++ = ch;
1090           escaped = FALSE;
1091           }
1092
1093         else if (ch == '\\')
1094           {
1095           *t++ = ch;
1096           escaped = TRUE;
1097           }
1098
1099         /* If hit subsequent quotes, insert our quote before any trailing
1100         spaces and back up to re-handle the quote in the outer loop. */
1101
1102         else if (ch == '\"')
1103           {
1104           int count = 0;
1105           while (t[-1] == ' ') { t--; count++; }
1106           *t++ = '\"';
1107           while (count-- > 0) *t++ = ' ';
1108           s--;
1109           break;
1110           }
1111
1112         /* If hit a subsequent comment, check it for unescaped quotes,
1113         and if so, end our quote before it. */
1114
1115         else if (ch == '(')
1116           {
1117           uschar *ss = s;     /* char after '(' */
1118           int level = 1;
1119           while(ss < end)
1120             {
1121             ch = *ss++;
1122             if (ch == '(') level++;
1123             else if (ch == ')') { if (--level <= 0) break; }
1124             else if (ch == '\\' && ss+1 < end) ss++;
1125             else if (ch == '\"') { quoted = TRUE; break; }
1126             }
1127
1128           /* Comment contains unescaped quotes; end our quote before
1129           the start of the comment. */
1130

               /* Nothing of the comment has been copied and s still points
               just after the '(', so the comment body is processed again by
               the outer loop. */

1131           if (quoted)
1132             {
1133             int count = 0;
1134             while (t[-1] == ' ') { t--; count++; }
1135             *t++ = '\"';
1136             while (count-- > 0) *t++ = ' ';
1137             break;
1138             }
1139
1140           /* Comment does not contain unescaped quotes; include it in
1141           our quote. */
1142
1143           else
1144             {
1145             if (ss >= end) ss--;
1146             *t++ = '(';
1147             Ustrncpy(t, s, ss-s);
1148             t += ss-s;
1149             s = ss;
1150             }
1151           }
1152
1153         /* Not a comment or quote; include this character in our quotes. */
1154
1155         else *t++ = ch;
1156         }
1157       }
1158
1159     /* Add a final quote if we hit the end of the string. */
1160
1161     if (s >= end) *t++ = '\"';
1162     }
1163
1164   /* Non-special character; just copy it over */
1165
1166   else *t++ = ch;
1167   }
1168

     /* Terminate the result. yield is either buffer + 1 or, if an opening quote
     was stored in front, buffer itself. */

1169 *t = 0;
1170 return yield;
1171 }
1172
1173
1174 /*************************************************
1175 *          Extract addresses from a list         *
1176 *************************************************/
1177
1178 /* This function is called by the redirect router to scan a string containing a
1179 list of addresses separated by commas (with optional white space) or by
1180 newlines, and to generate a chain of address items from them. In other words,
1181 to unpick data from an alias or .forward file.
1182
1183 The SunOS5 documentation for alias files is not very clear on the syntax; it
1184 does not say that either a comma or a newline can be used for separation.
1185 However, that is the way Smail does it, so we follow suit.
1186
1187 If a # character is encountered in a white space position, then characters from
1188 there to the next newline are skipped.
1189
1190 If an unqualified address begins with '\', just skip that character. This gives
1191 compatibility with Sendmail's use of \ to prevent looping. Exim has its own
1192 loop prevention scheme which handles other cases too - see the code in
1193 route_address().
1194
1195 An "address" can be a specification of a file or a pipe; the latter may often
1196 need to be quoted because it may contain spaces, but we don't want to retain
1197 the quotes. Quotes may appear in normal addresses too, and should be retained.
1198 We can distinguish between these cases, because in addresses, quotes are used
1199 only for parts of the address, not the whole thing. Therefore, we remove quotes
1200 from items when they entirely enclose them, but not otherwise.
1201
1202 An "address" can also be of the form :include:pathname to include a list of
1203 addresses contained in the specified file.
1204
1205 Any unqualified addresses are qualified, and rewritten if necessary, via the
1206 rewrite_address() or rewrite_address_qualify() function.
1207
1208 Arguments:
1209   s                the list of addresses (typically a complete
1210                      .forward file or a list of entries in an alias file)
1211   options          option bits for permitting or denying various special cases;
1212                      not all bits are relevant here - some are for filter
1213                      files; those we use here are:
1214                        RDO_DEFER
1215                        RDO_FREEZE
1216                        RDO_FAIL
1217                        RDO_BLACKHOLE
1218                        RDO_REWRITE
1219                        RDO_INCLUDE
1220   anchor           where to hang the chain of newly-created addresses. This
1221                      should be initialized to NULL.
1222   error            where to return an error text
1223   incoming_domain  domain of the incoming address; used to qualify unqualified
1224                      local parts preceded by \
1225   directory        if NULL, no checks are done on :include: files
1226                    otherwise, included file names must start with the given
1227                      directory
1228   syntax_errors    if not NULL, it carries on after syntax errors in addresses,
1229                      building up a list of errors as error blocks chained on
1230                      here.
1231
1232 Returns:      FF_DELIVERED      addresses extracted
1233               FF_NOTDELIVERED   no addresses extracted, but no errors
1234               FF_BLACKHOLE      :blackhole:
1235               FF_DEFER          :defer:
1236               FF_FAIL           :fail:
1237               FF_INCLUDEFAIL    some problem with :include:; *error set
1238               FF_ERROR          other problems; *error is set
1239 */
1240
1241 int
1242 parse_forward_list(uschar *s, int options, address_item **anchor,
1243   uschar **error, uschar *incoming_domain, uschar *directory,
1244   error_block **syntax_errors)
1245 {
     /* Summary: walk the list one item at a time. Each item is either skipped
     (:unknown:, empty address), ends the scan (:defer:, :blackhole:, :fail:, or
     an error), is expanded by a recursive call (:include:), or yields one
     address_item that is pushed onto the front of *anchor. The input string is
     modified temporarily (a zero is planted at the end of the current item) and
     restored afterwards, except on the :defer:/:blackhole:/:fail: returns.
     count is the number of items added to *anchor by this call. */

1246 int count = 0;
1247
1248 DEBUG(D_route) debug_printf("parse_forward_list: %s\n", s);
1249
1250 for (;;)
1251   {
1252   int len;
1253   int special = 0;
1254   int specopt = 0;
1255   int specbit = 0;
1256   uschar *ss, *nexts;
1257   address_item *addr;
1258   BOOL inquote = FALSE;
1259

       /* Skip white space, commas, and # comments (which run to end of line). */

1260   for (;;)
1261     {
1262     while (isspace(*s) || *s == ',') s++;
1263     if (*s == '#') { while (*s != 0 && *s != '\n') s++; } else break;
1264     }
1265
1266   /* When we reach the end of the list, we return FF_DELIVERED if any child
1267   addresses have been generated. If nothing has been generated, there are two
1268   possibilities: either the list is really empty, or there were syntax errors
1269   that are being skipped. (If syntax errors are not being skipped, an FF_ERROR
1270   return is generated on hitting a syntax error and we don't get here.) For a
1271   truly empty list we return FF_NOTDELIVERED so that the router can decline.
1272   However, if the list is empty only because syntax errors were skipped, we
1273   return FF_DELIVERED. */
1274
1275   if (*s == 0)
1276     {
1277     return (count > 0 || (syntax_errors != NULL && *syntax_errors != NULL))?
1278       FF_DELIVERED : FF_NOTDELIVERED;
1279
1280     /* This previous code returns FF_ERROR if nothing is generated but a
1281     syntax error has been skipped. I now think it is the wrong approach, but
1282     have left this here just in case, and for the record. */
1283
1284     #ifdef NEVER
1285     if (count > 0) return FF_DELIVERED;   /* Something was generated */
1286
1287     if (syntax_errors == NULL ||          /* Not skipping syntax errors, or */
1288        *syntax_errors == NULL)            /*   we didn't actually skip any */
1289       return FF_NOTDELIVERED;
1290
1291     *error = string_sprintf("no addresses generated: syntax error in %s: %s",
1292        (*syntax_errors)->text2, (*syntax_errors)->text1);
1293     return FF_ERROR;
1294     #endif
1295
1296     }
1297
1298   /* Find the end of the next address. Quoted strings in addresses may contain
1299   escaped characters; I haven't found a proper specification of .forward or
1300   alias files that mentions the quoting properties, but it seems right to do
1301   the escaping thing in all cases, so use the function that finds the end of an
1302   address. However, don't let a quoted string extend over the end of a line. */
1303
1304   ss = parse_find_address_end(s, TRUE);
1305
1306   /* Remember where we finished, for starting the next one. */
1307
1308   nexts = ss;
1309
1310   /* Remove any trailing spaces; we know there's at least one non-space. */
1311
1312   while (isspace((ss[-1]))) ss--;
1313
1314   /* We now have s->start and ss->end of the next address. Remove quotes
1315   if they completely enclose, remembering the address started with a quote
1316   for handling pipes and files. Another round of removal of leading and
1317   trailing spaces is then required. */
1318
1319   if (*s == '\"' && ss[-1] == '\"')
1320     {
1321     s++;
1322     ss--;
1323     inquote = TRUE;
1324     while (s < ss && isspace(*s)) s++;
1325     while (ss > s && isspace((ss[-1]))) ss--;
1326     }
1327
1328   /* Set up the length of the address. */
1329
1330   len = ss - s;
1331
1332   DEBUG(D_route)
1333     {
1334     int save = s[len];
1335     s[len] = 0;
1336     debug_printf("extract item: %s\n", s);
1337     s[len] = save;
1338     }
1339
1340   /* Handle special addresses if permitted. If the address is :unknown:
1341   ignore it - this is for backward compatibility with old alias files. You
1342   don't need to use it nowadays - just generate an empty string. For :defer:,
1343   :blackhole:, or :fail: we have to set up the error message and give up right
1344   away. */
1345

       /* NOTE(review): the comparison length is len, so an item that is only a
       leading part of ":unknown:" (or has zero length) also matches here -
       confirm whether that is intended. */

1346   if (Ustrncmp(s, ":unknown:", len) == 0)
1347     {
1348     s = nexts;
1349     continue;
1350     }
1351
1352   if      (Ustrncmp(s, ":defer:", 7) == 0)
1353     { special = FF_DEFER; specopt = RDO_DEFER; }  /* specbit is 0 */
1354   else if (Ustrncmp(s, ":blackhole:", 11) == 0)
1355     { special = FF_BLACKHOLE; specopt = specbit = RDO_BLACKHOLE; }
1356   else if (Ustrncmp(s, ":fail:", 6) == 0)
1357     { special = FF_FAIL; specopt = RDO_FAIL; }  /* specbit is 0 */
1358

       /* The item is refused when (options & specopt) equals specbit, that is,
       for :defer: and :fail: when their option bit is clear, and for
       :blackhole: when its option bit is set. Otherwise the text after the
       special word, up to the end of the line, is returned in *error. */

1359   if (special != 0)
1360     {
1361     uschar *ss = Ustrchr(s+1, ':') + 1;
1362     if ((options & specopt) == specbit)
1363       {
1364       *error = string_sprintf("\"%.*s\" is not permitted", len, s);
1365       return FF_ERROR;
1366       }
1367     while (*ss != 0 && isspace(*ss)) ss++;
1368     while (s[len] != 0 && s[len] != '\n') len++;
1369     s[len] = 0;
1370     *error = string_copy(ss);
1371     return special;
1372     }
1373
1374   /* If the address is of the form :include:pathname, read the file, and call
1375   this function recursively to extract the addresses from it. If directory is
1376   NULL, do no checks. Otherwise, insist that the file name starts with the
1377   given directory and is a regular file. */
1378
1379   if (Ustrncmp(s, ":include:", 9) == 0)
1380     {
1381     uschar *filebuf;
1382     uschar filename[256];
1383     uschar *t = s+9;
1384     int flen = len - 9;
1385     int frc;
1386     struct stat statbuf;
1387     address_item *last;
1388     FILE *f;
1389
1390     while (flen > 0 && isspace(*t)) { t++; flen--; }
1391
1392     if (flen <= 0)
1393       {
1394       *error = string_sprintf("file name missing after :include:");
1395       return FF_ERROR;
1396       }
1397

         /* The item is not zero-terminated at this point, so the length must
         be given explicitly when the name is put into the message; otherwise
         the rest of the list would be included as well. */

1398     if (flen > 255)
1399       {
1400       *error = string_sprintf("included file name \"%.*s\" is too long",
             flen, t);
1401       return FF_ERROR;
1402       }
1403
1404     Ustrncpy(filename, t, flen);
1405     filename[flen] = 0;
1406
1407     /* Insist on absolute path */
1408
1409     if (filename[0]!= '/')
1410       {
1411       *error = string_sprintf("included file \"%s\" is not an absolute path",
1412         filename);
1413       return FF_ERROR;
1414       }
1415
1416     /* Check if include is permitted */
1417
1418     if ((options & RDO_INCLUDE) != 0)
1419       {
1420       *error = US"included files not permitted";
1421       return FF_ERROR;
1422       }
1423
1424     /* Check file name if required */
1425
1426     if (directory != NULL)
1427       {
1428       int len = Ustrlen(directory);
1429       uschar *p = filename + len;
1430
1431       if (Ustrncmp(filename, directory, len) != 0 || *p != '/')
1432         {
1433         *error = string_sprintf("included file %s is not in directory %s",
1434           filename, directory);
1435         return FF_ERROR;
1436         }
1437
1438       /* It is necessary to check that every component inside the directory
1439       is NOT a symbolic link, in order to keep the file inside the directory.
1440       This is mighty tedious. It is also not totally foolproof in that it
1441       leaves the possibility of a race attack, but I don't know how to do
1442       any better. */
1443

           /* Each pass advances p to the next '/' or the end, cuts the name
           there temporarily, and examines that prefix with Ulstat(). */

1444       while (*p != 0)
1445         {
1446         int temp;
1447         while (*(++p) != 0 && *p != '/');
1448         temp = *p;
1449         *p = 0;
1450         if (Ulstat(filename, &statbuf) != 0)
1451           {
1452           *error = string_sprintf("failed to stat %s (component of included "
1453             "file)", filename);
1454           *p = temp;
1455           return FF_ERROR;
1456           }
1457
1458         *p = temp;
1459
1460         if ((statbuf.st_mode & S_IFMT) == S_IFLNK)
1461           {
1462           *error = string_sprintf("included file %s in the %s directory "
1463             "involves a symbolic link", filename, directory);
1464           return FF_ERROR;
1465           }
1466         }
1467       }
1468
1469     /* Open and stat the file */
1470
1471     if ((f = Ufopen(filename, "rb")) == NULL)
1472       {
1473       *error = string_open_failed(errno, "included file %s", filename);
1474       return FF_INCLUDEFAIL;
1475       }
1476

         /* From here on the file is open, so every error return must close it
         first. */

1477     if (fstat(fileno(f), &statbuf) != 0)
1478       {
1479       *error = string_sprintf("failed to stat included file %s: %s",
1480         filename, strerror(errno));
1481       (void)fclose(f);
1482       return FF_INCLUDEFAIL;
1483       }
1484
1485     /* If directory was checked, double check that we opened a regular file */
1486
1487     if (directory != NULL && (statbuf.st_mode & S_IFMT) != S_IFREG)
1488       {
1489       *error = string_sprintf("included file %s is not a regular file in "
1490         "the %s directory", filename, directory);
           (void)fclose(f);
1491       return FF_ERROR;
1492       }
1493
1494     /* Get a buffer and read the contents */
1495
1496     if (statbuf.st_size > MAX_INCLUDE_SIZE)
1497       {
1498       *error = string_sprintf("included file %s is too big (max %d)",
1499         filename, MAX_INCLUDE_SIZE);
           (void)fclose(f);
1500       return FF_ERROR;
1501       }
1502
1503     filebuf = store_get(statbuf.st_size + 1);
1504     if (fread(filebuf, 1, statbuf.st_size, f) != statbuf.st_size)
1505       {
1506       *error = string_sprintf("error while reading included file %s: %s",
1507         filename, strerror(errno));
1508       (void)fclose(f);
1509       return FF_ERROR;
1510       }
1511     filebuf[statbuf.st_size] = 0;
1512     (void)fclose(f);
1513

         /* Parse the file contents into a separate chain, then splice that
         chain onto the front of *anchor, counting each address in it. */

1514     addr = NULL;
1515     frc = parse_forward_list(filebuf, options, &addr,
1516       error, incoming_domain, directory, syntax_errors);
1517     if (frc != FF_DELIVERED && frc != FF_NOTDELIVERED) return frc;
1518
1519     if (addr != NULL)
1520       {
1521       last = addr;
1522       while (last->next != NULL) { count++; last = last->next; }
1523       last->next = *anchor;
1524       *anchor = addr;
1525       count++;
1526       }
1527     }
1528
1529   /* Else (not :include:) ensure address is syntactically correct and fully
1530   qualified if not a pipe or a file, removing a leading \ if present on an
1531   unqualified address. For pipes and files we must handle quoting. It's
1532   not quite clear exactly what to do for partially quoted things, but the
1533   common case of having the whole thing in quotes is straightforward. If this
1534   was the case, inquote will have been set TRUE above and the quotes removed.
1535
1536   There is a possible ambiguity over addresses whose local parts start with
1537   a vertical bar or a slash, and the latter do in fact occur, thanks to X.400.
1538   Consider a .forward file that contains the line
1539
1540      /X=xxx/Y=xxx/OU=xxx/@some.gate.way
1541
1542   Is this a file or an X.400 address? Does it make any difference if it is in
1543   quotes? On the grounds that file names of this type are rare, Exim treats
1544   something that parses as an RFC 822 address and has a domain as an address
1545   rather than a file or a pipe. This is also how an address such as the above
1546   would be treated if it came in from outside. */
1547
1548   else
1549     {
1550     int start, end, domain;
1551     uschar *recipient = NULL;

         /* Terminate the item in place for the duration of the parse; the
         saved byte is put back on every path that leaves this block. */

1552     int save = s[len];
1553     s[len] = 0;
1554
1555     /* If it starts with \ and the rest of it parses as a valid mail address
1556     without a domain, carry on with that address, but qualify it with the
1557     incoming domain. Otherwise arrange for the address to fall through,
1558     causing an error message on the re-parse. */
1559
1560     if (*s == '\\')
1561       {
1562       recipient =
1563         parse_extract_address(s+1, error, &start, &end, &domain, FALSE);
1564       if (recipient != NULL)
1565         recipient = (domain != 0)? NULL :
1566           string_sprintf("%s@%s", recipient, incoming_domain);
1567       }
1568
1569     /* Try parsing the item as an address. */
1570
1571     if (recipient == NULL) recipient =
1572       parse_extract_address(s, error, &start, &end, &domain, FALSE);
1573
1574     /* If item starts with / or | and is not a valid address, or there
1575     is no domain, treat it as a file or pipe. If it was a quoted item,
1576     remove the quoting occurrences of \ within it. */
1577
1578     if ((*s == '|' || *s == '/') && (recipient == NULL || domain == 0))
1579       {
1580       uschar *t = store_get(Ustrlen(s) + 1);
1581       uschar *p = t;
1582       uschar *q = s;
1583       while (*q != 0)
1584         {
1585         if (inquote)
1586           {
1587           *p++ = (*q == '\\')? *(++q) : *q;
1588           q++;
1589           }
1590         else *p++ = *q++;
1591         }
1592       *p = 0;
1593       addr = deliver_make_addr(t, TRUE);
1594       setflag(addr, af_pfr);                   /* indicates pipe/file/reply */
1595       if (*s != '|') setflag(addr, af_file);   /* indicates file */
1596       }
1597
1598     /* Item must be an address. Complain if not, else qualify, rewrite and set
1599     up the control block. It appears that people are in the habit of using
1600     empty addresses but with comments as a way of putting comments into
1601     alias and forward files. Therefore, ignore the error "empty address".
1602     Mailing lists might want to tolerate syntax errors; there is therefore
1603     an option to do so. */
1604
1605     else
1606       {
1607       if (recipient == NULL)
1608         {
1609         if (Ustrcmp(*error, "empty address") == 0)
1610           {
1611           *error = NULL;
1612           s[len] = save;
1613           s = nexts;
1614           continue;
1615           }
1616

             /* When syntax errors are being tolerated, append a record of this
             one (message in text1, offending item in text2) to the end of the
             caller's chain and move on to the next item. */

1617         if (syntax_errors != NULL)
1618           {
1619           error_block *e = store_get(sizeof(error_block));
1620           error_block *last = *syntax_errors;
1621           if (last == NULL) *syntax_errors = e; else
1622             {
1623             while (last->next != NULL) last = last->next;
1624             last->next = e;
1625             }
1626           e->next = NULL;
1627           e->text1 = *error;
1628           e->text2 = string_copy(s);
1629           s[len] = save;
1630           s = nexts;
1631           continue;
1632           }
1633         else
1634           {
1635           *error = string_sprintf("%s in \"%s\"", *error, s);
1636           s[len] = save;   /* _after_ using it for *error */
1637           return FF_ERROR;
1638           }
1639         }
1640
1641       /* Address was successfully parsed. Rewrite, and then make an address
1642       block. */
1643
1644       recipient = ((options & RDO_REWRITE) != 0)?
1645         rewrite_address(recipient, TRUE, FALSE, global_rewrite_rules,
1646           rewrite_existflags) :
1647         rewrite_address_qualify(recipient, TRUE);
1648       addr = deliver_make_addr(recipient, TRUE);  /* TRUE => copy recipient */
1649       }
1650
1651     /* Restore the final character in the original data, and add to the
1652     output chain. */
1653
1654     s[len] = save;
1655     addr->next = *anchor;
1656     *anchor = addr;
1657     count++;
1658     }
1659
1660   /* Advance pointer for the next address */
1661
1662   s = nexts;
1663   }
1664 }
1665
1666
1667
1668 /*************************************************
1669 *            Extract a Message-ID                *
1670 *************************************************/
1671
1672 /* This function is used to extract message ids from In-Reply-To: and
1673 References: header lines.
1674
1675 Arguments:
1676   str          pointer to the start of the message-id
1677   yield        put pointer to the message id (in dynamic memory) here
1678   error        put error message here on failure
1679
1680 Returns:       points after the processed message-id or NULL on error
1681 */
1682
1683 uschar *
1684 parse_message_id(uschar *str, uschar **yield, uschar **error)
1685 {
     /* Summary: after skipping any leading comment, expect '<', an addr-spec
     with a domain, and '>'. On success *yield is "<addr-spec>" in store
     obtained from store_get() and the return value points past the id and any
     following comment. On failure *error holds a message and NULL is
     returned. */

1686 uschar *domain = NULL;
1687 uschar *id;
1688
1689 str = skip_comment(str);
1690 if (*str != '<')
1691   {
1692   *error = US"Missing '<' before message-id";
1693   return NULL;
1694   }
1695
1696 /* Getting a block the size of the input string will definitely be sufficient
1697 for the answer, but it may also be very long if we are processing a header
1698 line. Therefore, take care to release unwanted store afterwards. */
1699

     /* The first byte copied is the opening '<'. */

1700 id = *yield = store_get(Ustrlen(str) + 1);
1701 *id++ = *str++;
1702
1703 str = read_addr_spec(str, id, '>', error, &domain);
1704

     /* NOTE(review): *error is tested here without having been set in this
     function; presumably read_addr_spec() always sets it (to NULL on
     success) - confirm against its definition. */

1705 if (*error == NULL)
1706   {
1707   if (*str != '>') *error = US"Missing '>' after message-id";
1708     else if (domain == NULL) *error = US"domain missing in message-id";
1709   }
1710

     /* On failure, give back the whole block that was obtained above. */

1711 if (*error != NULL)
1712   {
1713   store_reset(*yield);
1714   return NULL;
1715   }
1716

     /* Find the end of the addr-spec text at id (presumably zero-terminated by
     read_addr_spec() - confirm), append the closing '>' and a terminating
     zero, and release the unused remainder of the block. */

1717 while (*id != 0) id++;
1718 *id++ = *str++;
1719 *id++ = 0;
1720 store_reset(id);
1721
1722 str = skip_comment(str);
1723 return str;
1724 }
1725
1726
1727
1728
1729 /*************************************************
1730 **************************************************
1731 *             Stand-alone test program           *
1732 **************************************************
1733 *************************************************/
1734
1735 #if defined STAND_ALONE
1736 int main(void)
1737 {
     /* Test driver: standard input holds one group of lines for each function
     under test, in the order announced by the printf() calls below. An empty
     line ends each group. The last character of every line read (normally
     the newline) is removed before the line is used. */

1738 int start, end, domain;
1739 uschar buffer[1024];
1740 uschar outbuff[1024];
1741
1742 big_buffer = store_malloc(big_buffer_size);
1743
1744 /* strip_trailing_dot = TRUE; */
1745 allow_domain_literals = TRUE;
1746
1747 printf("Testing parse_fix_phrase\n");
1748
1749 while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
1750   {
1751   buffer[Ustrlen(buffer)-1] = 0;
1752   if (buffer[0] == 0) break;
1753   printf("%s\n", CS parse_fix_phrase(buffer, Ustrlen(buffer), outbuff,
1754     sizeof(outbuff)));
1755   }
1756
1757 printf("Testing parse_extract_address without group syntax and without UTF-8\n");
1758

     /* For each address, print the extracted address, the start, end and
     domain offsets, and the slice of the input between start and end. */

1759 while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
1760   {
1761   uschar *out;
1762   uschar *errmess;
1763   buffer[Ustrlen(buffer) - 1] = 0;
1764   if (buffer[0] == 0) break;
1765   out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE);
1766   if (out == NULL) printf("*** bad address: %s\n", errmess); else
1767     {
1768     uschar extract[1024];
1769     Ustrncpy(extract, buffer+start, end-start);
1770     extract[end-start] = 0;
1771     printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract);
1772     }
1773   }
1774
1775 printf("Testing parse_extract_address without group syntax but with UTF-8\n");
1776
1777 allow_utf8_domains = TRUE;
1778 while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
1779   {
1780   uschar *out;
1781   uschar *errmess;
1782   buffer[Ustrlen(buffer) - 1] = 0;
1783   if (buffer[0] == 0) break;
1784   out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE);
1785   if (out == NULL) printf("*** bad address: %s\n", errmess); else
1786     {
1787     uschar extract[1024];
1788     Ustrncpy(extract, buffer+start, end-start);
1789     extract[end-start] = 0;
1790     printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract);
1791     }
1792   }
1793 allow_utf8_domains = FALSE;
1794
1795 printf("Testing parse_extract_address with group syntax\n");
1796

     /* NOTE(review): the inner loop steps s through the addresses on the line
     but passes buffer, not s, to parse_extract_address(), so every call starts
     at the beginning of the line - confirm whether that is intended. It is
     left as it is here because the printed offsets are relative to buffer. */

1797 parse_allow_group = TRUE;
1798 while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
1799   {
1800   uschar *out;
1801   uschar *errmess;
1802   uschar *s;
1803   buffer[Ustrlen(buffer) - 1] = 0;
1804   if (buffer[0] == 0) break;
1805   s = buffer;
1806   while (*s != 0)
1807     {
1808     uschar *ss = parse_find_address_end(s, FALSE);
1809     int terminator = *ss;
1810     *ss = 0;
1811     out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE);
1812     *ss = terminator;
1813
1814     if (out == NULL) printf("*** bad address: %s\n", errmess); else
1815       {
1816       uschar extract[1024];
1817       Ustrncpy(extract, buffer+start, end-start);
1818       extract[end-start] = 0;
1819       printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract);
1820       }
1821
1822     s = ss + (terminator? 1:0);
1823     while (isspace(*s)) s++;
1824     }
1825   }
1826
1827 printf("Testing parse_find_at\n");
1828
1829 while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
1830   {
1831   uschar *s;
1832   buffer[Ustrlen(buffer)-1] = 0;
1833   if (buffer[0] == 0) break;
1834   s = parse_find_at(buffer);

       /* A pointer difference is not an int, so convert it explicitly to match
       the %d conversion; the value is always less than sizeof(buffer). */

1835   if (s == NULL) printf("no @ found\n");
1836     else printf("offset = %d\n", (int)(s - buffer));
1837   }
1838
1839 printf("Testing parse_extract_addresses\n");
1840

     /* The options argument is -1, so every option bit is set for these
     calls. */

1841 while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
1842   {
1843   uschar *errmess;
1844   int extracted;
1845   address_item *anchor = NULL;
1846   buffer[Ustrlen(buffer) - 1] = 0;
1847   if (buffer[0] == 0) break;
1848   if ((extracted = parse_forward_list(buffer, -1, &anchor,
1849       &errmess, US"incoming.domain", NULL, NULL)) == FF_DELIVERED)
1850     {
1851     while (anchor != NULL)
1852       {
1853       address_item *addr = anchor;
1854       anchor = anchor->next;
1855       printf("%d %s\n", testflag(addr, af_pfr), addr->address);
1856       }
1857     }
1858   else printf("Failed: %d %s\n", extracted, errmess);
1859   }
1860
1861 printf("Testing parse_message_id\n");
1862

     /* Each line may hold several message ids; they are extracted one after
     another until the end of the line or the first failure. */

1863 while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
1864   {
1865   uschar *s, *t, *errmess;
1866   buffer[Ustrlen(buffer) - 1] = 0;
1867   if (buffer[0] == 0) break;
1868   s = buffer;
1869   while (*s != 0)
1870     {
1871     s = parse_message_id(s, &t, &errmess);
1872     if (errmess != NULL)
1873       {
1874       printf("Failed: %s\n", errmess);
1875       break;
1876       }
1877     printf("%s\n", t);
1878     }
1879   }
1880
1881 return 0;
1882 }
1883
1884 #endif
1885
1886 /* End of parse.c */