src/src/pcre/pcre_compile.c

   1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.1 2005/06/15 08:57:10 ph10 Exp $ */
   2
   3 /*************************************************
   4 *      Perl-Compatible Regular Expressions       *
   5 *************************************************/
   6
   7 /* PCRE is a library of functions to support regular expressions whose syntax
   8 and semantics are as close as possible to those of the Perl 5 language.
   9
  10                        Written by Philip Hazel
  11            Copyright (c) 1997-2005 University of Cambridge
  12
  13 -----------------------------------------------------------------------------
  14 Redistribution and use in source and binary forms, with or without
  15 modification, are permitted provided that the following conditions are met:
  16
  17     * Redistributions of source code must retain the above copyright notice,
  18       this list of conditions and the following disclaimer.
  19
  20     * Redistributions in binary form must reproduce the above copyright
  21       notice, this list of conditions and the following disclaimer in the
  22       documentation and/or other materials provided with the distribution.
  23
  24     * Neither the name of the University of Cambridge nor the names of its
  25       contributors may be used to endorse or promote products derived from
  26       this software without specific prior written permission.
  27
  28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 POSSIBILITY OF SUCH DAMAGE.
  39 -----------------------------------------------------------------------------
  40 */
  41
  42
  43 /* This module contains the external function pcre_compile(), along with
  44 supporting internal functions that are not used by other modules. */
  45
  46
  47 #include "pcre_internal.h"
  48
  49
  50 /*************************************************
  51 *      Code parameters and static tables         *
  52 *************************************************/
  53
  54 /* Maximum number of items on the nested bracket stacks at compile time. This
  55 applies to the nesting of all kinds of parentheses. It does not limit
  56 un-nested, non-capturing parentheses. This number can be made bigger if
  57 necessary - it is used to dimension one int and one unsigned char vector at
  58 compile time. */
  59
  60 #define BRASTACK_SIZE 200
  61
  62
  63 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  64 are simple data values; negative values are for special things like \d and so
  65 on. Zero means further processing is needed (for things like \x), or the escape
  66 is invalid. */
  67
  68 #if !EBCDIC   /* This is the "normal" table for ASCII systems */
     /* check_escape() indexes this table with (c - '0') after checking that c
     lies in the range '0' to 'z', so the first entry is for '0' and the last
     is for 'z'. The 7 in the 'a' position is a plain data value. */
  69 static const short int escapes[] = {
  70      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
  71      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
  72    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
  73      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
  74 -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
  75 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
  76    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
  77      0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
  78 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
  79      0,      0, -ESC_z                                            /* x - z */
  80 };
  81
  82 #else         /* This is the "abnormal" table for EBCDIC systems */
     /* check_escape() indexes this table with (c - 0x48). The hexadecimal number
     in the comment at the start of each row is the character code that
     corresponds to the first entry of that row. */
  83 static const short int escapes[] = {
  84 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
  85 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
  86 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
  87 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
  88 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
  89 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
  90 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
  91 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
  92 /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
  93 /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,
  94 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
  95 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
  96 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
  97 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
  98 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
  99 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
 100 /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
 101 /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
 102 /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,
 103 /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
 104 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
 105 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 106 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 107 };
 108 #endif
 109
 110
 111 /* Tables of names of POSIX character classes and their lengths. The list is
 112 terminated by a zero length entry. The first three must be alpha, lower, upper,
 113 as this is assumed for handling case independence. */
 114
 115 static const char *const posix_names[] = {
     /* Fourteen names. The rows of posix_class_maps are commented with the same
     names in the same order, so the two tables must be kept in step. */
 116   "alpha", "lower", "upper",
 117   "alnum", "ascii", "blank", "cntrl", "digit", "graph",
 118   "print", "punct", "space", "word",  "xdigit" };
 119
     /* Entry n is the length of posix_names[n]. The extra fifteenth entry, 0, is
     the list terminator: posix_names has no terminating entry of its own, so a
     scan over the names has to be bounded by this table. */
 120 static const uschar posix_name_lengths[] = {
 121   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 122
 123 /* Table of class bit maps for each POSIX class; up to three may be combined
 124 to form the class. The table for [:blank:] is dynamically modified to remove
 125 the vertical space characters. */
 126
 127 static const int posix_class_maps[] = {
     /* Three entries per class, one row per name, in the same order as the
     posix_names table. The entries are cbit_xxx values; -1 appears to mark an
     unused slot (NOTE(review): confirm against the code that reads this). */
 128   cbit_lower, cbit_upper, -1,             /* alpha */
 129   cbit_lower, -1,         -1,             /* lower */
 130   cbit_upper, -1,         -1,             /* upper */
 131   cbit_digit, cbit_lower, cbit_upper,     /* alnum */
 132   cbit_print, cbit_cntrl, -1,             /* ascii */
 133   cbit_space, -1,         -1,             /* blank - a GNU extension */
 134   cbit_cntrl, -1,         -1,             /* cntrl */
 135   cbit_digit, -1,         -1,             /* digit */
 136   cbit_graph, -1,         -1,             /* graph */
 137   cbit_print, -1,         -1,             /* print */
 138   cbit_punct, -1,         -1,             /* punct */
 139   cbit_space, -1,         -1,             /* space */
 140   cbit_word,  -1,         -1,             /* word - a Perl extension */
 141   cbit_xdigit,-1,         -1              /* xdigit */
 142 };
 143
 144
 145 /* The texts of compile-time error messages. These are "char *" because they
 146 are passed to the outside world. */
 147
 148 static const char *error_texts[] = {
     /* The position of a text in this table is its error number; the numeric
     comments below give the index of the entry that follows them. The ERRnn
     codes set by the functions in this file are consistent with ERRnn selecting
     entry nn (for example ERR1 is set for a backslash at the end of the pattern),
     but the ERRnn definitions themselves are not in this file. Texts must
     therefore only be appended, or replaced in place, never inserted. */
 149   "no error",
 150   "\\ at end of pattern",
 151   "\\c at end of pattern",
 152   "unrecognized character follows \\",
 153   "numbers out of order in {} quantifier",
 154   /* 5 */
 155   "number too big in {} quantifier",
 156   "missing terminating ] for character class",
 157   "invalid escape sequence in character class",
 158   "range out of order in character class",
 159   "nothing to repeat",
 160   /* 10 */
 161   "operand of unlimited repeat could match the empty string",
 162   "internal error: unexpected repeat",
 163   "unrecognized character after (?",
 164   "POSIX named classes are supported only within a class",
 165   "missing )",
 166   /* 15 */
 167   "reference to non-existent subpattern",
 168   "erroffset passed as NULL",
 169   "unknown option bit(s) set",
 170   "missing ) after comment",
 171   "parentheses nested too deeply",
 172   /* 20 */
 173   "regular expression too large",
 174   "failed to get memory",
 175   "unmatched parentheses",
 176   "internal error: code overflow",
 177   "unrecognized character after (?<",
 178   /* 25 */
 179   "lookbehind assertion is not fixed length",
 180   "malformed number after (?(",
 181   "conditional group contains more than two branches",
 182   "assertion expected after (?(",
 183   "(?R or (?digits must be followed by )",
 184   /* 30 */
 185   "unknown POSIX class name",
 186   "POSIX collating elements are not supported",
 187   "this version of PCRE is not compiled with PCRE_UTF8 support",
 188   "spare error",
 189   "character value in \\x{...} sequence is too large",
 190   /* 35 */
 191   "invalid condition (?(0)",
 192   "\\C not allowed in lookbehind assertion",
 193   "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
 194   "number after (?C is > 255",
 195   "closing ) for (?C expected",
 196   /* 40 */
 197   "recursive call could loop indefinitely",
 198   "unrecognized character after (?P",
 199   "syntax error after (?P",
 200   "two named groups have the same name",
 201   "invalid UTF-8 string",
 202   /* 45 */
 203   "support for \\P, \\p, and \\X has not been compiled",
 204   "malformed \\P or \\p sequence",
 205   "unknown property name after \\P or \\p"
 206 };
 207
 208
 209 /* Table to identify digits and hex digits. This is used when compiling
 210 patterns. Note that the tables in chartables are dependent on the locale, and
 211 may mark arbitrary characters as digits - but the PCRE compiling code expects
 212 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
 213 a private table here. It costs 256 bytes, but it is a lot faster than doing
 214 character value tests (at least in some simple cases I timed), and in some
 215 applications one wants PCRE to compile efficiently as well as match
 216 efficiently.
 217
 218 For convenience, we use the same bit definitions as in chartables:
 219
 220   0x04   decimal digit
 221   0x08   hexadecimal digit
 222
 223 Then we can use ctype_digit and ctype_xdigit in the code. */
 224
 225 #if !EBCDIC    /* This is the "normal" case, for ASCII systems */
     /* Indexed directly by character value (256 entries). 0x0c, which is the
     decimal digit bit plus the hexadecimal digit bit, is set for '0' to '9';
     0x08, the hexadecimal digit bit alone, is set for 'A' to 'F' and 'a' to 'f'.
     Every other entry is zero. */
 226 static const unsigned char digitab[] =
 227   {
 228   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 229   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
 230   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 231   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 232   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
 233   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
 234   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
 235   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
 236   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
 237   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
 238   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
 239   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
 240   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
 241   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
 242   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
 243   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
 244   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 245   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 246   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 247   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 248   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 249   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 250   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 251   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 252   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 253   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 254   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 255   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 256   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 257   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 258   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 259   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 260
 261 #else          /* This is the "abnormal" case, for EBCDIC systems */
     /* The same two flag bits, laid out for EBCDIC character codes: the 0x08
     entries are at 0x81-0x86 and 0xC1-0xC6 (the rows labelled "128- g" and
     "{ - G"), and the 0x0c entries are at 0xF0-0xF9 (the rows labelled "0 - 7"
     and "8 -255"). */
 262 static const unsigned char digitab[] =
 263   {
 264   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
 265   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
 266   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
 267   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
 268   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
 269   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
 270   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
 271   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
 272   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
 273   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
 274   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
 275   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- ¬     */
 276   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
 277   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
 278   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
 279   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
 280   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
 281   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
 282   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
 283   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
 284   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
 285   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
 286   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
 287   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
 288   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
 289   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
 290   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
 291   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
 292   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
 293   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
 294   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
 295   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
 296
     /* check_escape() tests (ebcdic_chartab[c] & 0x0E) to decide whether the
     character after a backslash is alphameric. The meaning of the individual
     bits comes from the character tables that this partly duplicates, which are
     not part of this file. */
 297 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
 298   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
 299   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
 300   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
 301   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 302   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
 303   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
 304   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
 305   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
 306   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
 307   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
 308   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
 309   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- ¬  */
 310   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
 311   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
 312   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
 313   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
 314   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
 315   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
 316   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
 317   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
 318   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
 319   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
 320   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
 321   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 322   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
 323   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
 324   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
 325   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
 326   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
 327   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
 328   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
 329   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
 330 #endif
 331
 332
 333 /* Forward declaration to allow mutual recursion */
 334
 335 static BOOL
 336   compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
 337     int *, int *, branch_chain *, compile_data *);
 338
 339
 340
 341 /*************************************************
 342 *            Handle escapes                      *
 343 *************************************************/
 344
 345 /* This function is called when a \ has been encountered. It either returns a
 346 positive value for a simple escape such as \n, or a negative value which
 347 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
 348 a positive value greater than 255 may be returned. On entry, ptr is pointing at
 349 the \. On exit, it is on the final character of the escape sequence.
 350
 351 Arguments:
 352   ptrptr         points to the pattern position pointer
 353   errorcodeptr   points to the errorcode variable
 354   bracount       number of previous extracting brackets
 355   options        the options bits
 356   isclass        TRUE if inside a character class
 357
 358 Returns:         zero or positive => a data character
 359                  negative => a special escape sequence
 360                  on error, *errorcodeptr is set
 361 */
 362
 363 static int
 364 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
 365   int options, BOOL isclass)
 366 {
 367 const uschar *ptr = *ptrptr;
 368 int c, i;
 369
 370 /* If backslash is at the end of the pattern, it's an error. */
 371
 372 c = *(++ptr);
 373 if (c == 0) *errorcodeptr = ERR1;
 374
 375 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
 376 a table. A non-zero result is something that can be returned immediately.
 377 Otherwise further processing may be required. */
 378
 379 #if !EBCDIC    /* ASCII coding */
 380 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
 381 else if ((i = escapes[c - '0']) != 0) c = i;
 382
 383 #else          /* EBCDIC coding */
 384 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
 385 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 386 #endif
 387
 388 /* Escapes that need further processing, or are illegal. */
 389
 390 else
 391   {
 392   const uschar *oldptr;
 393   switch (c)
 394     {
 395     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 396     error. */
 397
 398     case 'l':
 399     case 'L':
 400     case 'N':
 401     case 'u':
 402     case 'U':
 403     *errorcodeptr = ERR37;
 404     break;
 405
 406     /* The handling of escape sequences consisting of a string of digits
 407     starting with one that is not zero is not straightforward. By experiment,
 408     the way Perl works seems to be as follows:
 409
 410     Outside a character class, the digits are read as a decimal number. If the
 411     number is less than 10, or if there are that many previous extracting
 412     left brackets, then it is a back reference. Otherwise, up to three octal
 413     digits are read to form an escaped byte. Thus \123 is likely to be octal
 414     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 415     value is greater than 377, the least significant 8 bits are taken. Inside a
 416     character class, \ followed by a digit is always an octal number. */
 417
 418     case '1': case '2': case '3': case '4': case '5':
 419     case '6': case '7': case '8': case '9':
 420
 421     if (!isclass)
 422       {
 423       oldptr = ptr;
 424       c -= '0';
 425       while ((digitab[ptr[1]] & ctype_digit) != 0)
 426         c = c * 10 + *(++ptr) - '0';
 427       if (c < 10 || c <= bracount)
 428         {
 429         c = -(ESC_REF + c);
 430         break;
 431         }
 432       ptr = oldptr;      /* Put the pointer back and fall through */
 433       }
 434
 435     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 436     generates a binary zero byte and treats the digit as a following literal.
 437     Thus we have to pull back the pointer by one. */
 438
 439     if ((c = *ptr) >= '8')
 440       {
 441       ptr--;
 442       c = 0;
 443       break;
 444       }
 445
 446     /* \0 always starts an octal number, but we may drop through to here with a
 447     larger first octal digit. */
 448
 449     case '0':
 450     c -= '0';
 451     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
 452         c = c * 8 + *(++ptr) - '0';
 453     c &= 255;     /* Take least significant 8 bits */
 454     break;
 455
 456     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
 457     which can be greater than 0xff, but only if the ddd are hex digits. */
 458
 459     case 'x':
 460 #ifdef SUPPORT_UTF8
 461     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
 462       {
 463       const uschar *pt = ptr + 2;
 464       register int count = 0;
 465       c = 0;
 466       while ((digitab[*pt] & ctype_xdigit) != 0)
 467         {
 468         int cc = *pt++;
 469         count++;
 470 #if !EBCDIC    /* ASCII coding */
 471         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
 472         c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
 473 #else          /* EBCDIC coding */
 474         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
 475         c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
 476 #endif
 477         }
 478       if (*pt == '}')
 479         {
 480         if (c < 0 || count > 8) *errorcodeptr = ERR34;
 481         ptr = pt;
 482         break;
 483         }
 484       /* If the sequence of hex digits does not end with '}', then we don't
 485       recognize this construct; fall through to the normal \x handling. */
 486       }
 487 #endif
 488
 489     /* Read just a single hex char */
 490
 491     c = 0;
 492     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
 493       {
 494       int cc;                               /* Some compilers don't like ++ */
 495       cc = *(++ptr);                        /* in initializers */
 496 #if !EBCDIC    /* ASCII coding */
 497       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
 498       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
 499 #else          /* EBCDIC coding */
 500       if (cc <= 'z') cc += 64;              /* Convert to upper case */
 501       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
 502 #endif
 503       }
 504     break;
 505
 506     /* Other special escapes not starting with a digit are straightforward */
 507
 508     case 'c':
 509     c = *(++ptr);
 510     if (c == 0)
 511       {
 512       *errorcodeptr = ERR2;
 513       return 0;
 514       }
 515
 516     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
 517     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
 518     (However, an EBCDIC equivalent has now been added.) */
 519
 520 #if !EBCDIC    /* ASCII coding */
 521     if (c >= 'a' && c <= 'z') c -= 32;
 522     c ^= 0x40;
 523 #else          /* EBCDIC coding */
 524     if (c >= 'a' && c <= 'z') c += 64;
 525     c ^= 0xC0;
 526 #endif
 527     break;
 528
 529     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 530     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
 531     for Perl compatibility, it is a literal. This code looks a bit odd, but
 532     there used to be some cases other than the default, and there may be again
 533     in future, so I haven't "optimized" it. */
 534
 535     default:
 536     if ((options & PCRE_EXTRA) != 0) switch(c)
 537       {
 538       default:
 539       *errorcodeptr = ERR3;
 540       break;
 541       }
 542     break;
 543     }
 544   }
 545
 546 *ptrptr = ptr;
 547 return c;
 548 }
 549
 550
 551
 552 #ifdef SUPPORT_UCP
 553 /*************************************************
 554 *               Handle \P and \p                 *
 555 *************************************************/
 556
 557 /* This function is called after \P or \p has been encountered, provided that
 558 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 559 pointing at the P or p. On exit, it is pointing at the final character of the
 560 escape sequence.
 561
 562 Arguments:
 563   ptrptr         points to the pattern position pointer
 564   negptr         points to a boolean that is set TRUE for negation else FALSE
 565   errorcodeptr   points to the error code variable
 566
 567 Returns:     value from the _pcre_utt table, or -1 with *errorcodeptr set on error
 568 */
 569
 570 static int
 571 get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
 572 {
 573 int c, i, bot, top;
 574 const uschar *ptr = *ptrptr;
 575 char name[4];
 576
 577 c = *(++ptr);
 578 if (c == 0) goto ERROR_RETURN;
 579
 580 *negptr = FALSE;
 581
 582 /* \P or \p can be followed by a one- or two-character name in {}, optionally
 583 preceded by ^ for negation. */
 584
 585 if (c == '{')
 586   {
 587   if (ptr[1] == '^')
 588     {
 589     *negptr = TRUE;
 590     ptr++;
 591     }
 592   for (i = 0; i <= 2; i++)
 593     {
 594     c = *(++ptr);
 595     if (c == 0) goto ERROR_RETURN;
 596     if (c == '}') break;
 597     name[i] = c;
 598     }
 599   if (c !='}')   /* Try to distinguish error cases */
 600     {
 601     while (*(++ptr) != 0 && *ptr != '}');
 602     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
 603     }
 604   name[i] = 0;
 605   }
 606
 607 /* Otherwise there is just one following character */
 608
 609 else
 610   {
 611   name[0] = c;
 612   name[1] = 0;
 613   }
 614
 615 *ptrptr = ptr;
 616
 617 /* Search for a recognized property name using binary chop */
 618
 619 bot = 0;
 620 top = _pcre_utt_size;
 621
 622 while (bot < top)
 623   {
 624   i = (bot + top)/2;
 625   c = strcmp(name, _pcre_utt[i].name);
 626   if (c == 0) return _pcre_utt[i].value;
 627   if (c > 0) bot = i + 1; else top = i;
 628   }
 629
 630 UNKNOWN_RETURN:
 631 *errorcodeptr = ERR47;
 632 *ptrptr = ptr;
 633 return -1;
 634
 635 ERROR_RETURN:
 636 *errorcodeptr = ERR46;
 637 *ptrptr = ptr;
 638 return -1;
 639 }
 640 #endif
 641
 642
 643
 644
 645 /*************************************************
 646 *            Check for counted repeat            *
 647 *************************************************/
 648
 649 /* This function is called when a '{' is encountered in a place where it might
 650 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 651 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 652 where the ddds are digits.
 653
 654 Arguments:
 655   p         pointer to the first char after '{'
 656
 657 Returns:    TRUE or FALSE
 658 */
 659
 660 static BOOL
 661 is_counted_repeat(const uschar *p)
 662 {
 663 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 664 while ((digitab[*p] & ctype_digit) != 0) p++;
 665 if (*p == '}') return TRUE;
 666
 667 if (*p++ != ',') return FALSE;
 668 if (*p == '}') return TRUE;
 669
 670 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 671 while ((digitab[*p] & ctype_digit) != 0) p++;
 672
 673 return (*p == '}');
 674 }
 675
 676
 677
 678 /*************************************************
 679 *         Read repeat counts                     *
 680 *************************************************/
 681
 682 /* Read an item of the form {n,m} and return the values. This is called only
 683 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 684 so the syntax is guaranteed to be correct, but we need to check the values.
 685
 686 Arguments:
 687   p              pointer to first char after '{'
 688   minp           pointer to int for min
 689   maxp           pointer to int for max
 690                  returned as -1 if no max
 691   errorcodeptr   points to error code variable
 692
 693 Returns:         pointer to '}' on success;
 694                  current ptr on error, with errorcodeptr set non-zero
 695 */
 696
 697 static const uschar *
 698 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
 699 {
 700 int min = 0;
 701 int max = -1;
 702
 703 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
 704
 705 if (*p == '}') max = min; else
 706   {
 707   if (*(++p) != '}')
 708     {
 709     max = 0;
 710     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
 711     if (max < min)
 712       {
 713       *errorcodeptr = ERR4;
 714       return p;
 715       }
 716     }
 717   }
 718
 719 /* Do paranoid checks, then fill in the required variables, and pass back the
 720 pointer to the terminating '}'. */
 721
 722 if (min > 65535 || max > 65535)
 723   *errorcodeptr = ERR5;
 724 else
 725   {
 726   *minp = min;
 727   *maxp = max;
 728   }
 729 return p;
 730 }
 731
 732
 733
 734 /*************************************************
 735 *      Find first significant op code            *
 736 *************************************************/
 737
 738 /* This is called by several functions that scan a compiled expression looking
 739 for a fixed first character, or an anchoring op code etc. It skips over things
 740 that do not influence this. For some calls, a change of option is important.
 741 For some calls, it makes sense to skip negative forward and all backward
 742 assertions, and also the \b assertion; for others it does not.
 743
 744 Arguments:
 745   code         pointer to the start of the group
 746   options      pointer to external options
 747   optbit       the option bit whose changing is significant, or
 748                  zero if none are
 749   skipassert   TRUE if certain assertions are to be skipped
 750
 751 Returns:       pointer to the first significant opcode
 752 */
 753
 754 static const uschar*
 755 first_significant_code(const uschar *code, int *options, int optbit,
 756   BOOL skipassert)
 757 {
 758 for (;;)
 759   {
 760   switch ((int)*code)
 761     {
 762     case OP_OPT:
 763     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
 764       *options = (int)code[1];
 765     code += 2;
 766     break;
 767
 768     case OP_ASSERT_NOT:
 769     case OP_ASSERTBACK:
 770     case OP_ASSERTBACK_NOT:
 771     if (!skipassert) return code;
 772     do code += GET(code, 1); while (*code == OP_ALT);
 773     code += _pcre_OP_lengths[*code];
 774     break;
 775
 776     case OP_WORD_BOUNDARY:
 777     case OP_NOT_WORD_BOUNDARY:
 778     if (!skipassert) return code;
 779     /* Fall through */
 780
 781     case OP_CALLOUT:
 782     case OP_CREF:
 783     case OP_BRANUMBER:
 784     code += _pcre_OP_lengths[*code];
 785     break;
 786
 787     default:
 788     return code;
 789     }
 790   }
 791 /* Control never reaches here */
 792 }
 793
 794
 795
 796
 797 /*************************************************
 798 *        Find the fixed length of a pattern      *
 799 *************************************************/
 800
 801 /* Scan a pattern and compute the fixed length of subject that will match it,
 802 if the length is fixed. This is needed for dealing with backward assertions.
 803 In UTF8 mode, the result is in characters rather than bytes.
 804
 805 Arguments:
 806   code     points to the start of the pattern (the bracket)
 807   options  the compiling options
 808
 809 Returns:   the fixed length, or -1 if there is no fixed length,
 810              or -2 if \C was encountered
 811 */
 812
 813 static int
 814 find_fixedlength(uschar *code, int options)
 815 {
 816 int length = -1;
 817
 818 register int branchlength = 0;
 819 register uschar *cc = code + 1 + LINK_SIZE;
 820
 821 /* Scan along the opcodes for this branch. If we get to the end of the
 822 branch, check the length against that of the other branches. */
 823
 824 for (;;)
 825   {
 826   int d;
 827   register int op = *cc;
 828   if (op >= OP_BRA) op = OP_BRA;
 829
 830   switch (op)
 831     {
 832     case OP_BRA:
 833     case OP_ONCE:
 834     case OP_COND:
 835     d = find_fixedlength(cc, options);
 836     if (d < 0) return d;
 837     branchlength += d;
 838     do cc += GET(cc, 1); while (*cc == OP_ALT);
 839     cc += 1 + LINK_SIZE;
 840     break;
 841
 842     /* Reached end of a branch; if it's a ket it is the end of a nested
 843     call. If it's ALT it is an alternation in a nested call. If it is
 844     END it's the end of the outer call. All can be handled by the same code. */
 845
 846     case OP_ALT:
 847     case OP_KET:
 848     case OP_KETRMAX:
 849     case OP_KETRMIN:
 850     case OP_END:
 851     if (length < 0) length = branchlength;
 852       else if (length != branchlength) return -1;
 853     if (*cc != OP_ALT) return length;
 854     cc += 1 + LINK_SIZE;
 855     branchlength = 0;
 856     break;
 857
 858     /* Skip over assertive subpatterns */
 859
 860     case OP_ASSERT:
 861     case OP_ASSERT_NOT:
 862     case OP_ASSERTBACK:
 863     case OP_ASSERTBACK_NOT:
 864     do cc += GET(cc, 1); while (*cc == OP_ALT);
 865     /* Fall through */
 866
 867     /* Skip over things that don't match chars */
 868
 869     case OP_REVERSE:
 870     case OP_BRANUMBER:
 871     case OP_CREF:
 872     case OP_OPT:
 873     case OP_CALLOUT:
 874     case OP_SOD:
 875     case OP_SOM:
 876     case OP_EOD:
 877     case OP_EODN:
 878     case OP_CIRC:
 879     case OP_DOLL:
 880     case OP_NOT_WORD_BOUNDARY:
 881     case OP_WORD_BOUNDARY:
 882     cc += _pcre_OP_lengths[*cc];
 883     break;
 884
 885     /* Handle literal characters */
 886
 887     case OP_CHAR:
 888     case OP_CHARNC:
 889     branchlength++;
 890     cc += 2;
 891 #ifdef SUPPORT_UTF8
 892     if ((options & PCRE_UTF8) != 0)
 893       {
 894       while ((*cc & 0xc0) == 0x80) cc++;
 895       }
 896 #endif
 897     break;
 898
 899     /* Handle exact repetitions. The count is already in characters, but we
 900     need to skip over a multibyte character in UTF8 mode.  */
 901
 902     case OP_EXACT:
 903     branchlength += GET2(cc,1);
 904     cc += 4;
 905 #ifdef SUPPORT_UTF8
 906     if ((options & PCRE_UTF8) != 0)
 907       {
 908       while((*cc & 0x80) == 0x80) cc++;
 909       }
 910 #endif
 911     break;
 912
 913     case OP_TYPEEXACT:
 914     branchlength += GET2(cc,1);
 915     cc += 4;
 916     break;
 917
 918     /* Handle single-char matchers */
 919
 920     case OP_PROP:
 921     case OP_NOTPROP:
 922     cc++;
 923     /* Fall through */
 924
 925     case OP_NOT_DIGIT:
 926     case OP_DIGIT:
 927     case OP_NOT_WHITESPACE:
 928     case OP_WHITESPACE:
 929     case OP_NOT_WORDCHAR:
 930     case OP_WORDCHAR:
 931     case OP_ANY:
 932     branchlength++;
 933     cc++;
 934     break;
 935
 936     /* The single-byte matcher isn't allowed */
 937
 938     case OP_ANYBYTE:
 939     return -2;
 940
 941     /* Check a class for variable quantification */
 942
 943 #ifdef SUPPORT_UTF8
 944     case OP_XCLASS:
 945     cc += GET(cc, 1) - 33;
 946     /* Fall through */
 947 #endif
 948
 949     case OP_CLASS:
 950     case OP_NCLASS:
 951     cc += 33;
 952
 953     switch (*cc)
 954       {
 955       case OP_CRSTAR:
 956       case OP_CRMINSTAR:
 957       case OP_CRQUERY:
 958       case OP_CRMINQUERY:
 959       return -1;
 960
 961       case OP_CRRANGE:
 962       case OP_CRMINRANGE:
 963       if (GET2(cc,1) != GET2(cc,3)) return -1;
 964       branchlength += GET2(cc,1);
 965       cc += 5;
 966       break;
 967
 968       default:
 969       branchlength++;
 970       }
 971     break;
 972
 973     /* Anything else is variable length */
 974
 975     default:
 976     return -1;
 977     }
 978   }
 979 /* Control never gets here */
 980 }
 981
 982
 983
 984
 985 /*************************************************
 986 *    Scan compiled regex for numbered bracket    *
 987 *************************************************/
 988
 989 /* This little function scans through a compiled pattern until it finds a
 990 capturing bracket with the given number.
 991
 992 Arguments:
 993   code        points to start of expression
 994   utf8        TRUE in UTF-8 mode
 995   number      the required bracket number
 996
 997 Returns:      pointer to the opcode for the bracket, or NULL if not found
 998 */
 999
1000 static const uschar *
1001 find_bracket(const uschar *code, BOOL utf8, int number)
1002 {
1003 #ifndef SUPPORT_UTF8
1004 utf8 = utf8;               /* Stop pedantic compilers complaining */
1005 #endif
1006
1007 for (;;)
1008   {
1009   register int c = *code;
1010   if (c == OP_END) return NULL;
1011   else if (c > OP_BRA)
1012     {
1013     int n = c - OP_BRA;
1014     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1015     if (n == number) return (uschar *)code;
1016     code += _pcre_OP_lengths[OP_BRA];
1017     }
1018   else
1019     {
1020     code += _pcre_OP_lengths[c];
1021
1022 #ifdef SUPPORT_UTF8
1023
1024     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1025     by a multi-byte character. The length in the table is a minimum, so we have
1026     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1027     can use relatively efficient code. */
1028
1029     if (utf8) switch(c)
1030       {
1031       case OP_CHAR:
1032       case OP_CHARNC:
1033       case OP_EXACT:
1034       case OP_UPTO:
1035       case OP_MINUPTO:
1036       case OP_STAR:
1037       case OP_MINSTAR:
1038       case OP_PLUS:
1039       case OP_MINPLUS:
1040       case OP_QUERY:
1041       case OP_MINQUERY:
1042       while ((*code & 0xc0) == 0x80) code++;
1043       break;
1044
1045       /* XCLASS is used for classes that cannot be represented just by a bit
1046       map. This includes negated single high-valued characters. The length in
1047       the table is zero; the actual length is stored in the compiled code. */
1048
1049       case OP_XCLASS:
1050       code += GET(code, 1) + 1;
1051       break;
1052       }
1053 #endif
1054     }
1055   }
1056 }
1057
1058
1059
1060 /*************************************************
1061 *   Scan compiled regex for recursion reference  *
1062 *************************************************/
1063
1064 /* This little function scans through a compiled pattern until it finds an
1065 instance of OP_RECURSE.
1066
1067 Arguments:
1068   code        points to start of expression
1069   utf8        TRUE in UTF-8 mode
1070
1071 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1072 */
1073
1074 static const uschar *
1075 find_recurse(const uschar *code, BOOL utf8)
1076 {
1077 #ifndef SUPPORT_UTF8
1078 utf8 = utf8;               /* Stop pedantic compilers complaining */
1079 #endif
1080
1081 for (;;)
1082   {
1083   register int c = *code;
1084   if (c == OP_END) return NULL;
1085   else if (c == OP_RECURSE) return code;
1086   else if (c > OP_BRA)
1087     {
1088     code += _pcre_OP_lengths[OP_BRA];
1089     }
1090   else
1091     {
1092     code += _pcre_OP_lengths[c];
1093
1094 #ifdef SUPPORT_UTF8
1095
1096     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1097     by a multi-byte character. The length in the table is a minimum, so we have
1098     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1099     can use relatively efficient code. */
1100
1101     if (utf8) switch(c)
1102       {
1103       case OP_CHAR:
1104       case OP_CHARNC:
1105       case OP_EXACT:
1106       case OP_UPTO:
1107       case OP_MINUPTO:
1108       case OP_STAR:
1109       case OP_MINSTAR:
1110       case OP_PLUS:
1111       case OP_MINPLUS:
1112       case OP_QUERY:
1113       case OP_MINQUERY:
1114       while ((*code & 0xc0) == 0x80) code++;
1115       break;
1116
1117       /* XCLASS is used for classes that cannot be represented just by a bit
1118       map. This includes negated single high-valued characters. The length in
1119       the table is zero; the actual length is stored in the compiled code. */
1120
1121       case OP_XCLASS:
1122       code += GET(code, 1) + 1;
1123       break;
1124       }
1125 #endif
1126     }
1127   }
1128 }
1129
1130
1131
1132 /*************************************************
1133 *    Scan compiled branch for non-emptiness      *
1134 *************************************************/
1135
1136 /* This function scans through a branch of a compiled pattern to see whether it
1137 can match the empty string or not. It is called only from could_be_empty()
1138 below. Note that first_significant_code() skips over assertions. If we hit an
1139 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1140 whose current branch will already have been scanned.
1141
1142 Arguments:
1143   code        points to start of search
1144   endcode     points to where to stop
1145   utf8        TRUE if in UTF8 mode
1146
1147 Returns:      TRUE if what is matched could be empty
1148 */
1149
1150 static BOOL
1151 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1152 {
1153 register int c;
1154 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1155      code < endcode;
1156      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1157   {
1158   const uschar *ccode;
1159
1160   c = *code;
1161
1162   if (c >= OP_BRA)
1163     {
1164     BOOL empty_branch;
1165     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1166
1167     /* Scan a closed bracket */
1168
1169     empty_branch = FALSE;
1170     do
1171       {
1172       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1173         empty_branch = TRUE;
1174       code += GET(code, 1);
1175       }
1176     while (*code == OP_ALT);
1177     if (!empty_branch) return FALSE;   /* All branches are non-empty */
1178     code += 1 + LINK_SIZE;
1179     c = *code;
1180     }
1181
1182   else switch (c)
1183     {
1184     /* Check for quantifiers after a class */
1185
1186 #ifdef SUPPORT_UTF8
1187     case OP_XCLASS:
1188     ccode = code + GET(code, 1);
1189     goto CHECK_CLASS_REPEAT;
1190 #endif
1191
1192     case OP_CLASS:
1193     case OP_NCLASS:
1194     ccode = code + 33;
1195
1196 #ifdef SUPPORT_UTF8
1197     CHECK_CLASS_REPEAT:
1198 #endif
1199
1200     switch (*ccode)
1201       {
1202       case OP_CRSTAR:            /* These could be empty; continue */
1203       case OP_CRMINSTAR:
1204       case OP_CRQUERY:
1205       case OP_CRMINQUERY:
1206       break;
1207
1208       default:                   /* Non-repeat => class must match */
1209       case OP_CRPLUS:            /* These repeats aren't empty */
1210       case OP_CRMINPLUS:
1211       return FALSE;
1212
1213       case OP_CRRANGE:
1214       case OP_CRMINRANGE:
1215       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1216       break;
1217       }
1218     break;
1219
1220     /* Opcodes that must match a character */
1221
1222     case OP_PROP:
1223     case OP_NOTPROP:
1224     case OP_EXTUNI:
1225     case OP_NOT_DIGIT:
1226     case OP_DIGIT:
1227     case OP_NOT_WHITESPACE:
1228     case OP_WHITESPACE:
1229     case OP_NOT_WORDCHAR:
1230     case OP_WORDCHAR:
1231     case OP_ANY:
1232     case OP_ANYBYTE:
1233     case OP_CHAR:
1234     case OP_CHARNC:
1235     case OP_NOT:
1236     case OP_PLUS:
1237     case OP_MINPLUS:
1238     case OP_EXACT:
1239     case OP_NOTPLUS:
1240     case OP_NOTMINPLUS:
1241     case OP_NOTEXACT:
1242     case OP_TYPEPLUS:
1243     case OP_TYPEMINPLUS:
1244     case OP_TYPEEXACT:
1245     return FALSE;
1246
1247     /* End of branch */
1248
1249     case OP_KET:
1250     case OP_KETRMAX:
1251     case OP_KETRMIN:
1252     case OP_ALT:
1253     return TRUE;
1254
1255     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
1256     followed by a multibyte character */
1257
1258 #ifdef SUPPORT_UTF8
1259     case OP_STAR:
1260     case OP_MINSTAR:
1261     case OP_QUERY:
1262     case OP_MINQUERY:
1263     case OP_UPTO:
1264     case OP_MINUPTO:
1265     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1266     break;
1267 #endif
1268     }
1269   }
1270
1271 return TRUE;
1272 }
1273
1274
1275
1276 /*************************************************
1277 *    Scan compiled regex for non-emptiness       *
1278 *************************************************/
1279
1280 /* This function is called to check for left recursive calls. We want to check
1281 the current branch of the current pattern to see if it could match the empty
1282 string. If it could, we must look outwards for branches at other levels,
1283 stopping when we pass beyond the bracket which is the subject of the recursion.
1284
1285 Arguments:
1286   code        points to start of the recursion
1287   endcode     points to where to stop (current RECURSE item)
1288   bcptr       points to the chain of current (unclosed) branch starts
1289   utf8        TRUE if in UTF-8 mode
1290
1291 Returns:      TRUE if what is matched could be empty
1292 */
1293
1294 static BOOL
1295 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1296   BOOL utf8)
1297 {
1298 while (bcptr != NULL && bcptr->current >= code)
1299   {
1300   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1301   bcptr = bcptr->outer;
1302   }
1303 return TRUE;
1304 }
1305
1306
1307
1308 /*************************************************
1309 *           Check for POSIX class syntax         *
1310 *************************************************/
1311
1312 /* This function is called when the sequence "[:" or "[." or "[=" is
1313 encountered in a character class. It checks whether this is followed by an
1314 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1315 ".]" or "=]".
1316
1317 Argument:
1318   ptr      pointer to the initial [
1319   endptr   where to return the end pointer
1320   cd       pointer to compile data
1321
1322 Returns:   TRUE or FALSE
1323 */
1324
1325 static BOOL
1326 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1327 {
1328 int terminator;          /* Don't combine these lines; the Solaris cc */
1329 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1330 if (*(++ptr) == '^') ptr++;
1331 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1332 if (*ptr == terminator && ptr[1] == ']')
1333   {
1334   *endptr = ptr;
1335   return TRUE;
1336   }
1337 return FALSE;
1338 }
1339
1340
1341
1342
1343 /*************************************************
1344 *          Check POSIX class name                *
1345 *************************************************/
1346
1347 /* This function is called to check the name given in a POSIX-style class entry
1348 such as [:alnum:].
1349
1350 Arguments:
1351   ptr        points to the first letter
1352   len        the length of the name
1353
1354 Returns:     a value representing the name, or -1 if unknown
1355 */
1356
1357 static int
1358 check_posix_name(const uschar *ptr, int len)
1359 {
1360 register int yield = 0;
1361 while (posix_name_lengths[yield] != 0)
1362   {
1363   if (len == posix_name_lengths[yield] &&
1364     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1365   yield++;
1366   }
1367 return -1;
1368 }
1369
1370
1371 /*************************************************
1372 *    Adjust OP_RECURSE items in repeated group   *
1373 *************************************************/
1374
1375 /* OP_RECURSE items contain an offset from the start of the regex to the group
1376 that is referenced. This means that groups can be replicated for fixed
1377 repetition simply by copying (because the recursion is allowed to refer to
1378 earlier groups that are outside the current group). However, when a group is
1379 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1380 it, after it has been compiled. This means that any OP_RECURSE items within it
1381 that refer to the group itself or any contained groups have to have their
1382 offsets adjusted. That is the job of this function. Before it is called, the
1383 partially compiled regex must be temporarily terminated with OP_END.
1384
1385 Arguments:
1386   group      points to the start of the group
1387   adjust     the amount by which the group is to be moved
1388   utf8       TRUE in UTF-8 mode
1389   cd         contains pointers to tables etc.
1390
1391 Returns:     nothing
1392 */
1393
1394 static void
1395 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1396 {
1397 uschar *ptr = group;
1398 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1399   {
1400   int offset = GET(ptr, 1);
1401   if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1402   ptr += 1 + LINK_SIZE;
1403   }
1404 }
1405
1406
1407
1408 /*************************************************
1409 *        Insert an automatic callout point       *
1410 *************************************************/
1411
1412 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1413 callout points before each pattern item.
1414
1415 Arguments:
1416   code           current code pointer
1417   ptr            current pattern pointer
1418   cd             pointers to tables etc
1419
1420 Returns:         new code pointer
1421 */
1422
1423 static uschar *
1424 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1425 {
1426 *code++ = OP_CALLOUT;
1427 *code++ = 255;
1428 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1429 PUT(code, LINK_SIZE, 0);                /* Default length */
1430 return code + 2*LINK_SIZE;
1431 }
1432
1433
1434
1435 /*************************************************
1436 *         Complete a callout item                *
1437 *************************************************/
1438
1439 /* A callout item contains the length of the next item in the pattern, which
1440 we can't fill in till after we have reached the relevant point. This is used
1441 for both automatic and manual callouts.
1442
1443 Arguments:
1444   previous_callout   points to previous callout item
1445   ptr                current pattern pointer
1446   cd                 pointers to tables etc
1447
1448 Returns:             nothing
1449 */
1450
1451 static void
1452 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1453 {
1454 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1455 PUT(previous_callout, 2 + LINK_SIZE, length);
1456 }
1457
1458
1459
1460 #ifdef SUPPORT_UCP
1461 /*************************************************
1462 *           Get othercase range                  *
1463 *************************************************/
1464
1465 /* This function is passed the start and end of a class range, in UTF-8 mode
1466 with UCP support. It searches up the characters, looking for internal ranges of
1467 characters in the "other" case. Each call returns the next one, updating the
1468 start address.
1469
1470 Arguments:
1471   cptr        points to starting character value; updated
1472   d           end value
1473   ocptr       where to put start of othercase range
1474   odptr       where to put end of othercase range
1475
1476 Yield:        TRUE when range returned; FALSE when no more
1477 */
1478
1479 static BOOL
1480 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1481 {
1482 int c, chartype, othercase, next;
1483
1484 for (c = *cptr; c <= d; c++)
1485   {
1486   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
1487     break;
1488   }
1489
1490 if (c > d) return FALSE;
1491
1492 *ocptr = othercase;
1493 next = othercase + 1;
1494
1495 for (++c; c <= d; c++)
1496   {
1497   if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
1498         othercase != next)
1499     break;
1500   next++;
1501   }
1502
1503 *odptr = next - 1;
1504 *cptr = c;
1505
1506 return TRUE;
1507 }
1508 #endif  /* SUPPORT_UCP */
1509
1510
1511 /*************************************************
1512 *           Compile one branch                   *
1513 *************************************************/
1514
1515 /* Scan the pattern, compiling it into the code vector. If the options are
1516 changed during the branch, the pointer is used to change the external options
1517 bits.
1518
1519 Arguments:
1520   optionsptr     pointer to the option bits
1521   brackets       points to number of extracting brackets used
1522   codeptr        points to the pointer to the current code point
1523   ptrptr         points to the current pattern pointer
1524   errorcodeptr   points to error code variable
1525   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1526   reqbyteptr     set to the last literal character required, else < 0
1527   bcptr          points to current branch chain
1528   cd             contains pointers to tables etc.
1529
1530 Returns:         TRUE on success
1531                  FALSE, with *errorcodeptr set non-zero on error
1532 */
1533
1534 static BOOL
1535 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1536   const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1537   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1538 {
1539 int repeat_type, op_type;
1540 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
1541 int bravalue = 0;
1542 int greedy_default, greedy_non_default;
1543 int firstbyte, reqbyte;
1544 int zeroreqbyte, zerofirstbyte;
1545 int req_caseopt, reqvary, tempreqvary;
1546 int condcount = 0;
1547 int options = *optionsptr;
1548 int after_manual_callout = 0;
1549 register int c;
1550 register uschar *code = *codeptr;
1551 uschar *tempcode;
1552 BOOL inescq = FALSE;
1553 BOOL groupsetfirstbyte = FALSE;
1554 const uschar *ptr = *ptrptr;
1555 const uschar *tempptr;
1556 uschar *previous = NULL;
1557 uschar *previous_callout = NULL;
1558 uschar classbits[32];
1559
1560 #ifdef SUPPORT_UTF8
1561 BOOL class_utf8;
1562 BOOL utf8 = (options & PCRE_UTF8) != 0;
1563 uschar *class_utf8data;
1564 uschar utf8_char[6];
1565 #else
1566 BOOL utf8 = FALSE;
1567 #endif
1568
1569 /* Set up the default and non-default settings for greediness */
1570
1571 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1572 greedy_non_default = greedy_default ^ 1;
1573
1574 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1575 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1576 matches a non-fixed char first char; reqbyte just remains unset if we never
1577 find one.
1578
1579 When we hit a repeat whose minimum is zero, we may have to adjust these values
1580 to take the zero repeat into account. This is implemented by setting them to
1581 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1582 item types that can be repeated set these backoff variables appropriately. */
1583
1584 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1585
1586 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1587 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1588 value > 255. It is added into the firstbyte or reqbyte variables to record the
1589 case status of the value. This is used only for ASCII characters. */
1590
1591 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1592
1593 /* Switch on next character until the end of the branch */
1594
1595 for (;; ptr++)
1596   {
1597   BOOL negate_class;
1598   BOOL possessive_quantifier;
1599   BOOL is_quantifier;
1600   int class_charcount;
1601   int class_lastchar;
1602   int newoptions;
1603   int recno;
1604   int skipbytes;
1605   int subreqbyte;
1606   int subfirstbyte;
1607   int mclength;
1608   uschar mcbuffer[8];
1609
1610   /* Next byte in the pattern */
1611
1612   c = *ptr;
1613
1614   /* If in \Q...\E, check for the end; if not, we have a literal */
1615
1616   if (inescq && c != 0)
1617     {
1618     if (c == '\\' && ptr[1] == 'E')
1619       {
1620       inescq = FALSE;
1621       ptr++;
1622       continue;
1623       }
1624     else
1625       {
1626       if (previous_callout != NULL)
1627         {
1628         complete_callout(previous_callout, ptr, cd);
1629         previous_callout = NULL;
1630         }
1631       if ((options & PCRE_AUTO_CALLOUT) != 0)
1632         {
1633         previous_callout = code;
1634         code = auto_callout(code, ptr, cd);
1635         }
1636       goto NORMAL_CHAR;
1637       }
1638     }
1639
1640   /* Fill in length of a previous callout, except when the next thing is
1641   a quantifier. */
1642
1643   is_quantifier = c == '*' || c == '+' || c == '?' ||
1644     (c == '{' && is_counted_repeat(ptr+1));
1645
1646   if (!is_quantifier && previous_callout != NULL &&
1647        after_manual_callout-- <= 0)
1648     {
1649     complete_callout(previous_callout, ptr, cd);
1650     previous_callout = NULL;
1651     }
1652
1653   /* In extended mode, skip white space and comments */
1654
1655   if ((options & PCRE_EXTENDED) != 0)
1656     {
1657     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1658     if (c == '#')
1659       {
1660       /* The space before the ; is to avoid a warning on a silly compiler
1661       on the Macintosh. */
1662       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1663       if (c != 0) continue;   /* Else fall through to handle end of string */
1664       }
1665     }
1666
1667   /* No auto callout for quantifiers. */
1668
1669   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1670     {
1671     previous_callout = code;
1672     code = auto_callout(code, ptr, cd);
1673     }
1674
1675   switch(c)
1676     {
1677     /* The branch terminates at end of string, |, or ). */
1678
1679     case 0:
1680     case '|':
1681     case ')':
1682     *firstbyteptr = firstbyte;
1683     *reqbyteptr = reqbyte;
1684     *codeptr = code;
1685     *ptrptr = ptr;
1686     return TRUE;
1687
1688     /* Handle single-character metacharacters. In multiline mode, ^ disables
1689     the setting of any following char as a first character. */
1690
1691     case '^':
1692     if ((options & PCRE_MULTILINE) != 0)
1693       {
1694       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1695       }
1696     previous = NULL;
1697     *code++ = OP_CIRC;
1698     break;
1699
1700     case '$':
1701     previous = NULL;
1702     *code++ = OP_DOLL;
1703     break;
1704
1705     /* There can never be a first char if '.' is first, whatever happens about
1706     repeats. The value of reqbyte doesn't change either. */
1707
1708     case '.':
1709     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1710     zerofirstbyte = firstbyte;
1711     zeroreqbyte = reqbyte;
1712     previous = code;
1713     *code++ = OP_ANY;
1714     break;
1715
1716     /* Character classes. If the included characters are all < 255 in value, we
1717     build a 32-byte bitmap of the permitted characters, except in the special
1718     case where there is only one such character. For negated classes, we build
1719     the map as usual, then invert it at the end. However, we use a different
1720     opcode so that data characters > 255 can be handled correctly.
1721
1722     If the class contains characters outside the 0-255 range, a different
1723     opcode is compiled. It may optionally have a bit map for characters < 256,
1724     but those above are are explicitly listed afterwards. A flag byte tells
1725     whether the bitmap is present, and whether this is a negated class or not.
1726     */
1727
1728     case '[':
1729     previous = code;
1730
1731     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1732     they are encountered at the top level, so we'll do that too. */
1733
1734     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1735         check_posix_syntax(ptr, &tempptr, cd))
1736       {
1737       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1738       goto FAILED;
1739       }
1740
1741     /* If the first character is '^', set the negation flag and skip it. */
1742
1743     if ((c = *(++ptr)) == '^')
1744       {
1745       negate_class = TRUE;
1746       c = *(++ptr);
1747       }
1748     else
1749       {
1750       negate_class = FALSE;
1751       }
1752
1753     /* Keep a count of chars with values < 256 so that we can optimize the case
1754     of just a single character (as long as it's < 256). For higher valued UTF-8
1755     characters, we don't yet do any optimization. */
1756
1757     class_charcount = 0;
1758     class_lastchar = -1;
1759
1760 #ifdef SUPPORT_UTF8
1761     class_utf8 = FALSE;                       /* No chars >= 256 */
1762     class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
1763 #endif
1764
1765     /* Initialize the 32-char bit map to all zeros. We have to build the
1766     map in a temporary bit of store, in case the class contains only 1
1767     character (< 256), because in that case the compiled code doesn't use the
1768     bit map. */
1769
1770     memset(classbits, 0, 32 * sizeof(uschar));
1771
1772     /* Process characters until ] is reached. By writing this as a "do" it
1773     means that an initial ] is taken as a data character. The first pass
1774     through the regex checked the overall syntax, so we don't need to be very
1775     strict here. At the start of the loop, c contains the first byte of the
1776     character. */
1777
1778     do
1779       {
1780 #ifdef SUPPORT_UTF8
1781       if (utf8 && c > 127)
1782         {                           /* Braces are required because the */
1783         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
1784         }
1785 #endif
1786
1787       /* Inside \Q...\E everything is literal except \E */
1788
1789       if (inescq)
1790         {
1791         if (c == '\\' && ptr[1] == 'E')
1792           {
1793           inescq = FALSE;
1794           ptr++;
1795           continue;
1796           }
1797         else goto LONE_SINGLE_CHARACTER;
1798         }
1799
1800       /* Handle POSIX class names. Perl allows a negation extension of the
1801       form [:^name:]. A square bracket that doesn't match the syntax is
1802       treated as a literal. We also recognize the POSIX constructions
1803       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1804       5.6 and 5.8 do. */
1805
1806       if (c == '[' &&
1807           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1808           check_posix_syntax(ptr, &tempptr, cd))
1809         {
1810         BOOL local_negate = FALSE;
1811         int posix_class, i;
1812         register const uschar *cbits = cd->cbits;
1813
1814         if (ptr[1] != ':')
1815           {
1816           *errorcodeptr = ERR31;
1817           goto FAILED;
1818           }
1819
1820         ptr += 2;
1821         if (*ptr == '^')
1822           {
1823           local_negate = TRUE;
1824           ptr++;
1825           }
1826
1827         posix_class = check_posix_name(ptr, tempptr - ptr);
1828         if (posix_class < 0)
1829           {
1830           *errorcodeptr = ERR30;
1831           goto FAILED;
1832           }
1833
1834         /* If matching is caseless, upper and lower are converted to
1835         alpha. This relies on the fact that the class table starts with
1836         alpha, lower, upper as the first 3 entries. */
1837
1838         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1839           posix_class = 0;
1840
1841         /* Or into the map we are building up to 3 of the static class
1842         tables, or their negations. The [:blank:] class sets up the same
1843         chars as the [:space:] class (all white space). We remove the vertical
1844         white space chars afterwards. */
1845
1846         posix_class *= 3;
1847         for (i = 0; i < 3; i++)
1848           {
1849           BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1850           int taboffset = posix_class_maps[posix_class + i];
1851           if (taboffset < 0) break;
1852           if (local_negate)
1853             {
1854             if (i == 0)
1855               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1856             else
1857               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1858             if (blankclass) classbits[1] |= 0x3c;
1859             }
1860           else
1861             {
1862             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1863             if (blankclass) classbits[1] &= ~0x3c;
1864             }
1865           }
1866
1867         ptr = tempptr + 1;
1868         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1869         continue;    /* End of POSIX syntax handling */
1870         }
1871
1872       /* Backslash may introduce a single character, or it may introduce one
1873       of the specials, which just set a flag. Escaped items are checked for
1874       validity in the pre-compiling pass. The sequence \b is a special case.
1875       Inside a class (and only there) it is treated as backspace. Elsewhere
1876       it marks a word boundary. Other escapes have preset maps ready to
1877       or into the one we are building. We assume they have more than one
1878       character in them, so set class_charcount bigger than one. */
1879
1880       if (c == '\\')
1881         {
1882         c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1883
1884         if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
1885         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
1886         else if (-c == ESC_Q)            /* Handle start of quoted string */
1887           {
1888           if (ptr[1] == '\\' && ptr[2] == 'E')
1889             {
1890             ptr += 2; /* avoid empty string */
1891             }
1892           else inescq = TRUE;
1893           continue;
1894           }
1895
1896         if (c < 0)
1897           {
1898           register const uschar *cbits = cd->cbits;
1899           class_charcount += 2;     /* Greater than 1 is what matters */
1900           switch (-c)
1901             {
1902             case ESC_d:
1903             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1904             continue;
1905
1906             case ESC_D:
1907             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1908             continue;
1909
1910             case ESC_w:
1911             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1912             continue;
1913
1914             case ESC_W:
1915             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1916             continue;
1917
1918             case ESC_s:
1919             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1920             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
1921             continue;
1922
1923             case ESC_S:
1924             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1925             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
1926             continue;
1927
1928 #ifdef SUPPORT_UCP
1929             case ESC_p:
1930             case ESC_P:
1931               {
1932               BOOL negated;
1933               int property = get_ucp(&ptr, &negated, errorcodeptr);
1934               if (property < 0) goto FAILED;
1935               class_utf8 = TRUE;
1936               *class_utf8data++ = ((-c == ESC_p) != negated)?
1937                 XCL_PROP : XCL_NOTPROP;
1938               *class_utf8data++ = property;
1939               class_charcount -= 2;   /* Not a < 256 character */
1940               }
1941             continue;
1942 #endif
1943
1944             /* Unrecognized escapes are faulted if PCRE is running in its
1945             strict mode. By default, for compatibility with Perl, they are
1946             treated as literals. */
1947
1948             default:
1949             if ((options & PCRE_EXTRA) != 0)
1950               {
1951               *errorcodeptr = ERR7;
1952               goto FAILED;
1953               }
1954             c = *ptr;              /* The final character */
1955             class_charcount -= 2;  /* Undo the default count from above */
1956             }
1957           }
1958
1959         /* Fall through if we have a single character (c >= 0). This may be
1960         > 256 in UTF-8 mode. */
1961
1962         }   /* End of backslash handling */
1963
1964       /* A single character may be followed by '-' to form a range. However,
1965       Perl does not permit ']' to be the end of the range. A '-' character
1966       here is treated as a literal. */
1967
1968       if (ptr[1] == '-' && ptr[2] != ']')
1969         {
1970         int d;
1971         ptr += 2;
1972
1973 #ifdef SUPPORT_UTF8
1974         if (utf8)
1975           {                           /* Braces are required because the */
1976           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
1977           }
1978         else
1979 #endif
1980         d = *ptr;  /* Not UTF-8 mode */
1981
1982         /* The second part of a range can be a single-character escape, but
1983         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1984         in such circumstances. */
1985
1986         if (d == '\\')
1987           {
1988           const uschar *oldptr = ptr;
1989           d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1990
1991           /* \b is backspace; \X is literal X; any other special means the '-'
1992           was literal */
1993
1994           if (d < 0)
1995             {
1996             if (d == -ESC_b) d = '\b';
1997             else if (d == -ESC_X) d = 'X'; else
1998               {
1999               ptr = oldptr - 2;
2000               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2001               }
2002             }
2003           }
2004
2005         /* The check that the two values are in the correct order happens in
2006         the pre-pass. Optimize one-character ranges */
2007
2008         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2009
2010         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2011         matching, we have to use an XCLASS with extra data items. Caseless
2012         matching for characters > 127 is available only if UCP support is
2013         available. */
2014
2015 #ifdef SUPPORT_UTF8
2016         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2017           {
2018           class_utf8 = TRUE;
2019
2020           /* With UCP support, we can find the other case equivalents of
2021           the relevant characters. There may be several ranges. Optimize how
2022           they fit with the basic range. */
2023
2024 #ifdef SUPPORT_UCP
2025           if ((options & PCRE_CASELESS) != 0)
2026             {
2027             int occ, ocd;
2028             int cc = c;
2029             int origd = d;
2030             while (get_othercase_range(&cc, origd, &occ, &ocd))
2031               {
2032               if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
2033
2034               if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
2035                 {                                  /* if there is overlap,   */
2036                 c = occ;                           /* noting that if occ < c */
2037                 continue;                          /* we can't have ocd > d  */
2038                 }                                  /* because a subrange is  */
2039               if (ocd > d && occ <= d + 1)         /* always shorter than    */
2040                 {                                  /* the basic range.       */
2041                 d = ocd;
2042                 continue;
2043                 }
2044
2045               if (occ == ocd)
2046                 {
2047                 *class_utf8data++ = XCL_SINGLE;
2048                 }
2049               else
2050                 {
2051                 *class_utf8data++ = XCL_RANGE;
2052                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2053                 }
2054               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2055               }
2056             }
2057 #endif  /* SUPPORT_UCP */
2058
2059           /* Now record the original range, possibly modified for UCP caseless
2060           overlapping ranges. */
2061
2062           *class_utf8data++ = XCL_RANGE;
2063           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2064           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2065
2066           /* With UCP support, we are done. Without UCP support, there is no
2067           caseless matching for UTF-8 characters > 127; we can use the bit map
2068           for the smaller ones. */
2069
2070 #ifdef SUPPORT_UCP
2071           continue;    /* With next character in the class */
2072 #else
2073           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2074
2075           /* Adjust upper limit and fall through to set up the map */
2076
2077           d = 127;
2078
2079 #endif  /* SUPPORT_UCP */
2080           }
2081 #endif  /* SUPPORT_UTF8 */
2082
2083         /* We use the bit map for all cases when not in UTF-8 mode; else
2084         ranges that lie entirely within 0-127 when there is UCP support; else
2085         for partial ranges without UCP support. */
2086
2087         for (; c <= d; c++)
2088           {
2089           classbits[c/8] |= (1 << (c&7));
2090           if ((options & PCRE_CASELESS) != 0)
2091             {
2092             int uc = cd->fcc[c];           /* flip case */
2093             classbits[uc/8] |= (1 << (uc&7));
2094             }
2095           class_charcount++;                /* in case a one-char range */
2096           class_lastchar = c;
2097           }
2098
2099         continue;   /* Go get the next char in the class */
2100         }
2101
2102       /* Handle a lone single character - we can get here for a normal
2103       non-escape char, or after \ that introduces a single character or for an
2104       apparent range that isn't. */
2105
2106       LONE_SINGLE_CHARACTER:
2107
2108       /* Handle a character that cannot go in the bit map */
2109
2110 #ifdef SUPPORT_UTF8
2111       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2112         {
2113         class_utf8 = TRUE;
2114         *class_utf8data++ = XCL_SINGLE;
2115         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2116
2117 #ifdef SUPPORT_UCP
2118         if ((options & PCRE_CASELESS) != 0)
2119           {
2120           int chartype;
2121           int othercase;
2122           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2123                othercase > 0)
2124             {
2125             *class_utf8data++ = XCL_SINGLE;
2126             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2127             }
2128           }
2129 #endif  /* SUPPORT_UCP */
2130
2131         }
2132       else
2133 #endif  /* SUPPORT_UTF8 */
2134
2135       /* Handle a single-byte character */
2136         {
2137         classbits[c/8] |= (1 << (c&7));
2138         if ((options & PCRE_CASELESS) != 0)
2139           {
2140           c = cd->fcc[c];   /* flip case */
2141           classbits[c/8] |= (1 << (c&7));
2142           }
2143         class_charcount++;
2144         class_lastchar = c;
2145         }
2146       }
2147
2148     /* Loop until ']' reached; the check for end of string happens inside the
2149     loop. This "while" is the end of the "do" above. */
2150
2151     while ((c = *(++ptr)) != ']' || inescq);
2152
2153     /* If class_charcount is 1, we saw precisely one character whose value is
2154     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2155     can optimize the negative case only if there were no characters >= 128
2156     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2157     single-bytes only. This is an historical hangover. Maybe one day we can
2158     tidy these opcodes to handle multi-byte characters.
2159
2160     The optimization throws away the bit map. We turn the item into a
2161     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2162     that OP_NOT does not support multibyte characters. In the positive case, it
2163     can cause firstbyte to be set. Otherwise, there can be no first char if
2164     this item is first, whatever repeat count may follow. In the case of
2165     reqbyte, save the previous value for reinstating. */
2166
2167 #ifdef SUPPORT_UTF8
2168     if (class_charcount == 1 &&
2169           (!utf8 ||
2170           (!class_utf8 && (!negate_class || class_lastchar < 128))))
2171
2172 #else
2173     if (class_charcount == 1)
2174 #endif
2175       {
2176       zeroreqbyte = reqbyte;
2177
2178       /* The OP_NOT opcode works on one-byte characters only. */
2179
2180       if (negate_class)
2181         {
2182         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2183         zerofirstbyte = firstbyte;
2184         *code++ = OP_NOT;
2185         *code++ = class_lastchar;
2186         break;
2187         }
2188
2189       /* For a single, positive character, get the value into mcbuffer, and
2190       then we can handle this with the normal one-character code. */
2191
2192 #ifdef SUPPORT_UTF8
2193       if (utf8 && class_lastchar > 127)
2194         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2195       else
2196 #endif
2197         {
2198         mcbuffer[0] = class_lastchar;
2199         mclength = 1;
2200         }
2201       goto ONE_CHAR;
2202       }       /* End of 1-char optimization */
2203
2204     /* The general case - not the one-char optimization. If this is the first
2205     thing in the branch, there can be no first char setting, whatever the
2206     repeat count. Any reqbyte setting must remain unchanged after any kind of
2207     repeat. */
2208
2209     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2210     zerofirstbyte = firstbyte;
2211     zeroreqbyte = reqbyte;
2212
2213     /* If there are characters with values > 255, we have to compile an
2214     extended class, with its own opcode. If there are no characters < 256,
2215     we can omit the bitmap. */
2216
2217 #ifdef SUPPORT_UTF8
2218     if (class_utf8)
2219       {
2220       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
2221       *code++ = OP_XCLASS;
2222       code += LINK_SIZE;
2223       *code = negate_class? XCL_NOT : 0;
2224
2225       /* If the map is required, install it, and move on to the end of
2226       the extra data */
2227
2228       if (class_charcount > 0)
2229         {
2230         *code++ |= XCL_MAP;
2231         memcpy(code, classbits, 32);
2232         code = class_utf8data;
2233         }
2234
2235       /* If the map is not required, slide down the extra data. */
2236
2237       else
2238         {
2239         int len = class_utf8data - (code + 33);
2240         memmove(code + 1, code + 33, len);
2241         code += len + 1;
2242         }
2243
2244       /* Now fill in the complete length of the item */
2245
2246       PUT(previous, 1, code - previous);
2247       break;   /* End of class handling */
2248       }
2249 #endif
2250
2251     /* If there are no characters > 255, negate the 32-byte map if necessary,
2252     and copy it into the code vector. If this is the first thing in the branch,
2253     there can be no first char setting, whatever the repeat count. Any reqbyte
2254     setting must remain unchanged after any kind of repeat. */
2255
2256     if (negate_class)
2257       {
2258       *code++ = OP_NCLASS;
2259       for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2260       }
2261     else
2262       {
2263       *code++ = OP_CLASS;
2264       memcpy(code, classbits, 32);
2265       }
2266     code += 32;
2267     break;
2268
2269     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2270     has been tested above. */
2271
2272     case '{':
2273     if (!is_quantifier) goto NORMAL_CHAR;
2274     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2275     if (*errorcodeptr != 0) goto FAILED;
2276     goto REPEAT;
2277
2278     case '*':
2279     repeat_min = 0;
2280     repeat_max = -1;
2281     goto REPEAT;
2282
2283     case '+':
2284     repeat_min = 1;
2285     repeat_max = -1;
2286     goto REPEAT;
2287
2288     case '?':
2289     repeat_min = 0;
2290     repeat_max = 1;
2291
2292     REPEAT:
2293     if (previous == NULL)
2294       {
2295       *errorcodeptr = ERR9;
2296       goto FAILED;
2297       }
2298
2299     if (repeat_min == 0)
2300       {
2301       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2302       reqbyte = zeroreqbyte;        /* Ditto */
2303       }
2304
2305     /* Remember whether this is a variable length repeat */
2306
2307     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2308
2309     op_type = 0;                    /* Default single-char op codes */
2310     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2311
2312     /* Save start of previous item, in case we have to move it up to make space
2313     for an inserted OP_ONCE for the additional '+' extension. */
2314
2315     tempcode = previous;
2316
2317     /* If the next character is '+', we have a possessive quantifier. This
2318     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2319     If the next character is '?' this is a minimizing repeat, by default,
2320     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2321     repeat type to the non-default. */
2322
2323     if (ptr[1] == '+')
2324       {
2325       repeat_type = 0;                  /* Force greedy */
2326       possessive_quantifier = TRUE;
2327       ptr++;
2328       }
2329     else if (ptr[1] == '?')
2330       {
2331       repeat_type = greedy_non_default;
2332       ptr++;
2333       }
2334     else repeat_type = greedy_default;
2335
2336     /* If previous was a recursion, we need to wrap it inside brackets so that
2337     it can be replicated if necessary. */
2338
2339     if (*previous == OP_RECURSE)
2340       {
2341       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2342       code += 1 + LINK_SIZE;
2343       *previous = OP_BRA;
2344       PUT(previous, 1, code - previous);
2345       *code = OP_KET;
2346       PUT(code, 1, code - previous);
2347       code += 1 + LINK_SIZE;
2348       }
2349
2350     /* If previous was a character match, abolish the item and generate a
2351     repeat item instead. If a char item has a minimum of more than one, ensure
2352     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2353     the first thing in a branch because the x will have gone into firstbyte
2354     instead.  */
2355
2356     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2357       {
2358       /* Deal with UTF-8 characters that take up more than one byte. It's
2359       easier to write this out separately than try to macrify it. Use c to
2360       hold the length of the character in bytes, plus 0x80 to flag that it's a
2361       length rather than a small character. */
2362
2363 #ifdef SUPPORT_UTF8
2364       if (utf8 && (code[-1] & 0x80) != 0)
2365         {
2366         uschar *lastchar = code - 1;
2367         while((*lastchar & 0xc0) == 0x80) lastchar--;
2368         c = code - lastchar;            /* Length of UTF-8 character */
2369         memcpy(utf8_char, lastchar, c); /* Save the char */
2370         c |= 0x80;                      /* Flag c as a length */
2371         }
2372       else
2373 #endif
2374
2375       /* Handle the case of a single byte - either with no UTF8 support, or
2376       with UTF-8 disabled, or for a UTF-8 character < 128. */
2377
2378         {
2379         c = code[-1];
2380         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2381         }
2382
2383       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2384       }
2385
2386     /* If previous was a single negated character ([^a] or similar), we use
2387     one of the special opcodes, replacing it. The code is shared with single-
2388     character repeats by setting op_type to add a suitable offset into
2389     repeat_type. OP_NOT is currently used only for single-byte chars. */
2390
2391     else if (*previous == OP_NOT)
2392       {
2393       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2394       c = previous[1];
2395       goto OUTPUT_SINGLE_REPEAT;
2396       }
2397
2398     /* If previous was a character type match (\d or similar), abolish it and
2399     create a suitable repeat item. The code is shared with single-character
2400     repeats by setting op_type to add a suitable offset into repeat_type. Note
2401     that the Unicode property types will be present only when SUPPORT_UCP is
2402     defined, but we don't wrap the little bits of code here because it just
2403     makes it horribly messy. */
2404
2405     else if (*previous < OP_EODN)
2406       {
2407       uschar *oldcode;
2408       int prop_type;
2409       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2410       c = *previous;
2411
2412       OUTPUT_SINGLE_REPEAT:
2413       prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2414         previous[1] : -1;
2415
2416       oldcode = code;
2417       code = previous;                  /* Usually overwrite previous item */
2418
2419       /* If the maximum is zero then the minimum must also be zero; Perl allows
2420       this case, so we do too - by simply omitting the item altogether. */
2421
2422       if (repeat_max == 0) goto END_REPEAT;
2423
2424       /* All real repeats make it impossible to handle partial matching (maybe
2425       one day we will be able to remove this restriction). */
2426
2427       if (repeat_max != 1) cd->nopartial = TRUE;
2428
2429       /* Combine the op_type with the repeat_type */
2430
2431       repeat_type += op_type;
2432
2433       /* A minimum of zero is handled either as the special case * or ?, or as
2434       an UPTO, with the maximum given. */
2435
2436       if (repeat_min == 0)
2437         {
2438         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2439           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2440         else
2441           {
2442           *code++ = OP_UPTO + repeat_type;
2443           PUT2INC(code, 0, repeat_max);
2444           }
2445         }
2446
2447       /* A repeat minimum of 1 is optimized into some special cases. If the
2448     maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2449       left in place and, if the maximum is greater than 1, we use OP_UPTO with
2450       one less than the maximum. */
2451
2452       else if (repeat_min == 1)
2453         {
2454         if (repeat_max == -1)
2455           *code++ = OP_PLUS + repeat_type;
2456         else
2457           {
2458           code = oldcode;                 /* leave previous item in place */
2459           if (repeat_max == 1) goto END_REPEAT;
2460           *code++ = OP_UPTO + repeat_type;
2461           PUT2INC(code, 0, repeat_max - 1);
2462           }
2463         }
2464
2465       /* The case {n,n} is just an EXACT, while the general case {n,m} is
2466       handled as an EXACT followed by an UPTO. */
2467
2468       else
2469         {
2470         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
2471         PUT2INC(code, 0, repeat_min);
2472
2473         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2474         we have to insert the character for the previous code. For a repeated
2475         Unicode property match, there is an extra byte that defines the
2476         required property. In UTF-8 mode, long characters have their length in
2477         c, with the 0x80 bit as a flag. */
2478
2479         if (repeat_max < 0)
2480           {
2481 #ifdef SUPPORT_UTF8
2482           if (utf8 && c >= 128)
2483             {
2484             memcpy(code, utf8_char, c & 7);
2485             code += c & 7;
2486             }
2487           else
2488 #endif
2489             {
2490             *code++ = c;
2491             if (prop_type >= 0) *code++ = prop_type;
2492             }
2493           *code++ = OP_STAR + repeat_type;
2494           }
2495
2496         /* Else insert an UPTO if the max is greater than the min, again
2497         preceded by the character, for the previously inserted code. */
2498
2499         else if (repeat_max != repeat_min)
2500           {
2501 #ifdef SUPPORT_UTF8
2502           if (utf8 && c >= 128)
2503             {
2504             memcpy(code, utf8_char, c & 7);
2505             code += c & 7;
2506             }
2507           else
2508 #endif
2509           *code++ = c;
2510           if (prop_type >= 0) *code++ = prop_type;
2511           repeat_max -= repeat_min;
2512           *code++ = OP_UPTO + repeat_type;
2513           PUT2INC(code, 0, repeat_max);
2514           }
2515         }
2516
2517       /* The character or character type itself comes last in all cases. */
2518
2519 #ifdef SUPPORT_UTF8
2520       if (utf8 && c >= 128)
2521         {
2522         memcpy(code, utf8_char, c & 7);
2523         code += c & 7;
2524         }
2525       else
2526 #endif
2527       *code++ = c;
2528
2529       /* For a repeated Unicode property match, there is an extra byte that
2530       defines the required property. */
2531
2532 #ifdef SUPPORT_UCP
2533       if (prop_type >= 0) *code++ = prop_type;
2534 #endif
2535       }
2536
2537     /* If previous was a character class or a back reference, we put the repeat
2538     stuff after it, but just skip the item if the repeat was {0,0}. */
2539
2540     else if (*previous == OP_CLASS ||
2541              *previous == OP_NCLASS ||
2542 #ifdef SUPPORT_UTF8
2543              *previous == OP_XCLASS ||
2544 #endif
2545              *previous == OP_REF)
2546       {
2547       if (repeat_max == 0)
2548         {
2549         code = previous;
2550         goto END_REPEAT;
2551         }
2552
2553       /* All real repeats make it impossible to handle partial matching (maybe
2554       one day we will be able to remove this restriction). */
2555
2556       if (repeat_max != 1) cd->nopartial = TRUE;
2557
2558       if (repeat_min == 0 && repeat_max == -1)
2559         *code++ = OP_CRSTAR + repeat_type;
2560       else if (repeat_min == 1 && repeat_max == -1)
2561         *code++ = OP_CRPLUS + repeat_type;
2562       else if (repeat_min == 0 && repeat_max == 1)
2563         *code++ = OP_CRQUERY + repeat_type;
2564       else
2565         {
2566         *code++ = OP_CRRANGE + repeat_type;
2567         PUT2INC(code, 0, repeat_min);
2568         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
2569         PUT2INC(code, 0, repeat_max);
2570         }
2571       }
2572
2573     /* If previous was a bracket group, we may have to replicate it in certain
2574     cases. */
2575
2576     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2577              *previous == OP_COND)
2578       {
2579       register int i;
2580       int ketoffset = 0;
2581       int len = code - previous;
2582       uschar *bralink = NULL;
2583
2584       /* If the maximum repeat count is unlimited, find the end of the bracket
2585       by scanning through from the start, and compute the offset back to it
2586       from the current code pointer. There may be an OP_OPT setting following
2587       the final KET, so we can't find the end just by going back from the code
2588       pointer. */
2589
2590       if (repeat_max == -1)
2591         {
2592         register uschar *ket = previous;
2593         do ket += GET(ket, 1); while (*ket != OP_KET);
2594         ketoffset = code - ket;
2595         }
2596
2597       /* The case of a zero minimum is special because of the need to stick
2598       OP_BRAZERO in front of it, and because the group appears once in the
2599       data, whereas in other cases it appears the minimum number of times. For
2600       this reason, it is simplest to treat this case separately, as otherwise
2601       the code gets far too messy. There are several special subcases when the
2602       minimum is zero. */
2603
2604       if (repeat_min == 0)
2605         {
2606         /* If the maximum is also zero, we just omit the group from the output
2607         altogether. */
2608
2609         if (repeat_max == 0)
2610           {
2611           code = previous;
2612           goto END_REPEAT;
2613           }
2614
2615         /* If the maximum is 1 or unlimited, we just have to stick in the
2616         BRAZERO and do no more at this point. However, we do need to adjust
2617         any OP_RECURSE calls inside the group that refer to the group itself or
2618         any internal group, because the offset is from the start of the whole
2619         regex. Temporarily terminate the pattern while doing this. */
2620
2621         if (repeat_max <= 1)
2622           {
2623           *code = OP_END;
2624           adjust_recurse(previous, 1, utf8, cd);
2625           memmove(previous+1, previous, len);
2626           code++;
2627           *previous++ = OP_BRAZERO + repeat_type;
2628           }
2629
2630         /* If the maximum is greater than 1 and limited, we have to replicate
2631         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2632         The first one has to be handled carefully because it's the original
2633         copy, which has to be moved up. The remainder can be handled by code
2634         that is common with the non-zero minimum case below. We have to
2635     adjust the value of repeat_max, since one less copy is required. Once
2636         again, we may have to adjust any OP_RECURSE calls inside the group. */
2637
2638         else
2639           {
2640           int offset;
2641           *code = OP_END;
2642           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2643           memmove(previous + 2 + LINK_SIZE, previous, len);
2644           code += 2 + LINK_SIZE;
2645           *previous++ = OP_BRAZERO + repeat_type;
2646           *previous++ = OP_BRA;
2647
2648           /* We chain together the bracket offset fields that have to be
2649           filled in later when the ends of the brackets are reached. */
2650
2651           offset = (bralink == NULL)? 0 : previous - bralink;
2652           bralink = previous;
2653           PUTINC(previous, 0, offset);
2654           }
2655
2656         repeat_max--;
2657         }
2658
2659       /* If the minimum is greater than zero, replicate the group as many
2660       times as necessary, and adjust the maximum to the number of subsequent
2661       copies that we need. If we set a first char from the group, and didn't
2662       set a required char, copy the latter from the former. */
2663
2664       else
2665         {
2666         if (repeat_min > 1)
2667           {
2668           if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2669           for (i = 1; i < repeat_min; i++)
2670             {
2671             memcpy(code, previous, len);
2672             code += len;
2673             }
2674           }
2675         if (repeat_max > 0) repeat_max -= repeat_min;
2676         }
2677
2678       /* This code is common to both the zero and non-zero minimum cases. If
2679       the maximum is limited, it replicates the group in a nested fashion,
2680       remembering the bracket starts on a stack. In the case of a zero minimum,
2681       the first one was set up above. In all cases the repeat_max now specifies
2682       the number of additional copies needed. */
2683
2684       if (repeat_max >= 0)
2685         {
2686         for (i = repeat_max - 1; i >= 0; i--)
2687           {
2688           *code++ = OP_BRAZERO + repeat_type;
2689
2690           /* All but the final copy start a new nesting, maintaining the
2691           chain of brackets outstanding. */
2692
2693           if (i != 0)
2694             {
2695             int offset;
2696             *code++ = OP_BRA;
2697             offset = (bralink == NULL)? 0 : code - bralink;
2698             bralink = code;
2699             PUTINC(code, 0, offset);
2700             }
2701
2702           memcpy(code, previous, len);
2703           code += len;
2704           }
2705
2706         /* Now chain through the pending brackets, and fill in their length
2707         fields (which are holding the chain links pro tem). */
2708
2709         while (bralink != NULL)
2710           {
2711           int oldlinkoffset;
2712           int offset = code - bralink + 1;
2713           uschar *bra = code - offset;
2714           oldlinkoffset = GET(bra, 1);
2715           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2716           *code++ = OP_KET;
2717           PUTINC(code, 0, offset);
2718           PUT(bra, 1, offset);
2719           }
2720         }
2721
2722       /* If the maximum is unlimited, set a repeater in the final copy. We
2723       can't just offset backwards from the current code point, because we
2724       don't know if there's been an options resetting after the ket. The
2725       correct offset was computed above. */
2726
2727       else code[-ketoffset] = OP_KETRMAX + repeat_type;
2728       }
2729
2730     /* Else there's some kind of shambles */
2731
2732     else
2733       {
2734       *errorcodeptr = ERR11;
2735       goto FAILED;
2736       }
2737
2738     /* If the character following a repeat is '+', we wrap the entire repeated
2739     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2740     Sun's Java package. The repeated item starts at tempcode, not at previous,
2741     which might be the first part of a string whose (former) last char we
2742     repeated. However, we don't support '+' after a greediness '?'. */
2743
2744     if (possessive_quantifier)
2745       {
2746       int len = code - tempcode;
2747       memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2748       code += 1 + LINK_SIZE;
2749       len += 1 + LINK_SIZE;
2750       tempcode[0] = OP_ONCE;
2751       *code++ = OP_KET;
2752       PUTINC(code, 0, len);
2753       PUT(tempcode, 1, len);
2754       }
2755
2756     /* In all cases we no longer have a previous item. We also set the
2757     "follows varying string" flag for subsequently encountered reqbytes if
2758     it isn't already set and we have just passed a varying length item. */
2759
2760     END_REPEAT:
2761     previous = NULL;
2762     cd->req_varyopt |= reqvary;
2763     break;
2764
2765
2766     /* Start of nested bracket sub-expression, or comment or lookahead or
2767     lookbehind or option setting or condition. First deal with special things
2768     that can come after a bracket; all are introduced by ?, and the appearance
2769     of any of them means that this is not a referencing group. They were
2770     checked for validity in the first pass over the string, so we don't have to
2771     check for syntax errors here.  */
2772
2773     case '(':
2774     newoptions = options;
2775     skipbytes = 0;
2776
2777     if (*(++ptr) == '?')
2778       {
2779       int set, unset;
2780       int *optset;
2781
2782       switch (*(++ptr))
2783         {
2784         case '#':                 /* Comment; skip to ket */
2785         ptr++;
2786         while (*ptr != ')') ptr++;
2787         continue;
2788
2789         case ':':                 /* Non-extracting bracket */
2790         bravalue = OP_BRA;
2791         ptr++;
2792         break;
2793
2794         case '(':
2795         bravalue = OP_COND;       /* Conditional group */
2796
2797         /* Condition to test for recursion */
2798
2799         if (ptr[1] == 'R')
2800           {
2801           code[1+LINK_SIZE] = OP_CREF;
2802           PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2803           skipbytes = 3;
2804           ptr += 3;
2805           }
2806
2807         /* Condition to test for a numbered subpattern match. We know that
2808         if a digit follows ( then there will just be digits until ) because
2809         the syntax was checked in the first pass. */
2810
2811         else if ((digitab[ptr[1]] && ctype_digit) != 0)
2812           {
2813           int condref;                 /* Don't amalgamate; some compilers */
2814           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
2815           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2816           if (condref == 0)
2817             {
2818             *errorcodeptr = ERR35;
2819             goto FAILED;
2820             }
2821           ptr++;
2822           code[1+LINK_SIZE] = OP_CREF;
2823           PUT2(code, 2+LINK_SIZE, condref);
2824           skipbytes = 3;
2825           }
2826         /* For conditions that are assertions, we just fall through, having
2827         set bravalue above. */
2828         break;
2829
2830         case '=':                 /* Positive lookahead */
2831         bravalue = OP_ASSERT;
2832         ptr++;
2833         break;
2834
2835         case '!':                 /* Negative lookahead */
2836         bravalue = OP_ASSERT_NOT;
2837         ptr++;
2838         break;
2839
2840         case '<':                 /* Lookbehinds */
2841         switch (*(++ptr))
2842           {
2843           case '=':               /* Positive lookbehind */
2844           bravalue = OP_ASSERTBACK;
2845           ptr++;
2846           break;
2847
2848           case '!':               /* Negative lookbehind */
2849           bravalue = OP_ASSERTBACK_NOT;
2850           ptr++;
2851           break;
2852           }
2853         break;
2854
2855         case '>':                 /* One-time brackets */
2856         bravalue = OP_ONCE;
2857         ptr++;
2858         break;
2859
2860         case 'C':                 /* Callout - may be followed by digits; */
2861         previous_callout = code;  /* Save for later completion */
2862         after_manual_callout = 1; /* Skip one item before completing */
2863         *code++ = OP_CALLOUT;     /* Already checked that the terminating */
2864           {                       /* closing parenthesis is present. */
2865           int n = 0;
2866           while ((digitab[*(++ptr)] & ctype_digit) != 0)
2867             n = n * 10 + *ptr - '0';
2868           if (n > 255)
2869             {
2870             *errorcodeptr = ERR38;
2871             goto FAILED;
2872             }
2873           *code++ = n;
2874           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
2875           PUT(code, LINK_SIZE, 0);                    /* Default length */
2876           code += 2 * LINK_SIZE;
2877           }
2878         previous = NULL;
2879         continue;
2880
2881         case 'P':                 /* Named subpattern handling */
2882         if (*(++ptr) == '<')      /* Definition */
2883           {
2884           int i, namelen;
2885           uschar *slot = cd->name_table;
2886           const uschar *name;     /* Don't amalgamate; some compilers */
2887           name = ++ptr;           /* grumble at autoincrement in declaration */
2888
2889           while (*ptr++ != '>');
2890           namelen = ptr - name - 1;
2891
2892           for (i = 0; i < cd->names_found; i++)
2893             {
2894             int crc = memcmp(name, slot+2, namelen);
2895             if (crc == 0)
2896               {
2897               if (slot[2+namelen] == 0)
2898                 {
2899                 *errorcodeptr = ERR43;
2900                 goto FAILED;
2901                 }
2902               crc = -1;             /* Current name is substring */
2903               }
2904             if (crc < 0)
2905               {
2906               memmove(slot + cd->name_entry_size, slot,
2907                 (cd->names_found - i) * cd->name_entry_size);
2908               break;
2909               }
2910             slot += cd->name_entry_size;
2911             }
2912
2913           PUT2(slot, 0, *brackets + 1);
2914           memcpy(slot + 2, name, namelen);
2915           slot[2+namelen] = 0;
2916           cd->names_found++;
2917           goto NUMBERED_GROUP;
2918           }
2919
2920         if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
2921           {
2922           int i, namelen;
2923           int type = *ptr++;
2924           const uschar *name = ptr;
2925           uschar *slot = cd->name_table;
2926
2927           while (*ptr != ')') ptr++;
2928           namelen = ptr - name;
2929
2930           for (i = 0; i < cd->names_found; i++)
2931             {
2932             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2933             slot += cd->name_entry_size;
2934             }
2935           if (i >= cd->names_found)
2936             {
2937             *errorcodeptr = ERR15;
2938             goto FAILED;
2939             }
2940
2941           recno = GET2(slot, 0);
2942
2943           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
2944
2945           /* Back reference */
2946
2947           previous = code;
2948           *code++ = OP_REF;
2949           PUT2INC(code, 0, recno);
2950           cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2951           if (recno > cd->top_backref) cd->top_backref = recno;
2952           continue;
2953           }
2954
2955         /* Should never happen */
2956         break;
2957
2958         case 'R':                 /* Pattern recursion */
2959         ptr++;                    /* Same as (?0)      */
2960         /* Fall through */
2961
2962         /* Recursion or "subroutine" call */
2963
2964         case '0': case '1': case '2': case '3': case '4':
2965         case '5': case '6': case '7': case '8': case '9':
2966           {
2967           const uschar *called;
2968           recno = 0;
2969           while((digitab[*ptr] & ctype_digit) != 0)
2970             recno = recno * 10 + *ptr++ - '0';
2971
2972           /* Come here from code above that handles a named recursion */
2973
2974           HANDLE_RECURSION:
2975
2976           previous = code;
2977
2978           /* Find the bracket that is being referenced. Temporarily end the
2979           regex in case it doesn't exist. */
2980
2981           *code = OP_END;
2982           called = (recno == 0)?
2983             cd->start_code : find_bracket(cd->start_code, utf8, recno);
2984
2985           if (called == NULL)
2986             {
2987             *errorcodeptr = ERR15;
2988             goto FAILED;
2989             }
2990
2991           /* If the subpattern is still open, this is a recursive call. We
2992           check to see if this is a left recursion that could loop for ever,
2993           and diagnose that case. */
2994
2995           if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2996             {
2997             *errorcodeptr = ERR40;
2998             goto FAILED;
2999             }
3000
3001           /* Insert the recursion/subroutine item */
3002
3003           *code = OP_RECURSE;
3004           PUT(code, 1, called - cd->start_code);
3005           code += 1 + LINK_SIZE;
3006           }
3007         continue;
3008
3009         /* Character after (? not specially recognized */
3010
3011         default:                  /* Option setting */
3012         set = unset = 0;
3013         optset = &set;
3014
3015         while (*ptr != ')' && *ptr != ':')
3016           {
3017           switch (*ptr++)
3018             {
3019             case '-': optset = &unset; break;
3020
3021             case 'i': *optset |= PCRE_CASELESS; break;
3022             case 'm': *optset |= PCRE_MULTILINE; break;
3023             case 's': *optset |= PCRE_DOTALL; break;
3024             case 'x': *optset |= PCRE_EXTENDED; break;
3025             case 'U': *optset |= PCRE_UNGREEDY; break;
3026             case 'X': *optset |= PCRE_EXTRA; break;
3027             }
3028           }
3029
3030         /* Set up the changed option bits, but don't change anything yet. */
3031
3032         newoptions = (options | set) & (~unset);
3033
3034         /* If the options ended with ')' this is not the start of a nested
3035         group with option changes, so the options change at this level. Compile
3036         code to change the ims options if this setting actually changes any of
3037         them. We also pass the new setting back so that it can be put at the
3038         start of any following branches, and when this group ends (if we are in
3039         a group), a resetting item can be compiled.
3040
3041         Note that if this item is right at the start of the pattern, the
3042         options will have been abstracted and made global, so there will be no
3043         change to compile. */
3044
3045         if (*ptr == ')')
3046           {
3047           if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3048             {
3049             *code++ = OP_OPT;
3050             *code++ = newoptions & PCRE_IMS;
3051             }
3052
3053           /* Change options at this level, and pass them back for use
3054           in subsequent branches. Reset the greedy defaults and the case
3055           value for firstbyte and reqbyte. */
3056
3057           *optionsptr = options = newoptions;
3058           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3059           greedy_non_default = greedy_default ^ 1;
3060           req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3061
3062           previous = NULL;       /* This item can't be repeated */
3063           continue;              /* It is complete */
3064           }
3065
3066         /* If the options ended with ':' we are heading into a nested group
3067         with possible change of options. Such groups are non-capturing and are
3068         not assertions of any kind. All we need to do is skip over the ':';
3069         the newoptions value is handled below. */
3070
3071         bravalue = OP_BRA;
3072         ptr++;
3073         }
3074       }
3075
3076     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3077     non-capturing and behave like (?:...) brackets */
3078
3079     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3080       {
3081       bravalue = OP_BRA;
3082       }
3083
3084     /* Else we have a referencing group; adjust the opcode. If the bracket
3085     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3086     arrange for the true number to follow later, in an OP_BRANUMBER item. */
3087
3088     else
3089       {
3090       NUMBERED_GROUP:
3091       if (++(*brackets) > EXTRACT_BASIC_MAX)
3092         {
3093         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3094         code[1+LINK_SIZE] = OP_BRANUMBER;
3095         PUT2(code, 2+LINK_SIZE, *brackets);
3096         skipbytes = 3;
3097         }
3098       else bravalue = OP_BRA + *brackets;
3099       }
3100
3101     /* Process nested bracketed re. Assertions may not be repeated, but other
3102     kinds can be. We copy code into a non-register variable in order to be able
3103     to pass its address because some compilers complain otherwise. Pass in a
3104     new setting for the ims options if they have changed. */
3105
3106     previous = (bravalue >= OP_ONCE)? code : NULL;
3107     *code = bravalue;
3108     tempcode = code;
3109     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
3110
3111     if (!compile_regex(
3112          newoptions,                   /* The complete new option state */
3113          options & PCRE_IMS,           /* The previous ims option state */
3114          brackets,                     /* Extracting bracket count */
3115          &tempcode,                    /* Where to put code (updated) */
3116          &ptr,                         /* Input pointer (updated) */
3117          errorcodeptr,                 /* Where to put an error message */
3118          (bravalue == OP_ASSERTBACK ||
3119           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3120          skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
3121          &subfirstbyte,                /* For possible first char */
3122          &subreqbyte,                  /* For possible last char */
3123          bcptr,                        /* Current branch chain */
3124          cd))                          /* Tables block */
3125       goto FAILED;
3126
3127     /* At the end of compiling, code is still pointing to the start of the
3128     group, while tempcode has been updated to point past the end of the group
3129     and any option resetting that may follow it. The pattern pointer (ptr)
3130     is on the bracket. */
3131
3132     /* If this is a conditional bracket, check that there are no more than
3133     two branches in the group. */
3134
3135     else if (bravalue == OP_COND)
3136       {
3137       uschar *tc = code;
3138       condcount = 0;
3139
3140       do {
3141          condcount++;
3142          tc += GET(tc,1);
3143          }
3144       while (*tc != OP_KET);
3145
3146       if (condcount > 2)
3147         {
3148         *errorcodeptr = ERR27;
3149         goto FAILED;
3150         }
3151
3152       /* If there is just one branch, we must not make use of its firstbyte or
3153       reqbyte, because this is equivalent to an empty second branch. */
3154
3155       if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3156       }
3157
3158     /* Handle updating of the required and first characters. Update for normal
3159     brackets of all kinds, and conditions with two branches (see code above).
3160     If the bracket is followed by a quantifier with zero repeat, we have to
3161     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3162     main loop so that they can be accessed for the back off. */
3163
3164     zeroreqbyte = reqbyte;
3165     zerofirstbyte = firstbyte;
3166     groupsetfirstbyte = FALSE;
3167
3168     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3169       {
3170       /* If we have not yet set a firstbyte in this branch, take it from the
3171       subpattern, remembering that it was set here so that a repeat of more
3172       than one can replicate it as reqbyte if necessary. If the subpattern has
3173       no firstbyte, set "none" for the whole branch. In both cases, a zero
3174       repeat forces firstbyte to "none". */
3175
3176       if (firstbyte == REQ_UNSET)
3177         {
3178         if (subfirstbyte >= 0)
3179           {
3180           firstbyte = subfirstbyte;
3181           groupsetfirstbyte = TRUE;
3182           }
3183         else firstbyte = REQ_NONE;
3184         zerofirstbyte = REQ_NONE;
3185         }
3186
3187       /* If firstbyte was previously set, convert the subpattern's firstbyte
3188       into reqbyte if there wasn't one, using the vary flag that was in
3189       existence beforehand. */
3190
3191       else if (subfirstbyte >= 0 && subreqbyte < 0)
3192         subreqbyte = subfirstbyte | tempreqvary;
3193
3194       /* If the subpattern set a required byte (or set a first byte that isn't
3195       really the first byte - see above), set it. */
3196
3197       if (subreqbyte >= 0) reqbyte = subreqbyte;
3198       }
3199
3200     /* For a forward assertion, we take the reqbyte, if set. This can be
3201     helpful if the pattern that follows the assertion doesn't set a different
3202     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3203     for an assertion, however because it leads to incorrect effect for patterns
3204     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3205     of a firstbyte. This is overcome by a scan at the end if there's no
3206     firstbyte, looking for an asserted first char. */
3207
3208     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3209
3210     /* Now update the main code pointer to the end of the group. */
3211
3212     code = tempcode;
3213
3214     /* Error if hit end of pattern */
3215
3216     if (*ptr != ')')
3217       {
3218       *errorcodeptr = ERR14;
3219       goto FAILED;
3220       }
3221     break;
3222
3223     /* Check \ for being a real metacharacter; if not, fall through and handle
3224     it as a data character at the start of a string. Escape items are checked
3225     for validity in the pre-compiling pass. */
3226
3227     case '\\':
3228     tempptr = ptr;
3229     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3230
3231     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3232     are arranged to be the negation of the corresponding OP_values. For the
3233     back references, the values are ESC_REF plus the reference number. Only
3234     back references and those types that consume a character may be repeated.
3235     We can test for values between ESC_b and ESC_Z for the latter; this may
3236     have to change if any new ones are ever created. */
3237
3238     if (c < 0)
3239       {
3240       if (-c == ESC_Q)            /* Handle start of quoted string */
3241         {
3242         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3243           else inescq = TRUE;
3244         continue;
3245         }
3246
3247       /* For metasequences that actually match a character, we disable the
3248       setting of a first character if it hasn't already been set. */
3249
3250       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3251         firstbyte = REQ_NONE;
3252
3253       /* Set values to reset to if this is followed by a zero repeat. */
3254
3255       zerofirstbyte = firstbyte;
3256       zeroreqbyte = reqbyte;
3257
3258       /* Back references are handled specially */
3259
3260       if (-c >= ESC_REF)
3261         {
3262         int number = -c - ESC_REF;
3263         previous = code;
3264         *code++ = OP_REF;
3265         PUT2INC(code, 0, number);
3266         }
3267
3268       /* So are Unicode property matches, if supported. We know that get_ucp
3269       won't fail because it was tested in the pre-pass. */
3270
3271 #ifdef SUPPORT_UCP
3272       else if (-c == ESC_P || -c == ESC_p)
3273         {
3274         BOOL negated;
3275         int value = get_ucp(&ptr, &negated, errorcodeptr);
3276         previous = code;
3277         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3278         *code++ = value;
3279         }
3280 #endif
3281
3282       /* For the rest, we can obtain the OP value by negating the escape
3283       value */
3284
3285       else
3286         {
3287         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3288         *code++ = -c;
3289         }
3290       continue;
3291       }
3292
3293     /* We have a data character whose value is in c. In UTF-8 mode it may have
3294     a value > 127. We set its representation in the length/buffer, and then
3295     handle it as a data character. */
3296
3297 #ifdef SUPPORT_UTF8
3298     if (utf8 && c > 127)
3299       mclength = _pcre_ord2utf8(c, mcbuffer);
3300     else
3301 #endif
3302
3303      {
3304      mcbuffer[0] = c;
3305      mclength = 1;
3306      }
3307
3308     goto ONE_CHAR;
3309
3310     /* Handle a literal character. It is guaranteed not to be whitespace or #
3311     when the extended flag is set. If we are in UTF-8 mode, it may be a
3312     multi-byte literal character. */
3313
3314     default:
3315     NORMAL_CHAR:
3316     mclength = 1;
3317     mcbuffer[0] = c;
3318
3319 #ifdef SUPPORT_UTF8
3320     if (utf8 && (c & 0xc0) == 0xc0)
3321       {
3322       while ((ptr[1] & 0xc0) == 0x80)
3323         mcbuffer[mclength++] = *(++ptr);
3324       }
3325 #endif
3326
3327     /* At this point we have the character's bytes in mcbuffer, and the length
3328     in mclength. When not in UTF-8 mode, the length is always 1. */
3329
3330     ONE_CHAR:
3331     previous = code;
3332     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3333     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3334
3335     /* Set the first and required bytes appropriately. If no previous first
3336     byte, set it from this character, but revert to none on a zero repeat.
3337     Otherwise, leave the firstbyte value alone, and don't change it on a zero
3338     repeat. */
3339
3340     if (firstbyte == REQ_UNSET)
3341       {
3342       zerofirstbyte = REQ_NONE;
3343       zeroreqbyte = reqbyte;
3344
3345       /* If the character is more than one byte long, we can set firstbyte
3346       only if it is not to be matched caselessly. */
3347
3348       if (mclength == 1 || req_caseopt == 0)
3349         {
3350         firstbyte = mcbuffer[0] | req_caseopt;
3351         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3352         }
3353       else firstbyte = reqbyte = REQ_NONE;
3354       }
3355
3356     /* firstbyte was previously set; we can set reqbyte only if the length is
3357     1 or the matching is caseful. */
3358
3359     else
3360       {
3361       zerofirstbyte = firstbyte;
3362       zeroreqbyte = reqbyte;
3363       if (mclength == 1 || req_caseopt == 0)
3364         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3365       }
3366
3367     break;            /* End of literal character handling */
3368     }
3369   }                   /* end of big loop */
3370
3371 /* Control never reaches here by falling through, only by a goto for all the
3372 error states. Pass back the position in the pattern so that it can be displayed
3373 to the user for diagnosing the error. */
3374
3375 FAILED:
3376 *ptrptr = ptr;
3377 return FALSE;
3378 }
3379
3380
3381
3382
3383 /*************************************************
3384 *     Compile sequence of alternatives           *
3385 *************************************************/
3386
3387 /* On entry, ptr is pointing past the bracket character, but on return
3388 it points to the closing bracket, or vertical bar, or end of string.
3389 The code variable is pointing at the byte into which the BRA operator has been
3390 stored. If the ims options are changed at the start (for a (?ims: group) or
3391 during any branch, we need to insert an OP_OPT item at the start of every
3392 following branch to ensure they get set correctly at run time, and also pass
3393 the new options into every subsequent branch compile.
3394
3395 Arguments:
3396   options        option bits, including any changes for this subpattern
3397   oldims         previous settings of ims option bits
3398   brackets       -> int containing the number of extracting brackets used
3399   codeptr        -> the address of the current code pointer
3400   ptrptr         -> the address of the current pattern pointer
3401   errorcodeptr   -> pointer to error code variable
3402   lookbehind     TRUE if this is a lookbehind assertion
3403   skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3404   firstbyteptr   place to put the first required character, or a negative number
3405   reqbyteptr     place to put the last required character, or a negative number
3406   bcptr          pointer to the chain of currently open branches
3407   cd             points to the data block with tables pointers etc.
3408
3409 Returns:      TRUE on success
3410 */
3411
3412 static BOOL
3413 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3414   const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3415   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3416 {
3417 const uschar *ptr = *ptrptr;
3418 uschar *code = *codeptr;
3419 uschar *last_branch = code;
3420 uschar *start_bracket = code;
3421 uschar *reverse_count = NULL;
3422 int firstbyte, reqbyte;
3423 int branchfirstbyte, branchreqbyte;
3424 branch_chain bc;
3425
3426 bc.outer = bcptr;
3427 bc.current = code;
3428
3429 firstbyte = reqbyte = REQ_UNSET;
3430
3431 /* Offset is set zero to mark that this bracket is still open */
3432
3433 PUT(code, 1, 0);
3434 code += 1 + LINK_SIZE + skipbytes;
3435
3436 /* Loop for each alternative branch */
3437
3438 for (;;)
3439   {
3440   /* Handle a change of ims options at the start of the branch */
3441
3442   if ((options & PCRE_IMS) != oldims)
3443     {
3444     *code++ = OP_OPT;
3445     *code++ = options & PCRE_IMS;
3446     }
3447
3448   /* Set up dummy OP_REVERSE if lookbehind assertion */
3449
3450   if (lookbehind)
3451     {
3452     *code++ = OP_REVERSE;
3453     reverse_count = code;
3454     PUTINC(code, 0, 0);
3455     }
3456
3457   /* Now compile the branch */
3458
3459   if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3460         &branchfirstbyte, &branchreqbyte, &bc, cd))
3461     {
3462     *ptrptr = ptr;
3463     return FALSE;
3464     }
3465
3466   /* If this is the first branch, the firstbyte and reqbyte values for the
3467   branch become the values for the regex. */
3468
3469   if (*last_branch != OP_ALT)
3470     {
3471     firstbyte = branchfirstbyte;
3472     reqbyte = branchreqbyte;
3473     }
3474
3475   /* If this is not the first branch, the first char and reqbyte have to
3476   match the values from all the previous branches, except that if the previous
3477   value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3478   REQ_VARY for the regex. */
3479
3480   else
3481     {
3482     /* If we previously had a firstbyte, but it doesn't match the new branch,
3483     we have to abandon the firstbyte for the regex, but if there was previously
3484     no reqbyte, it takes on the value of the old firstbyte. */
3485
3486     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3487       {
3488       if (reqbyte < 0) reqbyte = firstbyte;
3489       firstbyte = REQ_NONE;
3490       }
3491
3492     /* If we (now or from before) have no firstbyte, a firstbyte from the
3493     branch becomes a reqbyte if there isn't a branch reqbyte. */
3494
3495     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3496         branchreqbyte = branchfirstbyte;
3497
3498     /* Now ensure that the reqbytes match */
3499
3500     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3501       reqbyte = REQ_NONE;
3502     else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
3503     }
3504
3505   /* If lookbehind, check that this branch matches a fixed-length string,
3506   and put the length into the OP_REVERSE item. Temporarily mark the end of
3507   the branch with OP_END. */
3508
3509   if (lookbehind)
3510     {
3511     int length;
3512     *code = OP_END;
3513     length = find_fixedlength(last_branch, options);
3514     DPRINTF(("fixed length = %d\n", length));
3515     if (length < 0)
3516       {
3517       *errorcodeptr = (length == -2)? ERR36 : ERR25;
3518       *ptrptr = ptr;
3519       return FALSE;
3520       }
3521     PUT(reverse_count, 0, length);
3522     }
3523
3524   /* Reached end of expression, either ')' or end of pattern. Go back through
3525   the alternative branches and reverse the chain of offsets, with the field in
3526   the BRA item now becoming an offset to the first alternative. If there are
3527   no alternatives, it points to the end of the group. The length in the
3528   terminating ket is always the length of the whole bracketed item. If any of
3529   the ims options were changed inside the group, compile a resetting op-code
3530   following, except at the very end of the pattern. Return leaving the pointer
3531   at the terminating char. */
3532
3533   if (*ptr != '|')
3534     {
3535     int length = code - last_branch;
3536     do
3537       {
3538       int prev_length = GET(last_branch, 1);
3539       PUT(last_branch, 1, length);
3540       length = prev_length;
3541       last_branch -= length;
3542       }
3543     while (length > 0);
3544
3545     /* Fill in the ket */
3546
3547     *code = OP_KET;
3548     PUT(code, 1, code - start_bracket);
3549     code += 1 + LINK_SIZE;
3550
3551     /* Resetting option if needed */
3552
3553     if ((options & PCRE_IMS) != oldims && *ptr == ')')
3554       {
3555       *code++ = OP_OPT;
3556       *code++ = oldims;
3557       }
3558
3559     /* Set values to pass back */
3560
3561     *codeptr = code;
3562     *ptrptr = ptr;
3563     *firstbyteptr = firstbyte;
3564     *reqbyteptr = reqbyte;
3565     return TRUE;
3566     }
3567
3568   /* Another branch follows; insert an "or" node. Its length field points back
3569   to the previous branch while the bracket remains open. At the end the chain
3570   is reversed. It's done like this so that the start of the bracket has a
3571   zero offset until it is closed, making it possible to detect recursion. */
3572
3573   *code = OP_ALT;
3574   PUT(code, 1, code - last_branch);
3575   bc.current = last_branch = code;
3576   code += 1 + LINK_SIZE;
3577   ptr++;
3578   }
3579 /* Control never reaches here */
3580 }
3581
3582
3583
3584
3585 /*************************************************
3586 *          Check for anchored expression         *
3587 *************************************************/
3588
3589 /* Try to find out if this is an anchored regular expression. Consider each
3590 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3591 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3592 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3593 counts, since OP_CIRC can match in the middle.
3594
3595 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3596 This is the code for \G, which means "match at start of match position, taking
3597 into account the match offset".
3598
3599 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3600 because that will try the rest of the pattern at all possible matching points,
3601 so there is no point trying again.... er ....
3602
3603 .... except when the .* appears inside capturing parentheses, and there is a
3604 subsequent back reference to those parentheses. We haven't enough information
3605 to catch that case precisely.
3606
3607 At first, the best we could do was to detect when .* was in capturing brackets
3608 and the highest back reference was greater than or equal to that level.
3609 However, by keeping a bitmap of the first 31 back references, we can catch some
3610 of the more common cases more precisely.
3611
3612 Arguments:
3613   code           points to start of expression (the bracket)
3614   options        points to the options setting
3615   bracket_map    a bitmap of which brackets we are inside while testing; this
3616                   handles up to substring 31; after that we just have to take
3617                   the less precise approach
3618   backref_map    the back reference bitmap
3619
3620 Returns:     TRUE or FALSE
3621 */
3622
3623 static BOOL
3624 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3625   unsigned int backref_map)
3626 {
3627 do {
3628    const uschar *scode =
3629      first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3630    register int op = *scode;
3631
3632    /* Capturing brackets */
3633
3634    if (op > OP_BRA)
3635      {
3636      int new_map;
3637      op -= OP_BRA;
3638      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3639      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3640      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3641      }
3642
3643    /* Other brackets */
3644
3645    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3646      {
3647      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3648      }
3649
3650    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3651    are or may be referenced. */
3652
3653    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3654             (*options & PCRE_DOTALL) != 0)
3655      {
3656      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3657      }
3658
3659    /* Check for explicit anchoring */
3660
3661    else if (op != OP_SOD && op != OP_SOM &&
3662            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3663      return FALSE;
3664    code += GET(code, 1);
3665    }
3666 while (*code == OP_ALT);   /* Loop for each alternative */
3667 return TRUE;
3668 }
3669
3670
3671
3672 /*************************************************
3673 *         Check for starting with ^ or .*        *
3674 *************************************************/
3675
3676 /* This is called to find out if every branch starts with ^ or .* so that
3677 "first char" processing can be done to speed things up in multiline
3678 matching and for non-DOTALL patterns that start with .* (which must start at
3679 the beginning or after \n). As in the case of is_anchored() (see above), we
3680 have to take account of back references to capturing brackets that contain .*
3681 because in that case we can't make the assumption.
3682
3683 Arguments:
3684   code           points to start of expression (the bracket)
3685   bracket_map    a bitmap of which brackets we are inside while testing; this
3686                   handles up to substring 31; after that we just have to take
3687                   the less precise approach
3688   backref_map    the back reference bitmap
3689
3690 Returns:         TRUE or FALSE
3691 */
3692
3693 static BOOL
3694 is_startline(const uschar *code, unsigned int bracket_map,
3695   unsigned int backref_map)
3696 {
3697 do {
3698    const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3699      FALSE);
3700    register int op = *scode;
3701
3702    /* Capturing brackets */
3703
3704    if (op > OP_BRA)
3705      {
3706      int new_map;
3707      op -= OP_BRA;
3708      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3709      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3710      if (!is_startline(scode, new_map, backref_map)) return FALSE;
3711      }
3712
3713    /* Other brackets */
3714
3715    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3716      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3717
3718    /* .* means "start at start or after \n" if it isn't in brackets that
3719    may be referenced. */
3720
3721    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3722      {
3723      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3724      }
3725
3726    /* Check for explicit circumflex */
3727
3728    else if (op != OP_CIRC) return FALSE;
3729
3730    /* Move on to the next alternative */
3731
3732    code += GET(code, 1);
3733    }
3734 while (*code == OP_ALT);  /* Loop for each alternative */
3735 return TRUE;
3736 }
3737
3738
3739
3740 /*************************************************
3741 *       Check for asserted fixed first char      *
3742 *************************************************/
3743
3744 /* During compilation, the "first char" settings from forward assertions are
3745 discarded, because they can cause conflicts with actual literals that follow.
3746 However, if we end up without a first char setting for an unanchored pattern,
3747 it is worth scanning the regex to see if there is an initial asserted first
3748 char. If all branches start with the same asserted char, or with a bracket all
3749 of whose alternatives start with the same asserted char (recurse ad lib), then
3750 we return that char, otherwise -1.
3751
3752 Arguments:
3753   code       points to start of expression (the bracket)
3754   options    pointer to the options (used to check casing changes)
3755   inassert   TRUE if in an assertion
3756
3757 Returns:     -1 or the fixed first char
3758 */
3759
3760 static int
3761 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3762 {
3763 register int c = -1;
3764 do {
3765    int d;
3766    const uschar *scode =
3767      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3768    register int op = *scode;
3769
3770    if (op >= OP_BRA) op = OP_BRA;
3771
3772    switch(op)
3773      {
3774      default:
3775      return -1;
3776
3777      case OP_BRA:
3778      case OP_ASSERT:
3779      case OP_ONCE:
3780      case OP_COND:
3781      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3782        return -1;
3783      if (c < 0) c = d; else if (c != d) return -1;
3784      break;
3785
3786      case OP_EXACT:       /* Fall through */
3787      scode += 2;
3788
3789      case OP_CHAR:
3790      case OP_CHARNC:
3791      case OP_PLUS:
3792      case OP_MINPLUS:
3793      if (!inassert) return -1;
3794      if (c < 0)
3795        {
3796        c = scode[1];
3797        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3798        }
3799      else if (c != scode[1]) return -1;
3800      break;
3801      }
3802
3803    code += GET(code, 1);
3804    }
3805 while (*code == OP_ALT);
3806 return c;
3807 }
3808
3809
3810
3811 /*************************************************
3812 *        Compile a Regular Expression            *
3813 *************************************************/
3814
3815 /* This function takes a string and returns a pointer to a block of store
3816 holding a compiled version of the expression. The original API for this
3817 function had no error code return variable; it is retained for backwards
3818 compatibility. The new function is given a new name.
3819
3820 Arguments:
3821   pattern       the regular expression
3822   options       various option bits
3823   errorcodeptr  pointer to error code variable (pcre_compile2() only)
3824                   can be NULL if you don't want a code value
3825   errorptr      pointer to pointer to error text
3826   erroroffset   ptr offset in pattern where error was detected
3827   tables        pointer to character tables or NULL
3828
3829 Returns:        pointer to compiled data block, or NULL on error,
3830                 with errorptr and erroroffset set
3831 */
3832
3833 EXPORT pcre *
3834 pcre_compile(const char *pattern, int options, const char **errorptr,
3835   int *erroroffset, const unsigned char *tables)
3836 {
3837 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3838 }
3839
3840
3841 EXPORT pcre *
3842 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3843   const char **errorptr, int *erroroffset, const unsigned char *tables)
3844 {
3845 real_pcre *re;
3846 int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
3847 int c, firstbyte, reqbyte;
3848 int bracount = 0;
3849 int branch_extra = 0;
3850 int branch_newextra;
3851 int item_count = -1;
3852 int name_count = 0;
3853 int max_name_size = 0;
3854 int lastitemlength = 0;
3855 int errorcode = 0;
3856 #ifdef SUPPORT_UTF8
3857 BOOL utf8;
3858 BOOL class_utf8;
3859 #endif
3860 BOOL inescq = FALSE;
3861 unsigned int brastackptr = 0;
3862 size_t size;
3863 uschar *code;
3864 const uschar *codestart;
3865 const uschar *ptr;
3866 compile_data compile_block;
3867 int brastack[BRASTACK_SIZE];
3868 uschar bralenstack[BRASTACK_SIZE];
3869
3870 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3871 can do is just return NULL, but we can set a code value if there is a code
3872 pointer. */
3873
3874 if (errorptr == NULL)
3875   {
3876   if (errorcodeptr != NULL) *errorcodeptr = 99;
3877   return NULL;
3878   }
3879
3880 *errorptr = NULL;
3881 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3882
3883 /* However, we can give a message for this error */
3884
3885 if (erroroffset == NULL)
3886   {
3887   errorcode = ERR16;
3888   goto PCRE_EARLY_ERROR_RETURN;
3889   }
3890
3891 *erroroffset = 0;
3892
3893 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3894
3895 #ifdef SUPPORT_UTF8
3896 utf8 = (options & PCRE_UTF8) != 0;
3897 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3898      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3899   {
3900   errorcode = ERR44;
3901   goto PCRE_EARLY_ERROR_RETURN;
3902   }
3903 #else
3904 if ((options & PCRE_UTF8) != 0)
3905   {
3906   errorcode = ERR32;
3907   goto PCRE_EARLY_ERROR_RETURN;
3908   }
3909 #endif
3910
3911 if ((options & ~PUBLIC_OPTIONS) != 0)
3912   {
3913   errorcode = ERR17;
3914   goto PCRE_EARLY_ERROR_RETURN;
3915   }
3916
3917 /* Set up pointers to the individual character tables */
3918
3919 if (tables == NULL) tables = _pcre_default_tables;
3920 compile_block.lcc = tables + lcc_offset;
3921 compile_block.fcc = tables + fcc_offset;
3922 compile_block.cbits = tables + cbits_offset;
3923 compile_block.ctypes = tables + ctypes_offset;
3924
3925 /* Maximum back reference and backref bitmap. This is updated for numeric
3926 references during the first pass, but for named references during the actual
3927 compile pass. The bitmap records up to 31 back references to help in deciding
3928 whether (.*) can be treated as anchored or not. */
3929
3930 compile_block.top_backref = 0;
3931 compile_block.backref_map = 0;
3932
3933 /* Reflect pattern for debugging output */
3934
3935 DPRINTF(("------------------------------------------------------------------\n"));
3936 DPRINTF(("%s\n", pattern));
3937
3938 /* The first thing to do is to make a pass over the pattern to compute the
3939 amount of store required to hold the compiled code. This does not have to be
3940 perfect as long as errors are overestimates. At the same time we can detect any
3941 flag settings right at the start, and extract them. Make an attempt to correct
3942 for any counted white space if an "extended" flag setting appears late in the
3943 pattern. We can't be so clever for #-comments. */
3944
3945 ptr = (const uschar *)(pattern - 1);
3946 while ((c = *(++ptr)) != 0)
3947   {
3948   int min, max;
3949   int class_optcount;
3950   int bracket_length;
3951   int duplength;
3952
3953   /* If we are inside a \Q...\E sequence, all chars are literal */
3954
3955   if (inescq)
3956     {
3957     if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3958     goto NORMAL_CHAR;
3959     }
3960
3961   /* Otherwise, first check for ignored whitespace and comments */
3962
3963   if ((options & PCRE_EXTENDED) != 0)
3964     {
3965     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3966     if (c == '#')
3967       {
3968       /* The space before the ; is to avoid a warning on a silly compiler
3969       on the Macintosh. */
3970       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3971       if (c == 0) break;
3972       continue;
3973       }
3974     }
3975
3976   item_count++;    /* Is zero for the first non-comment item */
3977
3978   /* Allow space for auto callout before every item except quantifiers. */
3979
3980   if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3981        c != '*' && c != '+' && c != '?' &&
3982        (c != '{' || !is_counted_repeat(ptr + 1)))
3983     length += 2 + 2*LINK_SIZE;
3984
3985   switch(c)
3986     {
3987     /* A backslashed item may be an escaped data character or it may be a
3988     character type. */
3989
3990     case '\\':
3991     c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
3992     if (errorcode != 0) goto PCRE_ERROR_RETURN;
3993
3994     lastitemlength = 1;     /* Default length of last item for repeats */
3995
3996     if (c >= 0)             /* Data character */
3997       {
3998       length += 2;          /* For a one-byte character */
3999
4000 #ifdef SUPPORT_UTF8
4001       if (utf8 && c > 127)
4002         {
4003         int i;
4004         for (i = 0; i < _pcre_utf8_table1_size; i++)
4005           if (c <= _pcre_utf8_table1[i]) break;
4006         length += i;
4007         lastitemlength += i;
4008         }
4009 #endif
4010
4011       continue;
4012       }
4013
4014     /* If \Q, enter "literal" mode */
4015
4016     if (-c == ESC_Q)
4017       {
4018       inescq = TRUE;
4019       continue;
4020       }
4021
4022     /* \X is supported only if Unicode property support is compiled */
4023
4024 #ifndef SUPPORT_UCP
4025     if (-c == ESC_X)
4026       {
4027       errorcode = ERR45;
4028       goto PCRE_ERROR_RETURN;
4029       }
4030 #endif
4031
4032     /* \P and \p are for Unicode properties, but only when the support has
4033     been compiled. Each item needs 2 bytes. */
4034
4035     else if (-c == ESC_P || -c == ESC_p)
4036       {
4037 #ifdef SUPPORT_UCP
4038       BOOL negated;
4039       length += 2;
4040       lastitemlength = 2;
4041       if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4042       continue;
4043 #else
4044       errorcode = ERR45;
4045       goto PCRE_ERROR_RETURN;
4046 #endif
4047       }
4048
4049     /* Other escapes need one byte */
4050
4051     length++;
4052
4053     /* A back reference needs an additional 2 bytes, plus either one or 5
4054     bytes for a repeat. We also need to keep the value of the highest
4055     back reference. */
4056
4057     if (c <= -ESC_REF)
4058       {
4059       int refnum = -c - ESC_REF;
4060       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4061       if (refnum > compile_block.top_backref)
4062         compile_block.top_backref = refnum;
4063       length += 2;   /* For single back reference */
4064       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4065         {
4066         ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4067         if (errorcode != 0) goto PCRE_ERROR_RETURN;
4068         if ((min == 0 && (max == 1 || max == -1)) ||
4069           (min == 1 && max == -1))
4070             length++;
4071         else length += 5;
4072         if (ptr[1] == '?') ptr++;
4073         }
4074       }
4075     continue;
4076
4077     case '^':     /* Single-byte metacharacters */
4078     case '.':
4079     case '$':
4080     length++;
4081     lastitemlength = 1;
4082     continue;
4083
4084     case '*':            /* These repeats won't be after brackets; */
4085     case '+':            /* those are handled separately */
4086     case '?':
4087     length++;
4088     goto POSESSIVE;      /* A few lines below */
4089
4090     /* This covers the cases of braced repeats after a single char, metachar,
4091     class, or back reference. */
4092
4093     case '{':
4094     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4095     ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4096     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4097
4098     /* These special cases just insert one extra opcode */
4099
4100     if ((min == 0 && (max == 1 || max == -1)) ||
4101       (min == 1 && max == -1))
4102         length++;
4103
4104     /* These cases might insert additional copies of a preceding character. */
4105
4106     else
4107       {
4108       if (min != 1)
4109         {
4110         length -= lastitemlength;   /* Uncount the original char or metachar */
4111         if (min > 0) length += 3 + lastitemlength;
4112         }
4113       length += lastitemlength + ((max > 0)? 3 : 1);
4114       }
4115
4116     if (ptr[1] == '?') ptr++;      /* Needs no extra length */
4117
4118     POSESSIVE:                     /* Test for possessive quantifier */
4119     if (ptr[1] == '+')
4120       {
4121       ptr++;
4122       length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
4123       }
4124     continue;
4125
4126     /* An alternation contains an offset to the next branch or ket. If any ims
4127     options changed in the previous branch(es), and/or if we are in a
4128     lookbehind assertion, extra space will be needed at the start of the
4129     branch. This is handled by branch_extra. */
4130
4131     case '|':
4132     length += 1 + LINK_SIZE + branch_extra;
4133     continue;
4134
4135     /* A character class uses 33 characters provided that all the character
4136     values are less than 256. Otherwise, it uses a bit map for low valued
4137     characters, and individual items for others. Don't worry about character
4138     types that aren't allowed in classes - they'll get picked up during the
4139     compile. A character class that contains only one single-byte character
4140     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4141     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4142
4143     case '[':
4144     if (*(++ptr) == '^')
4145       {
4146       class_optcount = 10;  /* Greater than one */
4147       ptr++;
4148       }
4149     else class_optcount = 0;
4150
4151 #ifdef SUPPORT_UTF8
4152     class_utf8 = FALSE;
4153 #endif
4154
4155     /* Written as a "do" so that an initial ']' is taken as data */
4156
4157     if (*ptr != 0) do
4158       {
4159       /* Inside \Q...\E everything is literal except \E */
4160
4161       if (inescq)
4162         {
4163         if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4164         inescq = FALSE;
4165         ptr += 1;
4166         continue;
4167         }
4168
4169       /* Outside \Q...\E, check for escapes */
4170
4171       if (*ptr == '\\')
4172         {
4173         c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4174         if (errorcode != 0) goto PCRE_ERROR_RETURN;
4175
4176         /* \b is backspace inside a class; \X is literal */
4177
4178         if (-c == ESC_b) c = '\b';
4179         else if (-c == ESC_X) c = 'X';
4180
4181         /* \Q enters quoting mode */
4182
4183         else if (-c == ESC_Q)
4184           {
4185           inescq = TRUE;
4186           continue;
4187           }
4188
4189         /* Handle escapes that turn into characters */
4190
4191         if (c >= 0) goto NON_SPECIAL_CHARACTER;
4192
4193         /* Escapes that are meta-things. The normal ones just affect the
4194         bit map, but Unicode properties require an XCLASS extended item. */
4195
4196         else
4197           {
4198           class_optcount = 10;         /* \d, \s etc; make sure > 1 */
4199 #ifdef SUPPORT_UTF8
4200           if (-c == ESC_p || -c == ESC_P)
4201             {
4202             if (!class_utf8)
4203               {
4204               class_utf8 = TRUE;
4205               length += LINK_SIZE + 2;
4206               }
4207             length += 2;
4208             }
4209 #endif
4210           }
4211         }
4212
4213       /* Check the syntax for POSIX stuff. The bits we actually handle are
4214       checked during the real compile phase. */
4215
4216       else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4217         {
4218         ptr++;
4219         class_optcount = 10;    /* Make sure > 1 */
4220         }
4221
4222       /* Anything else increments the possible optimization count. We have to
4223       detect ranges here so that we can compute the number of extra ranges for
4224       caseless wide characters when UCP support is available. If there are wide
4225       characters, we are going to have to use an XCLASS, even for single
4226       characters. */
4227
4228       else
4229         {
4230         int d;
4231
4232         GET_ONE_CHARACTER:
4233
4234 #ifdef SUPPORT_UTF8
4235         if (utf8)
4236           {
4237           int extra = 0;
4238           GETCHARLEN(c, ptr, extra);
4239           ptr += extra;
4240           }
4241         else c = *ptr;
4242 #else
4243         c = *ptr;
4244 #endif
4245
4246         /* Come here from handling \ above when it escapes to a char value */
4247
4248         NON_SPECIAL_CHARACTER:
4249         class_optcount++;
4250
4251         d = -1;
4252         if (ptr[1] == '-')
4253           {
4254           uschar const *hyptr = ptr++;
4255           if (ptr[1] == '\\')
4256             {
4257             ptr++;
4258             d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4259             if (errorcode != 0) goto PCRE_ERROR_RETURN;
4260             if (-d == ESC_b) d = '\b';        /* backspace */
4261             else if (-d == ESC_X) d = 'X';    /* literal X in a class */
4262             }
4263           else if (ptr[1] != 0 && ptr[1] != ']')
4264             {
4265             ptr++;
4266 #ifdef SUPPORT_UTF8
4267             if (utf8)
4268               {
4269               int extra = 0;
4270               GETCHARLEN(d, ptr, extra);
4271               ptr += extra;
4272               }
4273             else
4274 #endif
4275             d = *ptr;
4276             }
4277           if (d < 0) ptr = hyptr;      /* go back to hyphen as data */
4278           }
4279
4280         /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4281         127 for caseless matching, we will need to use an XCLASS. */
4282
4283         if (d >= 0)
4284           {
4285           class_optcount = 10;     /* Ensure > 1 */
4286           if (d < c)
4287             {
4288             errorcode = ERR8;
4289             goto PCRE_ERROR_RETURN;
4290             }
4291
4292 #ifdef SUPPORT_UTF8
4293           if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4294             {
4295             uschar buffer[6];
4296             if (!class_utf8)         /* Allow for XCLASS overhead */
4297               {
4298               class_utf8 = TRUE;
4299               length += LINK_SIZE + 2;
4300               }
4301
4302 #ifdef SUPPORT_UCP
4303             /* If we have UCP support, find out how many extra ranges are
4304             needed to map the other case of characters within this range. We
4305             have to mimic the range optimization here, because extending the
4306             range upwards might push d over a boundary that makes it use
4307             another byte in the UTF-8 representation. */
4308
4309             if ((options & PCRE_CASELESS) != 0)
4310               {
4311               int occ, ocd;
4312               int cc = c;
4313               int origd = d;
4314               while (get_othercase_range(&cc, origd, &occ, &ocd))
4315                 {
4316                 if (occ >= c && ocd <= d) continue;   /* Skip embedded */
4317
4318                 if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
4319                   {                            /* if there is overlap,   */
4320                   c = occ;                     /* noting that if occ < c */
4321                   continue;                    /* we can't have ocd > d  */
4322                   }                            /* because a subrange is  */
4323                 if (ocd > d && occ <= d + 1)   /* always shorter than    */
4324                   {                            /* the basic range.       */
4325                   d = ocd;
4326                   continue;
4327                   }
4328
4329                 /* An extra item is needed */
4330
4331                 length += 1 + _pcre_ord2utf8(occ, buffer) +
4332                   ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4333                 }
4334               }
4335 #endif  /* SUPPORT_UCP */
4336
4337             /* The length of the (possibly extended) range */
4338
4339             length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4340             }
4341 #endif  /* SUPPORT_UTF8 */
4342
4343           }
4344
4345         /* We have a single character. There is nothing to be done unless we
4346         are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4347         allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4348         support. */
4349
4350         else
4351           {
4352 #ifdef SUPPORT_UTF8
4353           if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4354             {
4355             uschar buffer[6];
4356             class_optcount = 10;     /* Ensure > 1 */
4357             if (!class_utf8)         /* Allow for XCLASS overhead */
4358               {
4359               class_utf8 = TRUE;
4360               length += LINK_SIZE + 2;
4361               }
4362 #ifdef SUPPORT_UCP
4363             length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4364               (1 + _pcre_ord2utf8(c, buffer));
4365 #else   /* SUPPORT_UCP */
4366             length += 1 + _pcre_ord2utf8(c, buffer);
4367 #endif  /* SUPPORT_UCP */
4368             }
4369 #endif  /* SUPPORT_UTF8 */
4370           }
4371         }
4372       }
4373     while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4374
4375     if (*ptr == 0)                          /* Missing terminating ']' */
4376       {
4377       errorcode = ERR6;
4378       goto PCRE_ERROR_RETURN;
4379       }
4380
4381     /* We can optimize when there was only one optimizable character. Repeats
4382     for positive and negated single one-byte chars are handled by the general
4383     code. Here, we handle repeats for the class opcodes. */
4384
4385     if (class_optcount == 1) length += 3; else
4386       {
4387       length += 33;
4388
4389       /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4390       we also need extra for wrapping the whole thing in a sub-pattern. */
4391
4392       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4393         {
4394         ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4395         if (errorcode != 0) goto PCRE_ERROR_RETURN;
4396         if ((min == 0 && (max == 1 || max == -1)) ||
4397           (min == 1 && max == -1))
4398             length++;
4399         else length += 5;
4400         if (ptr[1] == '+')
4401           {
4402           ptr++;
4403           length += 2 + 2*LINK_SIZE;
4404           }
4405         else if (ptr[1] == '?') ptr++;
4406         }
4407       }
4408     continue;
4409
4410     /* Brackets may be genuine groups or special things */
4411
4412     case '(':
4413     branch_newextra = 0;
4414     bracket_length = 1 + LINK_SIZE;
4415
4416     /* Handle special forms of bracket, which all start (? */
4417
4418     if (ptr[1] == '?')
4419       {
4420       int set, unset;
4421       int *optset;
4422
4423       switch (c = ptr[2])
4424         {
4425         /* Skip over comments entirely */
4426         case '#':
4427         ptr += 3;
4428         while (*ptr != 0 && *ptr != ')') ptr++;
4429         if (*ptr == 0)
4430           {
4431           errorcode = ERR18;
4432           goto PCRE_ERROR_RETURN;
4433           }
4434         continue;
4435
4436         /* Non-referencing groups and lookaheads just move the pointer on, and
4437         then behave like a non-special bracket, except that they don't increment
4438         the count of extracting brackets. Ditto for the "once only" bracket,
4439         which is in Perl from version 5.005. */
4440
4441         case ':':
4442         case '=':
4443         case '!':
4444         case '>':
4445         ptr += 2;
4446         break;
4447
4448         /* (?R) specifies a recursive call to the regex, which is an extension
4449         to provide the facility which can be obtained by (?p{perl-code}) in
4450         Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4451
4452         From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4453         the appropriate numbered brackets. This includes both recursive and
4454         non-recursive calls. (?R) is now synonymous with (?0). */
4455
4456         case 'R':
4457         ptr++;
4458
4459         case '0': case '1': case '2': case '3': case '4':
4460         case '5': case '6': case '7': case '8': case '9':
4461         ptr += 2;
4462         if (c != 'R')
4463           while ((digitab[*(++ptr)] & ctype_digit) != 0);
4464         if (*ptr != ')')
4465           {
4466           errorcode = ERR29;
4467           goto PCRE_ERROR_RETURN;
4468           }
4469         length += 1 + LINK_SIZE;
4470
4471         /* If this item is quantified, it will get wrapped inside brackets so
4472         as to use the code for quantified brackets. We jump down and use the
4473         code that handles this for real brackets. */
4474
4475         if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4476           {
4477           length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
4478           duplength = 5 + 3 * LINK_SIZE;
4479           goto HANDLE_QUANTIFIED_BRACKETS;
4480           }
4481         continue;
4482
4483         /* (?C) is an extension which provides "callout" - to provide a bit of
4484         the functionality of the Perl (?{...}) feature. An optional number may
4485         follow (default is zero). */
4486
4487         case 'C':
4488         ptr += 2;
4489         while ((digitab[*(++ptr)] & ctype_digit) != 0);
4490         if (*ptr != ')')
4491           {
4492           errorcode = ERR39;
4493           goto PCRE_ERROR_RETURN;
4494           }
4495         length += 2 + 2*LINK_SIZE;
4496         continue;
4497
4498         /* Named subpatterns are an extension copied from Python */
4499
4500         case 'P':
4501         ptr += 3;
4502         if (*ptr == '<')
4503           {
4504           const uschar *p;    /* Don't amalgamate; some compilers */
4505           p = ++ptr;          /* grumble at autoincrement in declaration */
4506           while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4507           if (*ptr != '>')
4508             {
4509             errorcode = ERR42;
4510             goto PCRE_ERROR_RETURN;
4511             }
4512           name_count++;
4513           if (ptr - p > max_name_size) max_name_size = (ptr - p);
4514           break;
4515           }
4516
4517         if (*ptr == '=' || *ptr == '>')
4518           {
4519           while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4520           if (*ptr != ')')
4521             {
4522             errorcode = ERR42;
4523             goto PCRE_ERROR_RETURN;
4524             }
4525           break;
4526           }
4527
4528         /* Unknown character after (?P */
4529
4530         errorcode = ERR41;
4531         goto PCRE_ERROR_RETURN;
4532
4533         /* Lookbehinds are in Perl from version 5.005 */
4534
4535         case '<':
4536         ptr += 3;
4537         if (*ptr == '=' || *ptr == '!')
4538           {
4539           branch_newextra = 1 + LINK_SIZE;
4540           length += 1 + LINK_SIZE;         /* For the first branch */
4541           break;
4542           }
4543         errorcode = ERR24;
4544         goto PCRE_ERROR_RETURN;
4545
4546         /* Conditionals are in Perl from version 5.005. The bracket must either
4547         be followed by a number (for bracket reference) or by an assertion
4548         group, or (a PCRE extension) by 'R' for a recursion test. */
4549
4550         case '(':
4551         if (ptr[3] == 'R' && ptr[4] == ')')
4552           {
4553           ptr += 4;
4554           length += 3;
4555           }
4556         else if ((digitab[ptr[3]] & ctype_digit) != 0)
4557           {
4558           ptr += 4;
4559           length += 3;
4560           while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4561           if (*ptr != ')')
4562             {
4563             errorcode = ERR26;
4564             goto PCRE_ERROR_RETURN;
4565             }
4566           }
4567         else   /* An assertion must follow */
4568           {
4569           ptr++;   /* Can treat like ':' as far as spacing is concerned */
4570           if (ptr[2] != '?' ||
4571              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4572             {
4573             ptr += 2;    /* To get right offset in message */
4574             errorcode = ERR28;
4575             goto PCRE_ERROR_RETURN;
4576             }
4577           }
4578         break;
4579
4580         /* Else loop checking valid options until ) is met. Anything else is an
4581         error. If we are without any brackets, i.e. at top level, the settings
4582         act as if specified in the options, so massage the options immediately.
4583         This is for backward compatibility with Perl 5.004. */
4584
4585         default:
4586         set = unset = 0;
4587         optset = &set;
4588         ptr += 2;
4589
4590         for (;; ptr++)
4591           {
4592           c = *ptr;
4593           switch (c)
4594             {
4595             case 'i':
4596             *optset |= PCRE_CASELESS;
4597             continue;
4598
4599             case 'm':
4600             *optset |= PCRE_MULTILINE;
4601             continue;
4602
4603             case 's':
4604             *optset |= PCRE_DOTALL;
4605             continue;
4606
4607             case 'x':
4608             *optset |= PCRE_EXTENDED;
4609             continue;
4610
4611             case 'X':
4612             *optset |= PCRE_EXTRA;
4613             continue;
4614
4615             case 'U':
4616             *optset |= PCRE_UNGREEDY;
4617             continue;
4618
4619             case '-':
4620             optset = &unset;
4621             continue;
4622
4623             /* A termination by ')' indicates an options-setting-only item; if
4624             this is at the very start of the pattern (indicated by item_count
4625             being zero), we use it to set the global options. This is helpful
4626             when analyzing the pattern for first characters, etc. Otherwise
4627             nothing is done here and it is handled during the compiling
4628             process.
4629
4630             [Historical note: Up to Perl 5.8, options settings at top level
4631             were always global settings, wherever they appeared in the pattern.
4632             That is, they were equivalent to an external setting. From 5.8
4633             onwards, they apply only to what follows (which is what you might
4634             expect).] */
4635
4636             case ')':
4637             if (item_count == 0)
4638               {
4639               options = (options | set) & (~unset);
4640               set = unset = 0;     /* To save length */
4641               item_count--;        /* To allow for several */
4642               }
4643
4644             /* Fall through */
4645
4646             /* A termination by ':' indicates the start of a nested group with
4647             the given options set. This is again handled at compile time, but
4648             we must allow for compiled space if any of the ims options are
4649             set. We also have to allow for resetting space at the end of
4650             the group, which is why 4 is added to the length and not just 2.
4651             If there are several changes of options within the same group, this
4652             will lead to an over-estimate on the length, but this shouldn't
4653             matter very much. We also have to allow for resetting options at
4654             the start of any alternations, which we do by setting
4655             branch_newextra to 2. Finally, we record whether the case-dependent
4656             flag ever changes within the regex. This is used by the "required
4657             character" code. */
4658
4659             case ':':
4660             if (((set|unset) & PCRE_IMS) != 0)
4661               {
4662               length += 4;
4663               branch_newextra = 2;
4664               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4665               }
4666             goto END_OPTIONS;
4667
4668             /* Unrecognized option character */
4669
4670             default:
4671             errorcode = ERR12;
4672             goto PCRE_ERROR_RETURN;
4673             }
4674           }
4675
4676         /* If we hit a closing bracket, that's it - this is a freestanding
4677         option-setting. We need to ensure that branch_extra is updated if
4678         necessary. The only values branch_newextra can have here are 0 or 2.
4679         If the value is 2, then branch_extra must either be 2 or 3+LINK_SIZE, depending
4680         on whether this is a lookbehind group or not. */
4681
4682         END_OPTIONS:
4683         if (c == ')')
4684           {
4685           if (branch_newextra == 2 &&
4686               (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4687             branch_extra += branch_newextra;
4688           continue;
4689           }
4690
4691         /* If options were terminated by ':' control comes here. Fall through
4692         to handle the group below. */
4693         }
4694       }
4695
4696     /* Extracting brackets must be counted so we can process escapes in a
4697     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4698     need an additional 3 bytes of store per extracting bracket. However, if
4699     PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4700     must leave the count alone (it will always be zero). */
4701
4702     else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4703       {
4704       bracount++;
4705       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4706       }
4707
4708     /* Save length for computing whole length at end if there's a repeat that
4709     requires duplication of the group. Also save the current value of
4710     branch_extra, and start the new group with the new value. If non-zero, this
4711     will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
4712
4713     if (brastackptr >= sizeof(brastack)/sizeof(int))
4714       {
4715       errorcode = ERR19;
4716       goto PCRE_ERROR_RETURN;
4717       }
4718
4719     bralenstack[brastackptr] = branch_extra;
4720     branch_extra = branch_newextra;
4721
4722     brastack[brastackptr++] = length;
4723     length += bracket_length;
4724     continue;
4725
4726     /* Handle ket. Look for subsequent max/min; for certain sets of values we
4727     have to replicate this bracket up to that many times. If brastackptr is
4728     0 this is an unmatched bracket which will generate an error, but take care
4729     not to try to access brastack[-1] when computing the length and restoring
4730     the branch_extra value. */
4731
4732     case ')':
4733     length += 1 + LINK_SIZE;
4734     if (brastackptr > 0)
4735       {
4736       duplength = length - brastack[--brastackptr];
4737       branch_extra = bralenstack[brastackptr];
4738       }
4739     else duplength = 0;
4740
4741     /* The following code is also used when a recursion such as (?3) is
4742     followed by a quantifier, because in that case, it has to be wrapped inside
4743     brackets so that the quantifier works. The value of duplength must be
4744     set before arrival. */
4745
4746     HANDLE_QUANTIFIED_BRACKETS:
4747
4748     /* Leave ptr at the final char; for read_repeat_counts this happens
4749     automatically; for the others we need an increment. */
4750
4751     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4752       {
4753       ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4754       if (errorcode != 0) goto PCRE_ERROR_RETURN;
4755       }
4756     else if (c == '*') { min = 0; max = -1; ptr++; }
4757     else if (c == '+') { min = 1; max = -1; ptr++; }
4758     else if (c == '?') { min = 0; max = 1;  ptr++; }
4759     else { min = 1; max = 1; }
4760
4761     /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4762     group, and if the maximum is greater than zero, we have to replicate
4763     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4764     bracket set. */
4765
4766     if (min == 0)
4767       {
4768       length++;
4769       if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4770       }
4771
4772     /* When the minimum is greater than zero, we have to replicate up to
4773     minval-1 times, with no additions required in the copies. Then, if there
4774     is a limited maximum we have to replicate up to maxval-1 times allowing
4775     for a BRAZERO item before each optional copy and nesting brackets for all
4776     but one of the optional copies. */
4777
4778     else
4779       {
4780       length += (min - 1) * duplength;
4781       if (max > min)   /* Need this test as max=-1 means no limit */
4782         length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4783           - (2 + 2*LINK_SIZE);
4784       }
4785
4786     /* Allow space for once brackets for "possessive quantifier" */
4787
4788     if (ptr[1] == '+')
4789       {
4790       ptr++;
4791       length += 2 + 2*LINK_SIZE;
4792       }
4793     continue;
4794
4795     /* Non-special character. It won't be space or # in extended mode, so it is
4796     always a genuine character. If we are in a \Q...\E sequence, check for the
4797     end; if not, we have a literal. */
4798
4799     default:
4800     NORMAL_CHAR:
4801
4802     if (inescq && c == '\\' && ptr[1] == 'E')
4803       {
4804       inescq = FALSE;
4805       ptr++;
4806       continue;
4807       }
4808
4809     length += 2;          /* For a one-byte character */
4810     lastitemlength = 1;   /* Default length of last item for repeats */
4811
4812     /* In UTF-8 mode, check for additional bytes. */
4813
4814 #ifdef SUPPORT_UTF8
4815     if (utf8 && (c & 0xc0) == 0xc0)
4816       {
4817       while ((ptr[1] & 0xc0) == 0x80)         /* Can't flow over the end */
4818         {                                     /* because the end is marked */
4819         lastitemlength++;                     /* by a zero byte. */
4820         length++;
4821         ptr++;
4822         }
4823       }
4824 #endif
4825
4826     continue;
4827     }
4828   }
4829
4830 length += 2 + LINK_SIZE;    /* For final KET and END */
4831
4832 if ((options & PCRE_AUTO_CALLOUT) != 0)
4833   length += 2 + 2*LINK_SIZE;  /* For final callout */
4834
4835 if (length > MAX_PATTERN_SIZE)
4836   {
4837   errorcode = ERR20;
4838   goto PCRE_EARLY_ERROR_RETURN;
4839   }
4840
4841 /* Compute the size of data block needed and get it, either from malloc or
4842 externally provided function. */
4843
4844 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4845 re = (real_pcre *)(pcre_malloc)(size);
4846
4847 if (re == NULL)
4848   {
4849   errorcode = ERR21;
4850   goto PCRE_EARLY_ERROR_RETURN;
4851   }
4852
4853 /* Put in the magic number, and save the sizes, options, and character table
4854 pointer. NULL is used for the default character tables. The nullpad field is at
4855 the end; it's there to help in the case when a regex compiled on a system with
4856 4-byte pointers is run on another with 8-byte pointers. */
4857
4858 re->magic_number = MAGIC_NUMBER;
4859 re->size = size;
4860 re->options = options;
4861 re->dummy1 = 0;
4862 re->name_table_offset = sizeof(real_pcre);
4863 re->name_entry_size = max_name_size + 3;
4864 re->name_count = name_count;
4865 re->ref_count = 0;
4866 re->tables = (tables == _pcre_default_tables)? NULL : tables;
4867 re->nullpad = NULL;
4868
4869 /* The starting points of the name/number translation table and of the code are
4870 passed around in the compile data block. */
4871
4872 compile_block.names_found = 0;
4873 compile_block.name_entry_size = max_name_size + 3;
4874 compile_block.name_table = (uschar *)re + re->name_table_offset;
4875 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4876 compile_block.start_code = codestart;
4877 compile_block.start_pattern = (const uschar *)pattern;
4878 compile_block.req_varyopt = 0;
4879 compile_block.nopartial = FALSE;
4880
4881 /* Set up a starting, non-extracting bracket, then compile the expression. On
4882 error, errorcode will be set non-zero, so we don't need to look at the result
4883 of the function here. */
4884
4885 ptr = (const uschar *)pattern;
4886 code = (uschar *)codestart;
4887 *code = OP_BRA;
4888 bracount = 0;
4889 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4890   &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4891 re->top_bracket = bracount;
4892 re->top_backref = compile_block.top_backref;
4893
4894 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4895
4896 /* If not reached end of pattern on success, there's an excess bracket. */
4897
4898 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4899
4900 /* Fill in the terminating state and check for disastrous overflow, but
4901 if debugging, leave the test till after things are printed out. */
4902
4903 *code++ = OP_END;
4904
4905 #ifndef DEBUG
4906 if (code - codestart > length) errorcode = ERR23;
4907 #endif
4908
4909 /* Give an error if there's a back reference to a non-existent capturing
4910 subpattern. */
4911
4912 if (re->top_backref > re->top_bracket) errorcode = ERR15;
4913
4914 /* Failed to compile, or error while post-processing */
4915
4916 if (errorcode != 0)
4917   {
4918   (pcre_free)(re);
4919   PCRE_ERROR_RETURN:
4920   *erroroffset = ptr - (const uschar *)pattern;
4921   PCRE_EARLY_ERROR_RETURN:
4922   *errorptr = error_texts[errorcode];
4923   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
4924   return NULL;
4925   }
4926
4927 /* If the anchored option was not passed, set the flag if we can determine that
4928 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4929 as starting with .* when DOTALL is set).
4930
4931 Otherwise, if we know what the first character has to be, save it, because that
4932 speeds up unanchored matches no end. If not, see if we can set the
4933 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4934 start with ^, and also when all branches start with .* for non-DOTALL matches.
4935 */
4936
4937 if ((options & PCRE_ANCHORED) == 0)
4938   {
4939   int temp_options = options;
4940   if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4941     re->options |= PCRE_ANCHORED;
4942   else
4943     {
4944     if (firstbyte < 0)
4945       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4946     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
4947       {
4948       int ch = firstbyte & 255;
4949       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4950          compile_block.fcc[ch] == ch)? ch : firstbyte;
4951       re->options |= PCRE_FIRSTSET;
4952       }
4953     else if (is_startline(codestart, 0, compile_block.backref_map))
4954       re->options |= PCRE_STARTLINE;
4955     }
4956   }
4957
4958 /* For an anchored pattern, we use the "required byte" only if it follows a
4959 variable length item in the regex. Remove the caseless flag for non-caseable
4960 bytes. */
4961
4962 if (reqbyte >= 0 &&
4963      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4964   {
4965   int ch = reqbyte & 255;
4966   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4967     compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4968   re->options |= PCRE_REQCHSET;
4969   }
4970
4971 /* Print out the compiled data for debugging */
4972
4973 #ifdef DEBUG
4974
4975 printf("Length = %d top_bracket = %d top_backref = %d\n",
4976   length, re->top_bracket, re->top_backref);
4977
4978 if (re->options != 0)
4979   {
4980   printf("%s%s%s%s%s%s%s%s%s%s\n",
4981     ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
4982     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4983     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4984     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4985     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4986     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4987     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4988     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4989     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4990     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4991   }
4992
4993 if ((re->options & PCRE_FIRSTSET) != 0)
4994   {
4995   int ch = re->first_byte & 255;
4996   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4997   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4998     else printf("First char = \\x%02x%s\n", ch, caseless);
4999   }
5000
5001 if ((re->options & PCRE_REQCHSET) != 0)
5002   {
5003   int ch = re->req_byte & 255;
5004   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5005   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5006     else printf("Req char = \\x%02x%s\n", ch, caseless);
5007   }
5008
5009 _pcre_printint(re, stdout);
5010
5011 /* This check is done here in the debugging case so that the code that
5012 was compiled can be seen. */
5013
5014 if (code - codestart > length)
5015   {
5016   (pcre_free)(re);
5017   *errorptr = error_texts[ERR23];
5018   *erroroffset = ptr - (uschar *)pattern;
5019   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5020   return NULL;
5021   }
5022 #endif
5023
5024 return (pcre *)re;
5025 }
5026
5027 /* End of pcre_compile.c */