src/src/pcre/pcre.c

   1 /* $Cambridge: exim/src/src/pcre/pcre.c,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */
   2
   3 /*************************************************
   4 *      Perl-Compatible Regular Expressions       *
   5 *************************************************/
   6
   7 /*
   8 This is a library of functions to support regular expressions whose syntax
   9 and semantics are as close as possible to those of the Perl 5 language. See
  10 the file Tech.Notes for some information on the internals.
  11
  12 Written by: Philip Hazel <ph10@cam.ac.uk>
  13
  14            Copyright (c) 1997-2004 University of Cambridge
  15
  16 -----------------------------------------------------------------------------
  17 Redistribution and use in source and binary forms, with or without
  18 modification, are permitted provided that the following conditions are met:
  19
  20     * Redistributions of source code must retain the above copyright notice,
  21       this list of conditions and the following disclaimer.
  22
  23     * Redistributions in binary form must reproduce the above copyright
  24       notice, this list of conditions and the following disclaimer in the
  25       documentation and/or other materials provided with the distribution.
  26
  27     * Neither the name of the University of Cambridge nor the names of its
  28       contributors may be used to endorse or promote products derived from
  29       this software without specific prior written permission.
  30
  31 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  32 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  33 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  34 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  35 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  36 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  37 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  38 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  39 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  40 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  41 POSSIBILITY OF SUCH DAMAGE.
  42 -----------------------------------------------------------------------------
  43 */
  44
  45
  46 /* Define DEBUG to get debugging output on stdout. */
  47 /* #define DEBUG */
  48
  49 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
  50 inline, and there are *still* stupid compilers about that don't like indented
  51 pre-processor statements. I suppose it's only been 10 years... */
  52
  53 #ifdef DEBUG
  54 #define DPRINTF(p) printf p
  55 #else
  56 #define DPRINTF(p) /*nothing*/
  57 #endif
  58
  59 /* Include the internals header, which itself includes "config.h", the Standard
  60 C headers, and the external pcre header. */
  61
  62 #include "internal.h"
  63
  64 /* If Unicode Property support is wanted, include a private copy of the
  65 function that does it, and the table that translates names to numbers. */
  66
  67 #ifdef SUPPORT_UCP
  68 #include "ucp.c"
  69 #include "ucptypetable.c"
  70 #endif
  71
  72 /* Maximum number of items on the nested bracket stacks at compile time. This
  73 applies to the nesting of all kinds of parentheses. It does not limit
  74 un-nested, non-capturing parentheses. This number can be made bigger if
  75 necessary - it is used to dimension one int and one unsigned char vector at
  76 compile time. */
  77
  78 #define BRASTACK_SIZE 200
  79
  80
  81 /* Maximum number of ints of offset to save on the stack for recursive calls.
  82 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
  83 because the offset vector is always a multiple of 3 long. */
  84
  85 #define REC_STACK_SAVE_MAX 30
  86
  87
  88 /* The maximum remaining length of subject we are prepared to search for a
  89 req_byte match. */
  90
  91 #define REQ_BYTE_MAX 1000
  92
  93
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
the definition is next to the definition of the opcodes in internal.h. The
whole initializer comes from the OP_LENGTHS macro; nothing is listed here. */

static const uschar OP_lengths[] = { OP_LENGTHS };

/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry i of rep_min belongs with entry i of
rep_max, giving six (min,max) pairs: (0,inf) (0,inf) (1,inf) (1,inf) (0,1)
(0,1).
NOTE(review): presumably the pairs are the greedy and minimizing forms of
"*", "+" and "?" in opcode order - confirm against the users of these tables. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
 103
/* Table for handling escaped characters. Positive entries are simple data
values; negative entries are for special things like \d and so on (the negated
ESC_xxx codes). Zero means further processing is needed (for things like \x),
or the escape is invalid.

Exactly one of the two tables below is compiled, selected by the EBCDIC macro.
The ASCII table has 75 entries covering the characters '0' to 'z'. The EBCDIC
table has 184 entries; its row labels show that it starts at code 0x48 and
runs to 0xFF. The entry order is significant: each position corresponds to one
character code, so entries must never be inserted, removed or reordered. */

#if !EBCDIC   /* This is the "normal" table for ASCII systems */
static const short int escapes[] = {
     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
-ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
-ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
     0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
-ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
     0,      0, -ESC_z                                            /* x - z */
};

#else         /* This is the "abnormal" table for EBCDIC systems */
static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
/*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
#endif
 150
 151
/* Tables of names of POSIX character classes and their lengths. The two tables
are parallel: posix_name_lengths[i] is the length of posix_names[i]. Only the
list of lengths is terminated by a zero entry; the list of names has no
terminator. The first three names must be alpha, lower, upper (in the order
used below and in posix_class_maps), as this is assumed for handling case
independence. */

static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };

static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 163
/* Table of class bit maps for each POSIX class; up to three may be combined
to form the class. The table for [:blank:] is dynamically modified to remove
the vertical space characters.

There is one row of three ints per class, in the same order as posix_names, so
the row for class number n starts at index 3*n. Each entry is one of the
cbit_xxx values, or -1 to fill an unused slot. */

static const int posix_class_maps[] = {
  cbit_lower, cbit_upper, -1,             /* alpha */
  cbit_lower, -1,         -1,             /* lower */
  cbit_upper, -1,         -1,             /* upper */
  cbit_digit, cbit_lower, cbit_upper,     /* alnum */
  cbit_print, cbit_cntrl, -1,             /* ascii */
  cbit_space, -1,         -1,             /* blank - a GNU extension */
  cbit_cntrl, -1,         -1,             /* cntrl */
  cbit_digit, -1,         -1,             /* digit */
  cbit_graph, -1,         -1,             /* graph */
  cbit_print, -1,         -1,             /* print */
  cbit_punct, -1,         -1,             /* punct */
  cbit_space, -1,         -1,             /* space */
  cbit_word,  -1,         -1,             /* word - a Perl extension */
  cbit_xdigit,-1,         -1              /* xdigit */
};
 184
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code.

The table is indexed directly by character code and has 256 entries. The
decimal digits carry both bits (0x0c); the letters a-f and A-F carry only the
hexadecimal bit (0x08). Exactly one variant is compiled, selected by the
EBCDIC macro. */

#if !EBCDIC    /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else          /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- ¬     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

/* A second 256-entry table that exists only in the EBCDIC build. The inline
note describes it as a partial duplicate of a chartables table.
NOTE(review): the meaning of the individual bits (0x01, 0x02, 0x08, 0x10,
0x80) is not defined in this part of the file - presumably they are the
ctype_xxx bits from internal.h; confirm before relying on them. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- ¬  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif
 307
 308
/* Definition to allow mutual recursion. This is only a forward declaration of
compile_regex(); the parameters are unnamed here, so see the function's own
definition for their meaning. */

static BOOL
  compile_regex(int, int, int *, uschar **, const uschar **, const char **,
    BOOL, int, int *, int *, branch_chain *, compile_data *);

/* Structure for building a chain of data that actually lives on the
stack, for holding the values of the subject pointer at the start of each
subpattern, so as to detect when an empty string has been matched by a
subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
are on the heap, not on the stack. */

typedef struct eptrblock {
  struct eptrblock *epb_prev;          /* Link to the previous block in the chain */
  const uschar *epb_saved_eptr;        /* Subject pointer saved for this block */
} eptrblock;

/* Flag bits for the match() function. They are distinct bits, so they can be
combined in a single flags value. */

#define match_condassert   0x01    /* Called to check a condition assertion */
#define match_isgroup      0x02    /* Set if start of bracketed group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative, so they cannot be
confused with the two values below. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0
 336
 337
 338
/*************************************************
*               Global variables                 *
*************************************************/

/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the four
indirections below, and it can optionally do callouts via a fifth. These values
can be changed by the caller, but are shared between all threads. However, when
compiling for Virtual Pascal, things are done differently (see pcre.in).

Defaults, as set by the initializers below: both allocators point to malloc(),
both release functions point to free(), and pcre_callout is NULL. None of these
definitions is compiled when VPCOMPAT is defined. The C++ variants are the same
definitions with C linkage. */

#ifndef VPCOMPAT
#ifdef __cplusplus
extern "C" void *(*pcre_malloc)(size_t) = malloc;
extern "C" void  (*pcre_free)(void *) = free;
extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
extern "C" void  (*pcre_stack_free)(void *) = free;
extern "C" int   (*pcre_callout)(pcre_callout_block *) = NULL;
#else
void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
void *(*pcre_stack_malloc)(size_t) = malloc;
void  (*pcre_stack_free)(void *) = free;
int   (*pcre_callout)(pcre_callout_block *) = NULL;
#endif
#endif
 364
 365
/*************************************************
*    Macros and tables for character handling    *
*************************************************/

/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters.

Points to watch when using any of these macros:

  . The UTF-8 forms expand to an assignment followed by a separate "if"
    statement, with no enclosing braces. They must therefore not be used as
    the unbraced body of an if, else, or loop.
  . The arguments are not parenthesized and are used more than once, so they
    should be plain variable names without side effects.
  . The byte-mode forms already end in a semicolon (BACKCHAR expands to
    nothing at all in byte mode).
  . The UTF-8 forms rely on utf8_table3 and utf8_table4, and do no checking of
    the bytes that follow the first one.

In byte mode (SUPPORT_UTF8 not defined) each macro just fetches one byte, and
GETCHARLEN leaves its len argument unchanged. */

#ifndef SUPPORT_UTF8
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. A first byte with its top two bits set introduces
a multibyte character: the number of following bytes comes from utf8_table4,
the data bits of the first byte are selected by utf8_table3, and each following
byte contributes its low six bits. */

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. On exit, eptr points just past the last byte of the
character. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
The test uses md->utf8, so a variable called md with a utf8 field must be in
scope wherever this macro is used. */

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode.
The number of additional bytes is added to len; len is not initialized here, so
the caller must set it (normally to 1) beforehand. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    len += gcaa; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Called only in UTF-8 mode. Bytes of the form 10xxxxxx are continuation
bytes; the loop steps back over them and does not check for the start of the
buffer. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif
 459
 460
 461
 462 /*************************************************
 463 *             Default character tables           *
 464 *************************************************/
 465
 466 /* A default set of character tables is included in the PCRE binary. Its source
 467 is built by the maketables auxiliary program, which uses the default C ctypes
 468 functions, and put in the file chartables.c. These tables are used by PCRE
 469 whenever the caller of pcre_compile() does not provide an alternate set of
 470 tables. */
 471
 472 #include "chartables.c"
 473
 474
 475
 476 #ifdef SUPPORT_UTF8
 477 /*************************************************
 478 *           Tables for UTF-8 support             *
 479 *************************************************/
 480
/* These are the breakpoints for different numbers of bytes in a UTF-8
character. Entry i is the largest character value that can be encoded in i+1
bytes. */

static const int utf8_table1[] =
  { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes.
utf8_table2 holds the indicator bits that are ORed into the first byte when
encoding; utf8_table3 holds the mask that extracts the data bits from the
first byte when decoding. */

static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* Table of the number of extra bytes, indexed by the first byte of the
character masked with 0x3f. The table has 64 entries, so every masked value is
a valid index. The highest number for a valid UTF-8 character is in fact
0x3d. */

static const uschar utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
 502
 503
 504 /*************************************************
 505 *       Convert character value to UTF-8         *
 506 *************************************************/
 507
 508 /* This function takes an integer value in the range 0 - 0x7fffffff
 509 and encodes it as a UTF-8 character in 0 to 6 bytes.
 510
 511 Arguments:
 512   cvalue     the character value
 513   buffer     pointer to buffer for result - at least 6 bytes long
 514
 515 Returns:     number of characters placed in the buffer
 516 */
 517
 518 static int
 519 ord2utf8(int cvalue, uschar *buffer)
 520 {
 521 register int i, j;
 522 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
 523   if (cvalue <= utf8_table1[i]) break;
 524 buffer += i;
 525 for (j = i; j > 0; j--)
 526  {
 527  *buffer-- = 0x80 | (cvalue & 0x3f);
 528  cvalue >>= 6;
 529  }
 530 *buffer = utf8_table2[i] | cvalue;
 531 return i + 1;
 532 }
 533 #endif
 534
 535
 536
 537 /*************************************************
 538 *         Print compiled regex                   *
 539 *************************************************/
 540
 541 /* The code for doing this is held in a separate file that is also included in
 542 pcretest.c. It defines a function called print_internals(). */
 543
 544 #ifdef DEBUG
 545 #include "printint.c"
 546 #endif
 547
 548
 549
 550 /*************************************************
 551 *          Return version string                 *
 552 *************************************************/
 553
 554 #define STRING(a)  # a
 555 #define XSTRING(s) STRING(s)
 556
 557 EXPORT const char *
 558 pcre_version(void)
 559 {
 560 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
 561 }
 562
 563
 564
 565
 566 /*************************************************
 567 *         Flip bytes in an integer               *
 568 *************************************************/
 569
 570 /* This function is called when the magic number in a regex doesn't match in
 571 order to flip its bytes to see if we are dealing with a pattern that was
 572 compiled on a host of different endianness. If so, this function is used to
 573 flip other byte values.
 574
 575 Arguments:
 576   value        the number to flip
 577   n            the number of bytes to flip (assumed to be 2 or 4)
 578
 579 Returns:       the flipped value
 580 */
 581
 582 static long int
 583 byteflip(long int value, int n)
 584 {
 585 if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
 586 return ((value & 0x000000ff) << 24) |
 587        ((value & 0x0000ff00) <<  8) |
 588        ((value & 0x00ff0000) >>  8) |
 589        ((value & 0xff000000) >> 24);
 590 }
 591
 592
 593
 594 /*************************************************
 595 *       Test for a byte-flipped compiled regex   *
 596 *************************************************/
 597
 598 /* This function is called from pce_exec() and also from pcre_fullinfo(). Its
 599 job is to test whether the regex is byte-flipped - that is, it was compiled on
 600 a system of opposite endianness. The function is called only when the native
 601 MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
 602 relevant values into a different data block, and return it.
 603
 604 Arguments:
 605   re               points to the regex
 606   study            points to study data, or NULL
 607   internal_re      points to a new regex block
 608   internal_study   points to a new study block
 609
 610 Returns:           the new block if is is indeed a byte-flipped regex
 611                    NULL if it is not
 612 */
 613
 614 static real_pcre *
 615 try_flipped(const real_pcre *re, real_pcre *internal_re,
 616   const pcre_study_data *study, pcre_study_data *internal_study)
 617 {
 618 if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
 619   return NULL;
 620
 621 *internal_re = *re;           /* To copy other fields */
 622 internal_re->size = byteflip(re->size, sizeof(re->size));
 623 internal_re->options = byteflip(re->options, sizeof(re->options));
 624 internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
 625 internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
 626 internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
 627 internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
 628 internal_re->name_table_offset = byteflip(re->name_table_offset,
 629   sizeof(re->name_table_offset));
 630 internal_re->name_entry_size = byteflip(re->name_entry_size,
 631   sizeof(re->name_entry_size));
 632 internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
 633
 634 if (study != NULL)
 635   {
 636   *internal_study = *study;   /* To copy other fields */
 637   internal_study->size = byteflip(study->size, sizeof(study->size));
 638   internal_study->options = byteflip(study->options, sizeof(study->options));
 639   }
 640
 641 return internal_re;
 642 }
 643
 644
 645
 646 /*************************************************
 647 * (Obsolete) Return info about compiled pattern  *
 648 *************************************************/
 649
 650 /* This is the original "info" function. It picks potentially useful data out
 651 of the private structure, but its interface was too rigid. It remains for
 652 backwards compatibility. The public options are passed back in an int - though
 653 the re->options field has been expanded to a long int, all the public options
 654 at the low end of it, and so even on 16-bit systems this will still be OK.
 655 Therefore, I haven't changed the API for pcre_info().
 656
 657 Arguments:
 658   argument_re   points to compiled code
 659   optptr        where to pass back the options
 660   first_byte    where to pass back the first character,
 661                 or -1 if multiline and all branches start ^,
 662                 or -2 otherwise
 663
 664 Returns:        number of capturing subpatterns
 665                 or negative values on error
 666 */
 667
 668 EXPORT int
 669 pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
 670 {
 671 real_pcre internal_re;
 672 const real_pcre *re = (const real_pcre *)argument_re;
 673 if (re == NULL) return PCRE_ERROR_NULL;
 674 if (re->magic_number != MAGIC_NUMBER)
 675   {
 676   re = try_flipped(re, &internal_re, NULL, NULL);
 677   if (re == NULL) return PCRE_ERROR_BADMAGIC;
 678   }
 679 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
 680 if (first_byte != NULL)
 681   *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
 682      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 683 return re->top_bracket;
 684 }
 685
 686
 687
 688 /*************************************************
 689 *        Return info about compiled pattern      *
 690 *************************************************/
 691
 692 /* This is a newer "info" function which has an extensible interface so
 693 that additional items can be added compatibly.
 694
 695 Arguments:
 696   argument_re      points to compiled code
 697   extra_data       points extra data, or NULL
 698   what             what information is required
 699   where            where to put the information
 700
 701 Returns:           0 if data returned, negative on error
 702 */
 703
 704 EXPORT int
 705 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
 706   void *where)
 707 {
 708 real_pcre internal_re;
 709 pcre_study_data internal_study;
 710 const real_pcre *re = (const real_pcre *)argument_re;
 711 const pcre_study_data *study = NULL;
 712
 713 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
 714
 715 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
 716   study = (const pcre_study_data *)extra_data->study_data;
 717
 718 if (re->magic_number != MAGIC_NUMBER)
 719   {
 720   re = try_flipped(re, &internal_re, study, &internal_study);
 721   if (re == NULL) return PCRE_ERROR_BADMAGIC;
 722   if (study != NULL) study = &internal_study;
 723   }
 724
 725 switch (what)
 726   {
 727   case PCRE_INFO_OPTIONS:
 728   *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
 729   break;
 730
 731   case PCRE_INFO_SIZE:
 732   *((size_t *)where) = re->size;
 733   break;
 734
 735   case PCRE_INFO_STUDYSIZE:
 736   *((size_t *)where) = (study == NULL)? 0 : study->size;
 737   break;
 738
 739   case PCRE_INFO_CAPTURECOUNT:
 740   *((int *)where) = re->top_bracket;
 741   break;
 742
 743   case PCRE_INFO_BACKREFMAX:
 744   *((int *)where) = re->top_backref;
 745   break;
 746
 747   case PCRE_INFO_FIRSTBYTE:
 748   *((int *)where) =
 749     ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
 750     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 751   break;
 752
 753   /* Make sure we pass back the pointer to the bit vector in the external
 754   block, not the internal copy (with flipped integer fields). */
 755
 756   case PCRE_INFO_FIRSTTABLE:
 757   *((const uschar **)where) =
 758     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
 759       ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
 760   break;
 761
 762   case PCRE_INFO_LASTLITERAL:
 763   *((int *)where) =
 764     ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
 765   break;
 766
 767   case PCRE_INFO_NAMEENTRYSIZE:
 768   *((int *)where) = re->name_entry_size;
 769   break;
 770
 771   case PCRE_INFO_NAMECOUNT:
 772   *((int *)where) = re->name_count;
 773   break;
 774
 775   case PCRE_INFO_NAMETABLE:
 776   *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
 777   break;
 778
 779   case PCRE_INFO_DEFAULT_TABLES:
 780   *((const uschar **)where) = (const uschar *)pcre_default_tables;
 781   break;
 782
 783   default: return PCRE_ERROR_BADOPTION;
 784   }
 785
 786 return 0;
 787 }
 788
 789
 790
 791 /*************************************************
 792 * Return info about what features are configured *
 793 *************************************************/
 794
 795 /* This is function which has an extensible interface so that additional items
 796 can be added compatibly.
 797
 798 Arguments:
 799   what             what information is required
 800   where            where to put the information
 801
 802 Returns:           0 if data returned, negative on error
 803 */
 804
 805 EXPORT int
 806 pcre_config(int what, void *where)
 807 {
 808 switch (what)
 809   {
 810   case PCRE_CONFIG_UTF8:
 811 #ifdef SUPPORT_UTF8
 812   *((int *)where) = 1;
 813 #else
 814   *((int *)where) = 0;
 815 #endif
 816   break;
 817
 818   case PCRE_CONFIG_UNICODE_PROPERTIES:
 819 #ifdef SUPPORT_UCP
 820   *((int *)where) = 1;
 821 #else
 822   *((int *)where) = 0;
 823 #endif
 824   break;
 825
 826   case PCRE_CONFIG_NEWLINE:
 827   *((int *)where) = NEWLINE;
 828   break;
 829
 830   case PCRE_CONFIG_LINK_SIZE:
 831   *((int *)where) = LINK_SIZE;
 832   break;
 833
 834   case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
 835   *((int *)where) = POSIX_MALLOC_THRESHOLD;
 836   break;
 837
 838   case PCRE_CONFIG_MATCH_LIMIT:
 839   *((unsigned int *)where) = MATCH_LIMIT;
 840   break;
 841
 842   case PCRE_CONFIG_STACKRECURSE:
 843 #ifdef NO_RECURSE
 844   *((int *)where) = 0;
 845 #else
 846   *((int *)where) = 1;
 847 #endif
 848   break;
 849
 850   default: return PCRE_ERROR_BADOPTION;
 851   }
 852
 853 return 0;
 854 }
 855
 856
 857
 858 #ifdef DEBUG
 859 /*************************************************
 860 *        Debugging function to print chars       *
 861 *************************************************/
 862
 863 /* Print a sequence of chars in printable format, stopping at the end of the
 864 subject if the requested.
 865
 866 Arguments:
 867   p           points to characters
 868   length      number to print
 869   is_subject  TRUE if printing from within md->start_subject
 870   md          pointer to matching data block, if is_subject is TRUE
 871
 872 Returns:     nothing
 873 */
 874
 875 static void
 876 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 877 {
 878 int c;
 879 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 880 while (length-- > 0)
 881   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
 882 }
 883 #endif
 884
 885
 886
 887
 888 /*************************************************
 889 *            Handle escapes                      *
 890 *************************************************/
 891
 892 /* This function is called when a \ has been encountered. It either returns a
 893 positive value for a simple escape such as \n, or a negative value which
 894 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
 895 a positive value greater than 255 may be returned. On entry, ptr is pointing at
 896 the \. On exit, it is on the final character of the escape sequence.
 897
 898 Arguments:
 899   ptrptr     points to the pattern position pointer
 900   errorptr   points to the pointer to the error message
 901   bracount   number of previous extracting brackets
 902   options    the options bits
 903   isclass    TRUE if inside a character class
 904
 905 Returns:     zero or positive => a data character
 906              negative => a special escape sequence
 907              on error, errorptr is set
 908 */
 909
 910 static int
 911 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 912   int options, BOOL isclass)
 913 {
 914 const uschar *ptr = *ptrptr;
 915 int c, i;
 916
 917 /* If backslash is at the end of the pattern, it's an error. */
 918
 919 c = *(++ptr);
 920 if (c == 0) *errorptr = ERR1;
 921
 922 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
 923 a table. A non-zero result is something that can be returned immediately.
 924 Otherwise further processing may be required. */
 925
 926 #if !EBCDIC    /* ASCII coding */
 927 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
 928 else if ((i = escapes[c - '0']) != 0) c = i;
 929
 930 #else          /* EBCDIC coding */
 931 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
 932 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 933 #endif
 934
 935 /* Escapes that need further processing, or are illegal. */
 936
 937 else
 938   {
 939   const uschar *oldptr;
 940   switch (c)
 941     {
 942     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 943     error. */
 944
 945     case 'l':
 946     case 'L':
 947     case 'N':
 948     case 'u':
 949     case 'U':
 950     *errorptr = ERR37;
 951     break;
 952
 953     /* The handling of escape sequences consisting of a string of digits
 954     starting with one that is not zero is not straightforward. By experiment,
 955     the way Perl works seems to be as follows:
 956
 957     Outside a character class, the digits are read as a decimal number. If the
 958     number is less than 10, or if there are that many previous extracting
 959     left brackets, then it is a back reference. Otherwise, up to three octal
 960     digits are read to form an escaped byte. Thus \123 is likely to be octal
 961     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 962     value is greater than 377, the least significant 8 bits are taken. Inside a
 963     character class, \ followed by a digit is always an octal number. */
 964
 965     case '1': case '2': case '3': case '4': case '5':
 966     case '6': case '7': case '8': case '9':
 967
 968     if (!isclass)
 969       {
 970       oldptr = ptr;
 971       c -= '0';
 972       while ((digitab[ptr[1]] & ctype_digit) != 0)
 973         c = c * 10 + *(++ptr) - '0';
 974       if (c < 10 || c <= bracount)
 975         {
 976         c = -(ESC_REF + c);
 977         break;
 978         }
 979       ptr = oldptr;      /* Put the pointer back and fall through */
 980       }
 981
 982     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 983     generates a binary zero byte and treats the digit as a following literal.
 984     Thus we have to pull back the pointer by one. */
 985
 986     if ((c = *ptr) >= '8')
 987       {
 988       ptr--;
 989       c = 0;
 990       break;
 991       }
 992
 993     /* \0 always starts an octal number, but we may drop through to here with a
 994     larger first octal digit. */
 995
 996     case '0':
 997     c -= '0';
 998     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
 999         c = c * 8 + *(++ptr) - '0';
1000     c &= 255;     /* Take least significant 8 bits */
1001     break;
1002
1003     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1004     which can be greater than 0xff, but only if the ddd are hex digits. */
1005
1006     case 'x':
1007 #ifdef SUPPORT_UTF8
1008     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1009       {
1010       const uschar *pt = ptr + 2;
1011       register int count = 0;
1012       c = 0;
1013       while ((digitab[*pt] & ctype_xdigit) != 0)
1014         {
1015         int cc = *pt++;
1016         count++;
1017 #if !EBCDIC    /* ASCII coding */
1018         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
1019         c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1020 #else          /* EBCDIC coding */
1021         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
1022         c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1023 #endif
1024         }
1025       if (*pt == '}')
1026         {
1027         if (c < 0 || count > 8) *errorptr = ERR34;
1028         ptr = pt;
1029         break;
1030         }
1031       /* If the sequence of hex digits does not end with '}', then we don't
1032       recognize this construct; fall through to the normal \x handling. */
1033       }
1034 #endif
1035
1036     /* Read just a single hex char */
1037
1038     c = 0;
1039     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1040       {
1041       int cc;                               /* Some compilers don't like ++ */
1042       cc = *(++ptr);                        /* in initializers */
1043 #if !EBCDIC    /* ASCII coding */
1044       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
1045       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1046 #else          /* EBCDIC coding */
1047       if (cc <= 'z') cc += 64;              /* Convert to upper case */
1048       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1049 #endif
1050       }
1051     break;
1052
1053     /* Other special escapes not starting with a digit are straightforward */
1054
1055     case 'c':
1056     c = *(++ptr);
1057     if (c == 0)
1058       {
1059       *errorptr = ERR2;
1060       return 0;
1061       }
1062
1063     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1064     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1065     (However, an EBCDIC equivalent has now been added.) */
1066
1067 #if !EBCDIC    /* ASCII coding */
1068     if (c >= 'a' && c <= 'z') c -= 32;
1069     c ^= 0x40;
1070 #else          /* EBCDIC coding */
1071     if (c >= 'a' && c <= 'z') c += 64;
1072     c ^= 0xC0;
1073 #endif
1074     break;
1075
1076     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1077     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1078     for Perl compatibility, it is a literal. This code looks a bit odd, but
1079     there used to be some cases other than the default, and there may be again
1080     in future, so I haven't "optimized" it. */
1081
1082     default:
1083     if ((options & PCRE_EXTRA) != 0) switch(c)
1084       {
1085       default:
1086       *errorptr = ERR3;
1087       break;
1088       }
1089     break;
1090     }
1091   }
1092
1093 *ptrptr = ptr;
1094 return c;
1095 }
1096
1097
1098
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence. The property name is either a single character, or one or two
characters in braces, optionally preceded by ^ for negation; it is looked up
in the utt table.

Argument:
  ptrptr     points to the pattern position pointer
  negptr     points to a boolean that is set TRUE for negation else FALSE
  errorptr   points to the pointer to the error message

Returns:     value from ucp_type_table, or -1 for an invalid type; in the
               latter case errorptr is set to ERR46 for a malformed sequence
               or ERR47 for a name that is not in the table
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
{
const uschar *ptr = *ptrptr;
char name[4];
int c, len, lo, hi;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple form: just one following character */

if (c != '{')
  {
  name[0] = c;
  name[1] = 0;
  }

/* The braced form: a one- or two-character name, optionally preceded by ^
for negation. Up to three characters are read so that a closing brace after
a two-character name is seen. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  for (;;)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[len++] = c;
    if (len > 2) break;
    }

  /* No closing brace within three characters. Try to distinguish the error
  cases: if a brace turns up later the name is merely unknown; if the pattern
  ends first the sequence is malformed. */

  if (c != '}')
    {
    do ptr++; while (*ptr != 0 && *ptr != '}');
    if (*ptr == '}') goto UNKNOWN_RETURN;
    goto ERROR_RETURN;
    }

  name[len] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

lo = 0;
hi = sizeof(utt)/sizeof(ucp_type_table);

while (lo < hi)
  {
  int mid = (lo + hi)/2;
  int cmp = strcmp(name, utt[mid].name);
  if (cmp == 0) return utt[mid].value;
  if (cmp > 0) lo = mid + 1;
  else hi = mid;
  }

/* Not found: drop into the "unknown name" return */

UNKNOWN_RETURN:
*errorptr = ERR47;
*ptrptr = ptr;
return -1;

ERROR_RETURN:
*errorptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
1188
1189
1190
1191
1192 /*************************************************
1193 *            Check for counted repeat            *
1194 *************************************************/
1195
1196 /* This function is called when a '{' is encountered in a place where it might
1197 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1198 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1199 where the ddds are digits.
1200
1201 Arguments:
1202   p         pointer to the first char after '{'
1203
1204 Returns:    TRUE or FALSE
1205 */
1206
1207 static BOOL
1208 is_counted_repeat(const uschar *p)
1209 {
1210 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1211 while ((digitab[*p] & ctype_digit) != 0) p++;
1212 if (*p == '}') return TRUE;
1213
1214 if (*p++ != ',') return FALSE;
1215 if (*p == '}') return TRUE;
1216
1217 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1218 while ((digitab[*p] & ctype_digit) != 0) p++;
1219
1220 return (*p == '}');
1221 }
1222
1223
1224
1225 /*************************************************
1226 *         Read repeat counts                     *
1227 *************************************************/
1228
1229 /* Read an item of the form {n,m} and return the values. This is called only
1230 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1231 so the syntax is guaranteed to be correct, but we need to check the values.
1232
1233 Arguments:
1234   p          pointer to first char after '{'
1235   minp       pointer to int for min
1236   maxp       pointer to int for max
1237              returned as -1 if no max
1238   errorptr   points to pointer to error message
1239
1240 Returns:     pointer to '}' on success;
1241              current ptr on error, with errorptr set
1242 */
1243
1244 static const uschar *
1245 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1246 {
1247 int min = 0;
1248 int max = -1;
1249
1250 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1251
1252 if (*p == '}') max = min; else
1253   {
1254   if (*(++p) != '}')
1255     {
1256     max = 0;
1257     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1258     if (max < min)
1259       {
1260       *errorptr = ERR4;
1261       return p;
1262       }
1263     }
1264   }
1265
1266 /* Do paranoid checks, then fill in the required variables, and pass back the
1267 pointer to the terminating '}'. */
1268
1269 if (min > 65535 || max > 65535)
1270   *errorptr = ERR5;
1271 else
1272   {
1273   *minp = min;
1274   *maxp = max;
1275   }
1276 return p;
1277 }
1278
1279
1280
1281 /*************************************************
1282 *      Find first significant op code            *
1283 *************************************************/
1284
1285 /* This is called by several functions that scan a compiled expression looking
1286 for a fixed first character, or an anchoring op code etc. It skips over things
1287 that do not influence this. For some calls, a change of option is important.
1288 For some calls, it makes sense to skip negative forward and all backward
1289 assertions, and also the \b assertion; for others it does not.
1290
1291 Arguments:
1292   code         pointer to the start of the group
1293   options      pointer to external options
1294   optbit       the option bit whose changing is significant, or
1295                  zero if none are
1296   skipassert   TRUE if certain assertions are to be skipped
1297
1298 Returns:       pointer to the first significant opcode
1299 */
1300
1301 static const uschar*
1302 first_significant_code(const uschar *code, int *options, int optbit,
1303   BOOL skipassert)
1304 {
1305 for (;;)
1306   {
1307   switch ((int)*code)
1308     {
1309     case OP_OPT:
1310     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1311       *options = (int)code[1];
1312     code += 2;
1313     break;
1314
1315     case OP_ASSERT_NOT:
1316     case OP_ASSERTBACK:
1317     case OP_ASSERTBACK_NOT:
1318     if (!skipassert) return code;
1319     do code += GET(code, 1); while (*code == OP_ALT);
1320     code += OP_lengths[*code];
1321     break;
1322
1323     case OP_WORD_BOUNDARY:
1324     case OP_NOT_WORD_BOUNDARY:
1325     if (!skipassert) return code;
1326     /* Fall through */
1327
1328     case OP_CALLOUT:
1329     case OP_CREF:
1330     case OP_BRANUMBER:
1331     code += OP_lengths[*code];
1332     break;
1333
1334     default:
1335     return code;
1336     }
1337   }
1338 /* Control never reaches here */
1339 }
1340
1341
1342
1343
1344 /*************************************************
1345 *        Find the fixed length of a pattern      *
1346 *************************************************/
1347
1348 /* Scan a pattern and compute the fixed length of subject that will match it,
1349 if the length is fixed. This is needed for dealing with backward assertions.
1350 In UTF8 mode, the result is in characters rather than bytes.
1351
1352 Arguments:
1353   code     points to the start of the pattern (the bracket)
1354   options  the compiling options
1355
1356 Returns:   the fixed length, or -1 if there is no fixed length,
1357              or -2 if \C was encountered
1358 */
1359
1360 static int
1361 find_fixedlength(uschar *code, int options)
1362 {
1363 int length = -1;
1364
1365 register int branchlength = 0;
1366 register uschar *cc = code + 1 + LINK_SIZE;
1367
1368 /* Scan along the opcodes for this branch. If we get to the end of the
1369 branch, check the length against that of the other branches. */
1370
1371 for (;;)
1372   {
1373   int d;
1374   register int op = *cc;
1375   if (op >= OP_BRA) op = OP_BRA;
1376
1377   switch (op)
1378     {
1379     case OP_BRA:
1380     case OP_ONCE:
1381     case OP_COND:
1382     d = find_fixedlength(cc, options);
1383     if (d < 0) return d;
1384     branchlength += d;
1385     do cc += GET(cc, 1); while (*cc == OP_ALT);
1386     cc += 1 + LINK_SIZE;
1387     break;
1388
1389     /* Reached end of a branch; if it's a ket it is the end of a nested
1390     call. If it's ALT it is an alternation in a nested call. If it is
1391     END it's the end of the outer call. All can be handled by the same code. */
1392
1393     case OP_ALT:
1394     case OP_KET:
1395     case OP_KETRMAX:
1396     case OP_KETRMIN:
1397     case OP_END:
1398     if (length < 0) length = branchlength;
1399       else if (length != branchlength) return -1;
1400     if (*cc != OP_ALT) return length;
1401     cc += 1 + LINK_SIZE;
1402     branchlength = 0;
1403     break;
1404
1405     /* Skip over assertive subpatterns */
1406
1407     case OP_ASSERT:
1408     case OP_ASSERT_NOT:
1409     case OP_ASSERTBACK:
1410     case OP_ASSERTBACK_NOT:
1411     do cc += GET(cc, 1); while (*cc == OP_ALT);
1412     /* Fall through */
1413
1414     /* Skip over things that don't match chars */
1415
1416     case OP_REVERSE:
1417     case OP_BRANUMBER:
1418     case OP_CREF:
1419     case OP_OPT:
1420     case OP_CALLOUT:
1421     case OP_SOD:
1422     case OP_SOM:
1423     case OP_EOD:
1424     case OP_EODN:
1425     case OP_CIRC:
1426     case OP_DOLL:
1427     case OP_NOT_WORD_BOUNDARY:
1428     case OP_WORD_BOUNDARY:
1429     cc += OP_lengths[*cc];
1430     break;
1431
1432     /* Handle literal characters */
1433
1434     case OP_CHAR:
1435     case OP_CHARNC:
1436     branchlength++;
1437     cc += 2;
1438 #ifdef SUPPORT_UTF8
1439     if ((options & PCRE_UTF8) != 0)
1440       {
1441       while ((*cc & 0xc0) == 0x80) cc++;
1442       }
1443 #endif
1444     break;
1445
1446     /* Handle exact repetitions. The count is already in characters, but we
1447     need to skip over a multibyte character in UTF8 mode.  */
1448
1449     case OP_EXACT:
1450     branchlength += GET2(cc,1);
1451     cc += 4;
1452 #ifdef SUPPORT_UTF8
1453     if ((options & PCRE_UTF8) != 0)
1454       {
1455       while((*cc & 0x80) == 0x80) cc++;
1456       }
1457 #endif
1458     break;
1459
1460     case OP_TYPEEXACT:
1461     branchlength += GET2(cc,1);
1462     cc += 4;
1463     break;
1464
1465     /* Handle single-char matchers */
1466
1467     case OP_PROP:
1468     case OP_NOTPROP:
1469     cc++;
1470     /* Fall through */
1471
1472     case OP_NOT_DIGIT:
1473     case OP_DIGIT:
1474     case OP_NOT_WHITESPACE:
1475     case OP_WHITESPACE:
1476     case OP_NOT_WORDCHAR:
1477     case OP_WORDCHAR:
1478     case OP_ANY:
1479     branchlength++;
1480     cc++;
1481     break;
1482
1483     /* The single-byte matcher isn't allowed */
1484
1485     case OP_ANYBYTE:
1486     return -2;
1487
1488     /* Check a class for variable quantification */
1489
1490 #ifdef SUPPORT_UTF8
1491     case OP_XCLASS:
1492     cc += GET(cc, 1) - 33;
1493     /* Fall through */
1494 #endif
1495
1496     case OP_CLASS:
1497     case OP_NCLASS:
1498     cc += 33;
1499
1500     switch (*cc)
1501       {
1502       case OP_CRSTAR:
1503       case OP_CRMINSTAR:
1504       case OP_CRQUERY:
1505       case OP_CRMINQUERY:
1506       return -1;
1507
1508       case OP_CRRANGE:
1509       case OP_CRMINRANGE:
1510       if (GET2(cc,1) != GET2(cc,3)) return -1;
1511       branchlength += GET2(cc,1);
1512       cc += 5;
1513       break;
1514
1515       default:
1516       branchlength++;
1517       }
1518     break;
1519
1520     /* Anything else is variable length */
1521
1522     default:
1523     return -1;
1524     }
1525   }
1526 /* Control never gets here */
1527 }
1528
1529
1530
1531
1532 /*************************************************
1533 *    Scan compiled regex for numbered bracket    *
1534 *************************************************/
1535
1536 /* This little function scans through a compiled pattern until it finds a
1537 capturing bracket with the given number.
1538
1539 Arguments:
1540   code        points to start of expression
1541   utf8        TRUE in UTF-8 mode
1542   number      the required bracket number
1543
1544 Returns:      pointer to the opcode for the bracket, or NULL if not found
1545 */
1546
1547 static const uschar *
1548 find_bracket(const uschar *code, BOOL utf8, int number)
1549 {
1550 #ifndef SUPPORT_UTF8
1551 utf8 = utf8;               /* Stop pedantic compilers complaining */
1552 #endif
1553
1554 for (;;)
1555   {
1556   register int c = *code;
1557   if (c == OP_END) return NULL;
1558   else if (c > OP_BRA)
1559     {
1560     int n = c - OP_BRA;
1561     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1562     if (n == number) return (uschar *)code;
1563     code += OP_lengths[OP_BRA];
1564     }
1565   else
1566     {
1567     code += OP_lengths[c];
1568
1569 #ifdef SUPPORT_UTF8
1570
1571     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1572     by a multi-byte character. The length in the table is a minimum, so we have
1573     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1574     can use relatively efficient code. */
1575
1576     if (utf8) switch(c)
1577       {
1578       case OP_CHAR:
1579       case OP_CHARNC:
1580       case OP_EXACT:
1581       case OP_UPTO:
1582       case OP_MINUPTO:
1583       case OP_STAR:
1584       case OP_MINSTAR:
1585       case OP_PLUS:
1586       case OP_MINPLUS:
1587       case OP_QUERY:
1588       case OP_MINQUERY:
1589       while ((*code & 0xc0) == 0x80) code++;
1590       break;
1591
1592       /* XCLASS is used for classes that cannot be represented just by a bit
1593       map. This includes negated single high-valued characters. The length in
1594       the table is zero; the actual length is stored in the compiled code. */
1595
1596       case OP_XCLASS:
1597       code += GET(code, 1) + 1;
1598       break;
1599       }
1600 #endif
1601     }
1602   }
1603 }
1604
1605
1606
1607 /*************************************************
1608 *   Scan compiled regex for recursion reference  *
1609 *************************************************/
1610
1611 /* This little function scans through a compiled pattern until it finds an
1612 instance of OP_RECURSE.
1613
1614 Arguments:
1615   code        points to start of expression
1616   utf8        TRUE in UTF-8 mode
1617
1618 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1619 */
1620
1621 static const uschar *
1622 find_recurse(const uschar *code, BOOL utf8)
1623 {
1624 #ifndef SUPPORT_UTF8
1625 utf8 = utf8;               /* Stop pedantic compilers complaining */
1626 #endif
1627
1628 for (;;)
1629   {
1630   register int c = *code;
1631   if (c == OP_END) return NULL;
1632   else if (c == OP_RECURSE) return code;
1633   else if (c > OP_BRA)
1634     {
1635     code += OP_lengths[OP_BRA];
1636     }
1637   else
1638     {
1639     code += OP_lengths[c];
1640
1641 #ifdef SUPPORT_UTF8
1642
1643     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1644     by a multi-byte character. The length in the table is a minimum, so we have
1645     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1646     can use relatively efficient code. */
1647
1648     if (utf8) switch(c)
1649       {
1650       case OP_CHAR:
1651       case OP_CHARNC:
1652       case OP_EXACT:
1653       case OP_UPTO:
1654       case OP_MINUPTO:
1655       case OP_STAR:
1656       case OP_MINSTAR:
1657       case OP_PLUS:
1658       case OP_MINPLUS:
1659       case OP_QUERY:
1660       case OP_MINQUERY:
1661       while ((*code & 0xc0) == 0x80) code++;
1662       break;
1663
1664       /* XCLASS is used for classes that cannot be represented just by a bit
1665       map. This includes negated single high-valued characters. The length in
1666       the table is zero; the actual length is stored in the compiled code. */
1667
1668       case OP_XCLASS:
1669       code += GET(code, 1) + 1;
1670       break;
1671       }
1672 #endif
1673     }
1674   }
1675 }
1676
1677
1678
1679 /*************************************************
1680 *    Scan compiled branch for non-emptiness      *
1681 *************************************************/
1682
1683 /* This function scans through a branch of a compiled pattern to see whether it
1684 can match the empty string or not. It is called only from could_be_empty()
1685 below. Note that first_significant_code() skips over assertions. If we hit an
1686 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1687 whose current branch will already have been scanned.
1688
1689 Arguments:
1690   code        points to start of search
1691   endcode     points to where to stop
1692   utf8        TRUE if in UTF8 mode
1693
1694 Returns:      TRUE if what is matched could be empty
1695 */
1696
1697 static BOOL
1698 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1699 {
1700 register int c;
1701 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1702      code < endcode;
1703      code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1704   {
1705   const uschar *ccode;
1706
1707   c = *code;
1708
1709   if (c >= OP_BRA)
1710     {
1711     BOOL empty_branch;
1712     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1713
1714     /* Scan a closed bracket */
1715
1716     empty_branch = FALSE;
1717     do
1718       {
1719       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1720         empty_branch = TRUE;
1721       code += GET(code, 1);
1722       }
1723     while (*code == OP_ALT);
1724     if (!empty_branch) return FALSE;   /* All branches are non-empty */
1725     code += 1 + LINK_SIZE;
1726     c = *code;
1727     }
1728
1729   else switch (c)
1730     {
1731     /* Check for quantifiers after a class */
1732
1733 #ifdef SUPPORT_UTF8
1734     case OP_XCLASS:
1735     ccode = code + GET(code, 1);
1736     goto CHECK_CLASS_REPEAT;
1737 #endif
1738
1739     case OP_CLASS:
1740     case OP_NCLASS:
1741     ccode = code + 33;
1742
1743 #ifdef SUPPORT_UTF8
1744     CHECK_CLASS_REPEAT:
1745 #endif
1746
1747     switch (*ccode)
1748       {
1749       case OP_CRSTAR:            /* These could be empty; continue */
1750       case OP_CRMINSTAR:
1751       case OP_CRQUERY:
1752       case OP_CRMINQUERY:
1753       break;
1754
1755       default:                   /* Non-repeat => class must match */
1756       case OP_CRPLUS:            /* These repeats aren't empty */
1757       case OP_CRMINPLUS:
1758       return FALSE;
1759
1760       case OP_CRRANGE:
1761       case OP_CRMINRANGE:
1762       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1763       break;
1764       }
1765     break;
1766
1767     /* Opcodes that must match a character */
1768
1769     case OP_PROP:
1770     case OP_NOTPROP:
1771     case OP_EXTUNI:
1772     case OP_NOT_DIGIT:
1773     case OP_DIGIT:
1774     case OP_NOT_WHITESPACE:
1775     case OP_WHITESPACE:
1776     case OP_NOT_WORDCHAR:
1777     case OP_WORDCHAR:
1778     case OP_ANY:
1779     case OP_ANYBYTE:
1780     case OP_CHAR:
1781     case OP_CHARNC:
1782     case OP_NOT:
1783     case OP_PLUS:
1784     case OP_MINPLUS:
1785     case OP_EXACT:
1786     case OP_NOTPLUS:
1787     case OP_NOTMINPLUS:
1788     case OP_NOTEXACT:
1789     case OP_TYPEPLUS:
1790     case OP_TYPEMINPLUS:
1791     case OP_TYPEEXACT:
1792     return FALSE;
1793
1794     /* End of branch */
1795
1796     case OP_KET:
1797     case OP_KETRMAX:
1798     case OP_KETRMIN:
1799     case OP_ALT:
1800     return TRUE;
1801
1802     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
1803     followed by a multibyte character */
1804
1805 #ifdef SUPPORT_UTF8
1806     case OP_STAR:
1807     case OP_MINSTAR:
1808     case OP_QUERY:
1809     case OP_MINQUERY:
1810     case OP_UPTO:
1811     case OP_MINUPTO:
1812     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1813     break;
1814 #endif
1815     }
1816   }
1817
1818 return TRUE;
1819 }
1820
1821
1822
1823 /*************************************************
1824 *    Scan compiled regex for non-emptiness       *
1825 *************************************************/
1826
1827 /* This function is called to check for left recursive calls. We want to check
1828 the current branch of the current pattern to see if it could match the empty
1829 string. If it could, we must look outwards for branches at other levels,
1830 stopping when we pass beyond the bracket which is the subject of the recursion.
1831
1832 Arguments:
1833   code        points to start of the recursion
1834   endcode     points to where to stop (current RECURSE item)
1835   bcptr       points to the chain of current (unclosed) branch starts
1836   utf8        TRUE if in UTF-8 mode
1837
1838 Returns:      TRUE if what is matched could be empty
1839 */
1840
1841 static BOOL
1842 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1843   BOOL utf8)
1844 {
1845 while (bcptr != NULL && bcptr->current >= code)
1846   {
1847   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1848   bcptr = bcptr->outer;
1849   }
1850 return TRUE;
1851 }
1852
1853
1854
1855 /*************************************************
1856 *           Check for POSIX class syntax         *
1857 *************************************************/
1858
1859 /* This function is called when the sequence "[:" or "[." or "[=" is
1860 encountered in a character class. It checks whether this is followed by an
1861 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1862 ".]" or "=]".
1863
1864 Argument:
1865   ptr      pointer to the initial [
1866   endptr   where to return the end pointer
1867   cd       pointer to compile data
1868
1869 Returns:   TRUE or FALSE
1870 */
1871
1872 static BOOL
1873 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1874 {
1875 int terminator;          /* Don't combine these lines; the Solaris cc */
1876 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1877 if (*(++ptr) == '^') ptr++;
1878 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1879 if (*ptr == terminator && ptr[1] == ']')
1880   {
1881   *endptr = ptr;
1882   return TRUE;
1883   }
1884 return FALSE;
1885 }
1886
1887
1888
1889
1890 /*************************************************
1891 *          Check POSIX class name                *
1892 *************************************************/
1893
1894 /* This function is called to check the name given in a POSIX-style class entry
1895 such as [:alnum:].
1896
1897 Arguments:
1898   ptr        points to the first letter
1899   len        the length of the name
1900
1901 Returns:     a value representing the name, or -1 if unknown
1902 */
1903
1904 static int
1905 check_posix_name(const uschar *ptr, int len)
1906 {
1907 register int yield = 0;
1908 while (posix_name_lengths[yield] != 0)
1909   {
1910   if (len == posix_name_lengths[yield] &&
1911     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1912   yield++;
1913   }
1914 return -1;
1915 }
1916
1917
1918 /*************************************************
1919 *    Adjust OP_RECURSE items in repeated group   *
1920 *************************************************/
1921
1922 /* OP_RECURSE items contain an offset from the start of the regex to the group
1923 that is referenced. This means that groups can be replicated for fixed
1924 repetition simply by copying (because the recursion is allowed to refer to
1925 earlier groups that are outside the current group). However, when a group is
1926 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1927 it, after it has been compiled. This means that any OP_RECURSE items within it
1928 that refer to the group itself or any contained groups have to have their
1929 offsets adjusted. That is the job of this function. Before it is called, the
1930 partially compiled regex must be temporarily terminated with OP_END.
1931
1932 Arguments:
1933   group      points to the start of the group
1934   adjust     the amount by which the group is to be moved
1935   utf8       TRUE in UTF-8 mode
1936   cd         contains pointers to tables etc.
1937
1938 Returns:     nothing
1939 */
1940
1941 static void
1942 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1943 {
1944 uschar *ptr = group;
1945 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1946   {
1947   int offset = GET(ptr, 1);
1948   if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1949   ptr += 1 + LINK_SIZE;
1950   }
1951 }
1952
1953
1954
1955 /*************************************************
1956 *        Insert an automatic callout point       *
1957 *************************************************/
1958
1959 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1960 callout points before each pattern item.
1961
1962 Arguments:
1963   code           current code pointer
1964   ptr            current pattern pointer
1965   cd             pointers to tables etc
1966
1967 Returns:         new code pointer
1968 */
1969
1970 static uschar *
1971 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1972 {
1973 *code++ = OP_CALLOUT;
1974 *code++ = 255;
1975 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1976 PUT(code, LINK_SIZE, 0);                /* Default length */
1977 return code + 2*LINK_SIZE;
1978 }
1979
1980
1981
1982 /*************************************************
1983 *         Complete a callout item                *
1984 *************************************************/
1985
1986 /* A callout item contains the length of the next item in the pattern, which
1987 we can't fill in till after we have reached the relevant point. This is used
1988 for both automatic and manual callouts.
1989
1990 Arguments:
1991   previous_callout   points to previous callout item
1992   ptr                current pattern pointer
1993   cd                 pointers to tables etc
1994
1995 Returns:             nothing
1996 */
1997
1998 static void
1999 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2000 {
2001 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2002 PUT(previous_callout, 2 + LINK_SIZE, length);
2003 }
2004
2005
2006
2007 #ifdef SUPPORT_UCP
2008 /*************************************************
2009 *           Get othercase range                  *
2010 *************************************************/
2011
2012 /* This function is passed the start and end of a class range, in UTF-8 mode
2013 with UCP support. It searches up the characters, looking for internal ranges of
2014 characters in the "other" case. Each call returns the next one, updating the
2015 start address.
2016
2017 Arguments:
2018   cptr        points to starting character value; updated
2019   d           end value
2020   ocptr       where to put start of othercase range
2021   odptr       where to put end of othercase range
2022
2023 Yield:        TRUE when range returned; FALSE when no more
2024 */
2025
2026 static BOOL
2027 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2028 {
2029 int c, chartype, othercase, next;
2030
2031 for (c = *cptr; c <= d; c++)
2032   {
2033   if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2034   }
2035
2036 if (c > d) return FALSE;
2037
2038 *ocptr = othercase;
2039 next = othercase + 1;
2040
2041 for (++c; c <= d; c++)
2042   {
2043   if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2044     break;
2045   next++;
2046   }
2047
2048 *odptr = next - 1;
2049 *cptr = c;
2050
2051 return TRUE;
2052 }
2053 #endif  /* SUPPORT_UCP */
2054
2055
2056 /*************************************************
2057 *           Compile one branch                   *
2058 *************************************************/
2059
2060 /* Scan the pattern, compiling it into the code vector. If the options are
2061 changed during the branch, the pointer is used to change the external options
2062 bits.
2063
2064 Arguments:
2065   optionsptr     pointer to the option bits
2066   brackets       points to number of extracting brackets used
2067   codeptr        points to the pointer to the current code point
2068   ptrptr         points to the current pattern pointer
2069   errorptr       points to pointer to error message
2070   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2071   reqbyteptr     set to the last literal character required, else < 0
2072   bcptr          points to current branch chain
2073   cd             contains pointers to tables etc.
2074
2075 Returns:         TRUE on success
2076                  FALSE, with *errorptr set on error
2077 */
2078
2079 static BOOL
2080 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2081   const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2082   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2083 {
2084 int repeat_type, op_type;
2085 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2086 int bravalue = 0;
2087 int greedy_default, greedy_non_default;
2088 int firstbyte, reqbyte;
2089 int zeroreqbyte, zerofirstbyte;
2090 int req_caseopt, reqvary, tempreqvary;
2091 int condcount = 0;
2092 int options = *optionsptr;
2093 int after_manual_callout = 0;
2094 register int c;
2095 register uschar *code = *codeptr;
2096 uschar *tempcode;
2097 BOOL inescq = FALSE;
2098 BOOL groupsetfirstbyte = FALSE;
2099 const uschar *ptr = *ptrptr;
2100 const uschar *tempptr;
2101 uschar *previous = NULL;
2102 uschar *previous_callout = NULL;
2103 uschar classbits[32];
2104
2105 #ifdef SUPPORT_UTF8
2106 BOOL class_utf8;
2107 BOOL utf8 = (options & PCRE_UTF8) != 0;
2108 uschar *class_utf8data;
2109 uschar utf8_char[6];
2110 #else
2111 BOOL utf8 = FALSE;
2112 #endif
2113
2114 /* Set up the default and non-default settings for greediness */
2115
2116 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2117 greedy_non_default = greedy_default ^ 1;
2118
2119 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2120 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2121 matches a non-fixed char first char; reqbyte just remains unset if we never
2122 find one.
2123
2124 When we hit a repeat whose minimum is zero, we may have to adjust these values
2125 to take the zero repeat into account. This is implemented by setting them to
2126 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2127 item types that can be repeated set these backoff variables appropriately. */
2128
2129 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2130
2131 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2132 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2133 value > 255. It is added into the firstbyte or reqbyte variables to record the
2134 case status of the value. This is used only for ASCII characters. */
2135
2136 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2137
2138 /* Switch on next character until the end of the branch */
2139
2140 for (;; ptr++)
2141   {
2142   BOOL negate_class;
2143   BOOL possessive_quantifier;
2144   BOOL is_quantifier;
2145   int class_charcount;
2146   int class_lastchar;
2147   int newoptions;
2148   int recno;
2149   int skipbytes;
2150   int subreqbyte;
2151   int subfirstbyte;
2152   int mclength;
2153   uschar mcbuffer[8];
2154
2155   /* Next byte in the pattern */
2156
2157   c = *ptr;
2158
2159   /* If in \Q...\E, check for the end; if not, we have a literal */
2160
2161   if (inescq && c != 0)
2162     {
2163     if (c == '\\' && ptr[1] == 'E')
2164       {
2165       inescq = FALSE;
2166       ptr++;
2167       continue;
2168       }
2169     else
2170       {
2171       if (previous_callout != NULL)
2172         {
2173         complete_callout(previous_callout, ptr, cd);
2174         previous_callout = NULL;
2175         }
2176       if ((options & PCRE_AUTO_CALLOUT) != 0)
2177         {
2178         previous_callout = code;
2179         code = auto_callout(code, ptr, cd);
2180         }
2181       goto NORMAL_CHAR;
2182       }
2183     }
2184
2185   /* Fill in length of a previous callout, except when the next thing is
2186   a quantifier. */
2187
2188   is_quantifier = c == '*' || c == '+' || c == '?' ||
2189     (c == '{' && is_counted_repeat(ptr+1));
2190
2191   if (!is_quantifier && previous_callout != NULL &&
2192        after_manual_callout-- <= 0)
2193     {
2194     complete_callout(previous_callout, ptr, cd);
2195     previous_callout = NULL;
2196     }
2197
2198   /* In extended mode, skip white space and comments */
2199
2200   if ((options & PCRE_EXTENDED) != 0)
2201     {
2202     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2203     if (c == '#')
2204       {
2205       /* The space before the ; is to avoid a warning on a silly compiler
2206       on the Macintosh. */
2207       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2208       if (c != 0) continue;   /* Else fall through to handle end of string */
2209       }
2210     }
2211
2212   /* No auto callout for quantifiers. */
2213
2214   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2215     {
2216     previous_callout = code;
2217     code = auto_callout(code, ptr, cd);
2218     }
2219
2220   switch(c)
2221     {
2222     /* The branch terminates at end of string, |, or ). */
2223
2224     case 0:
2225     case '|':
2226     case ')':
2227     *firstbyteptr = firstbyte;
2228     *reqbyteptr = reqbyte;
2229     *codeptr = code;
2230     *ptrptr = ptr;
2231     return TRUE;
2232
2233     /* Handle single-character metacharacters. In multiline mode, ^ disables
2234     the setting of any following char as a first character. */
2235
2236     case '^':
2237     if ((options & PCRE_MULTILINE) != 0)
2238       {
2239       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2240       }
2241     previous = NULL;
2242     *code++ = OP_CIRC;
2243     break;
2244
2245     case '$':
2246     previous = NULL;
2247     *code++ = OP_DOLL;
2248     break;
2249
2250     /* There can never be a first char if '.' is first, whatever happens about
2251     repeats. The value of reqbyte doesn't change either. */
2252
2253     case '.':
2254     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2255     zerofirstbyte = firstbyte;
2256     zeroreqbyte = reqbyte;
2257     previous = code;
2258     *code++ = OP_ANY;
2259     break;
2260
2261     /* Character classes. If the included characters are all < 255 in value, we
2262     build a 32-byte bitmap of the permitted characters, except in the special
2263     case where there is only one such character. For negated classes, we build
2264     the map as usual, then invert it at the end. However, we use a different
2265     opcode so that data characters > 255 can be handled correctly.
2266
2267     If the class contains characters outside the 0-255 range, a different
2268     opcode is compiled. It may optionally have a bit map for characters < 256,
2269     but those above are explicitly listed afterwards. A flag byte tells
2270     whether the bitmap is present, and whether this is a negated class or not.
2271     */
2272
2273     case '[':
2274     previous = code;
2275
2276     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2277     they are encountered at the top level, so we'll do that too. */
2278
2279     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2280         check_posix_syntax(ptr, &tempptr, cd))
2281       {
2282       *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2283       goto FAILED;
2284       }
2285
2286     /* If the first character is '^', set the negation flag and skip it. */
2287
2288     if ((c = *(++ptr)) == '^')
2289       {
2290       negate_class = TRUE;
2291       c = *(++ptr);
2292       }
2293     else
2294       {
2295       negate_class = FALSE;
2296       }
2297
2298     /* Keep a count of chars with values < 256 so that we can optimize the case
2299     of just a single character (as long as it's < 256). For higher valued UTF-8
2300     characters, we don't yet do any optimization. */
2301
2302     class_charcount = 0;
2303     class_lastchar = -1;
2304
2305 #ifdef SUPPORT_UTF8
2306     class_utf8 = FALSE;                       /* No chars >= 256 */
2307     class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
2308 #endif
2309
2310     /* Initialize the 32-char bit map to all zeros. We have to build the
2311     map in a temporary bit of store, in case the class contains only 1
2312     character (< 256), because in that case the compiled code doesn't use the
2313     bit map. */
2314
2315     memset(classbits, 0, 32 * sizeof(uschar));
2316
2317     /* Process characters until ] is reached. By writing this as a "do" it
2318     means that an initial ] is taken as a data character. The first pass
2319     through the regex checked the overall syntax, so we don't need to be very
2320     strict here. At the start of the loop, c contains the first byte of the
2321     character. */
2322
2323     do
2324       {
2325 #ifdef SUPPORT_UTF8
2326       if (utf8 && c > 127)
2327         {                           /* Braces are required because the */
2328         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2329         }
2330 #endif
2331
2332       /* Inside \Q...\E everything is literal except \E */
2333
2334       if (inescq)
2335         {
2336         if (c == '\\' && ptr[1] == 'E')
2337           {
2338           inescq = FALSE;
2339           ptr++;
2340           continue;
2341           }
2342         else goto LONE_SINGLE_CHARACTER;
2343         }
2344
2345       /* Handle POSIX class names. Perl allows a negation extension of the
2346       form [:^name:]. A square bracket that doesn't match the syntax is
2347       treated as a literal. We also recognize the POSIX constructions
2348       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2349       5.6 and 5.8 do. */
2350
2351       if (c == '[' &&
2352           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2353           check_posix_syntax(ptr, &tempptr, cd))
2354         {
2355         BOOL local_negate = FALSE;
2356         int posix_class, i;
2357         register const uschar *cbits = cd->cbits;
2358
2359         if (ptr[1] != ':')
2360           {
2361           *errorptr = ERR31;
2362           goto FAILED;
2363           }
2364
2365         ptr += 2;
2366         if (*ptr == '^')
2367           {
2368           local_negate = TRUE;
2369           ptr++;
2370           }
2371
2372         posix_class = check_posix_name(ptr, tempptr - ptr);
2373         if (posix_class < 0)
2374           {
2375           *errorptr = ERR30;
2376           goto FAILED;
2377           }
2378
2379         /* If matching is caseless, upper and lower are converted to
2380         alpha. This relies on the fact that the class table starts with
2381         alpha, lower, upper as the first 3 entries. */
2382
2383         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2384           posix_class = 0;
2385
2386         /* Or into the map we are building up to 3 of the static class
2387         tables, or their negations. The [:blank:] class sets up the same
2388         chars as the [:space:] class (all white space). We remove the vertical
2389         white space chars afterwards. */
2390
2391         posix_class *= 3;
2392         for (i = 0; i < 3; i++)
2393           {
2394           BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2395           int taboffset = posix_class_maps[posix_class + i];
2396           if (taboffset < 0) break;
2397           if (local_negate)
2398             {
2399             if (i == 0)
2400               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2401             else
2402               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2403             if (blankclass) classbits[1] |= 0x3c;
2404             }
2405           else
2406             {
2407             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2408             if (blankclass) classbits[1] &= ~0x3c;
2409             }
2410           }
2411
2412         ptr = tempptr + 1;
2413         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2414         continue;    /* End of POSIX syntax handling */
2415         }
2416
2417       /* Backslash may introduce a single character, or it may introduce one
2418       of the specials, which just set a flag. Escaped items are checked for
2419       validity in the pre-compiling pass. The sequence \b is a special case.
2420       Inside a class (and only there) it is treated as backspace. Elsewhere
2421       it marks a word boundary. Other escapes have preset maps ready to
2422       or into the one we are building. We assume they have more than one
2423       character in them, so set class_charcount bigger than one. */
2424
2425       if (c == '\\')
2426         {
2427         c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2428
2429         if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
2430         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2431         else if (-c == ESC_Q)            /* Handle start of quoted string */
2432           {
2433           if (ptr[1] == '\\' && ptr[2] == 'E')
2434             {
2435             ptr += 2; /* avoid empty string */
2436             }
2437           else inescq = TRUE;
2438           continue;
2439           }
2440
2441         if (c < 0)
2442           {
2443           register const uschar *cbits = cd->cbits;
2444           class_charcount += 2;     /* Greater than 1 is what matters */
2445           switch (-c)
2446             {
2447             case ESC_d:
2448             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2449             continue;
2450
2451             case ESC_D:
2452             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2453             continue;
2454
2455             case ESC_w:
2456             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2457             continue;
2458
2459             case ESC_W:
2460             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2461             continue;
2462
2463             case ESC_s:
2464             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2465             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2466             continue;
2467
2468             case ESC_S:
2469             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2470             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2471             continue;
2472
2473 #ifdef SUPPORT_UCP
2474             case ESC_p:
2475             case ESC_P:
2476               {
2477               BOOL negated;
2478               int property = get_ucp(&ptr, &negated, errorptr);
2479               if (property < 0) goto FAILED;
2480               class_utf8 = TRUE;
2481               *class_utf8data++ = ((-c == ESC_p) != negated)?
2482                 XCL_PROP : XCL_NOTPROP;
2483               *class_utf8data++ = property;
2484               class_charcount -= 2;   /* Not a < 256 character */
2485               }
2486             continue;
2487 #endif
2488
2489             /* Unrecognized escapes are faulted if PCRE is running in its
2490             strict mode. By default, for compatibility with Perl, they are
2491             treated as literals. */
2492
2493             default:
2494             if ((options & PCRE_EXTRA) != 0)
2495               {
2496               *errorptr = ERR7;
2497               goto FAILED;
2498               }
2499             c = *ptr;              /* The final character */
2500             class_charcount -= 2;  /* Undo the default count from above */
2501             }
2502           }
2503
2504         /* Fall through if we have a single character (c >= 0). This may be
2505         > 256 in UTF-8 mode. */
2506
2507         }   /* End of backslash handling */
2508
2509       /* A single character may be followed by '-' to form a range. However,
2510       Perl does not permit ']' to be the end of the range. A '-' character
2511       here is treated as a literal. */
2512
2513       if (ptr[1] == '-' && ptr[2] != ']')
2514         {
2515         int d;
2516         ptr += 2;
2517
2518 #ifdef SUPPORT_UTF8
2519         if (utf8)
2520           {                           /* Braces are required because the */
2521           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
2522           }
2523         else
2524 #endif
2525         d = *ptr;  /* Not UTF-8 mode */
2526
2527         /* The second part of a range can be a single-character escape, but
2528         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2529         in such circumstances. */
2530
2531         if (d == '\\')
2532           {
2533           const uschar *oldptr = ptr;
2534           d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2535
2536           /* \b is backspace; \X is literal X; any other special means the '-'
2537           was literal */
2538
2539           if (d < 0)
2540             {
2541             if (d == -ESC_b) d = '\b';
2542             else if (d == -ESC_X) d = 'X'; else
2543               {
2544               ptr = oldptr - 2;
2545               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2546               }
2547             }
2548           }
2549
2550         /* The check that the two values are in the correct order happens in
2551         the pre-pass. Optimize one-character ranges */
2552
2553         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2554
2555         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2556         matching, we have to use an XCLASS with extra data items. Caseless
2557         matching for characters > 127 is available only if UCP support is
2558         available. */
2559
2560 #ifdef SUPPORT_UTF8
2561         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2562           {
2563           class_utf8 = TRUE;
2564
2565           /* With UCP support, we can find the other case equivalents of
2566           the relevant characters. There may be several ranges. Optimize how
2567           they fit with the basic range. */
2568
2569 #ifdef SUPPORT_UCP
2570           if ((options & PCRE_CASELESS) != 0)
2571             {
2572             int occ, ocd;
2573             int cc = c;
2574             int origd = d;
2575             while (get_othercase_range(&cc, origd, &occ, &ocd))
2576               {
2577               if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
2578
2579               if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
2580                 {                                  /* if there is overlap,   */
2581                 c = occ;                           /* noting that if occ < c */
2582                 continue;                          /* we can't have ocd > d  */
2583                 }                                  /* because a subrange is  */
2584               if (ocd > d && occ <= d + 1)         /* always shorter than    */
2585                 {                                  /* the basic range.       */
2586                 d = ocd;
2587                 continue;
2588                 }
2589
2590               if (occ == ocd)
2591                 {
2592                 *class_utf8data++ = XCL_SINGLE;
2593                 }
2594               else
2595                 {
2596                 *class_utf8data++ = XCL_RANGE;
2597                 class_utf8data += ord2utf8(occ, class_utf8data);
2598                 }
2599               class_utf8data += ord2utf8(ocd, class_utf8data);
2600               }
2601             }
2602 #endif  /* SUPPORT_UCP */
2603
2604           /* Now record the original range, possibly modified for UCP caseless
2605           overlapping ranges. */
2606
2607           *class_utf8data++ = XCL_RANGE;
2608           class_utf8data += ord2utf8(c, class_utf8data);
2609           class_utf8data += ord2utf8(d, class_utf8data);
2610
2611           /* With UCP support, we are done. Without UCP support, there is no
2612           caseless matching for UTF-8 characters > 127; we can use the bit map
2613           for the smaller ones. */
2614
2615 #ifdef SUPPORT_UCP
2616           continue;    /* With next character in the class */
2617 #else
2618           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2619
2620           /* Adjust upper limit and fall through to set up the map */
2621
2622           d = 127;
2623
2624 #endif  /* SUPPORT_UCP */
2625           }
2626 #endif  /* SUPPORT_UTF8 */
2627
2628         /* We use the bit map for all cases when not in UTF-8 mode; else
2629         ranges that lie entirely within 0-127 when there is UCP support; else
2630         for partial ranges without UCP support. */
2631
2632         for (; c <= d; c++)
2633           {
2634           classbits[c/8] |= (1 << (c&7));
2635           if ((options & PCRE_CASELESS) != 0)
2636             {
2637             int uc = cd->fcc[c];           /* flip case */
2638             classbits[uc/8] |= (1 << (uc&7));
2639             }
2640           class_charcount++;                /* in case a one-char range */
2641           class_lastchar = c;
2642           }
2643
2644         continue;   /* Go get the next char in the class */
2645         }
2646
2647       /* Handle a lone single character - we can get here for a normal
2648       non-escape char, or after \ that introduces a single character or for an
2649       apparent range that isn't. */
2650
2651       LONE_SINGLE_CHARACTER:
2652
2653       /* Handle a character that cannot go in the bit map */
2654
2655 #ifdef SUPPORT_UTF8
2656       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2657         {
2658         class_utf8 = TRUE;
2659         *class_utf8data++ = XCL_SINGLE;
2660         class_utf8data += ord2utf8(c, class_utf8data);
2661
2662 #ifdef SUPPORT_UCP
2663         if ((options & PCRE_CASELESS) != 0)
2664           {
2665           int chartype;
2666           int othercase;
2667           if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2668             {
2669             *class_utf8data++ = XCL_SINGLE;
2670             class_utf8data += ord2utf8(othercase, class_utf8data);
2671             }
2672           }
2673 #endif  /* SUPPORT_UCP */
2674
2675         }
2676       else
2677 #endif  /* SUPPORT_UTF8 */
2678
2679       /* Handle a single-byte character */
2680         {
2681         classbits[c/8] |= (1 << (c&7));
2682         if ((options & PCRE_CASELESS) != 0)
2683           {
2684           c = cd->fcc[c];   /* flip case */
2685           classbits[c/8] |= (1 << (c&7));
2686           }
2687         class_charcount++;
2688         class_lastchar = c;
2689         }
2690       }
2691
2692     /* Loop until ']' reached; the check for end of string happens inside the
2693     loop. This "while" is the end of the "do" above. */
2694
2695     while ((c = *(++ptr)) != ']' || inescq);
2696
2697     /* If class_charcount is 1, we saw precisely one character whose value is
2698     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2699     can optimize the negative case only if there were no characters >= 128
2700     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2701     single-bytes only. This is an historical hangover. Maybe one day we can
2702     tidy these opcodes to handle multi-byte characters.
2703
2704     The optimization throws away the bit map. We turn the item into a
2705     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2706     that OP_NOT does not support multibyte characters. In the positive case, it
2707     can cause firstbyte to be set. Otherwise, there can be no first char if
2708     this item is first, whatever repeat count may follow. In the case of
2709     reqbyte, save the previous value for reinstating. */
2710
2711 #ifdef SUPPORT_UTF8
2712     if (class_charcount == 1 &&
2713           (!utf8 ||
2714           (!class_utf8 && (!negate_class || class_lastchar < 128))))
2715
2716 #else
2717     if (class_charcount == 1)
2718 #endif
2719       {
2720       zeroreqbyte = reqbyte;
2721
2722       /* The OP_NOT opcode works on one-byte characters only. */
2723
2724       if (negate_class)
2725         {
2726         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2727         zerofirstbyte = firstbyte;
2728         *code++ = OP_NOT;
2729         *code++ = class_lastchar;
2730         break;
2731         }
2732
2733       /* For a single, positive character, get the value into mcbuffer, and
2734       then we can handle this with the normal one-character code. */
2735
2736 #ifdef SUPPORT_UTF8
2737       if (utf8 && class_lastchar > 127)
2738         mclength = ord2utf8(class_lastchar, mcbuffer);
2739       else
2740 #endif
2741         {
2742         mcbuffer[0] = class_lastchar;
2743         mclength = 1;
2744         }
2745       goto ONE_CHAR;
2746       }       /* End of 1-char optimization */
2747
2748     /* The general case - not the one-char optimization. If this is the first
2749     thing in the branch, there can be no first char setting, whatever the
2750     repeat count. Any reqbyte setting must remain unchanged after any kind of
2751     repeat. */
2752
2753     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2754     zerofirstbyte = firstbyte;
2755     zeroreqbyte = reqbyte;
2756
2757     /* If there are characters with values > 255, we have to compile an
2758     extended class, with its own opcode. If there are no characters < 256,
2759     we can omit the bitmap. */
2760
2761 #ifdef SUPPORT_UTF8
2762     if (class_utf8)
2763       {
2764       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
2765       *code++ = OP_XCLASS;
2766       code += LINK_SIZE;
2767       *code = negate_class? XCL_NOT : 0;
2768
2769       /* If the map is required, install it, and move on to the end of
2770       the extra data */
2771
2772       if (class_charcount > 0)
2773         {
2774         *code++ |= XCL_MAP;
2775         memcpy(code, classbits, 32);
2776         code = class_utf8data;
2777         }
2778
2779       /* If the map is not required, slide down the extra data. */
2780
2781       else
2782         {
2783         int len = class_utf8data - (code + 33);
2784         memmove(code + 1, code + 33, len);
2785         code += len + 1;
2786         }
2787
2788       /* Now fill in the complete length of the item */
2789
2790       PUT(previous, 1, code - previous);
2791       break;   /* End of class handling */
2792       }
2793 #endif
2794
2795     /* If there are no characters > 255, negate the 32-byte map if necessary,
2796     and copy it into the code vector. If this is the first thing in the branch,
2797     there can be no first char setting, whatever the repeat count. Any reqbyte
2798     setting must remain unchanged after any kind of repeat. */
2799
2800     if (negate_class)
2801       {
2802       *code++ = OP_NCLASS;
2803       for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2804       }
2805     else
2806       {
2807       *code++ = OP_CLASS;
2808       memcpy(code, classbits, 32);
2809       }
2810     code += 32;
2811     break;
2812
2813     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2814     has been tested above. */
2815
2816     case '{':
2817     if (!is_quantifier) goto NORMAL_CHAR;
2818     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2819     if (*errorptr != NULL) goto FAILED;
2820     goto REPEAT;
2821
2822     case '*':
2823     repeat_min = 0;
2824     repeat_max = -1;
2825     goto REPEAT;
2826
2827     case '+':
2828     repeat_min = 1;
2829     repeat_max = -1;
2830     goto REPEAT;
2831
2832     case '?':
2833     repeat_min = 0;
2834     repeat_max = 1;
2835
2836     REPEAT:
2837     if (previous == NULL)
2838       {
2839       *errorptr = ERR9;
2840       goto FAILED;
2841       }
2842
2843     if (repeat_min == 0)
2844       {
2845       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2846       reqbyte = zeroreqbyte;        /* Ditto */
2847       }
2848
2849     /* Remember whether this is a variable length repeat */
2850
2851     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2852
2853     op_type = 0;                    /* Default single-char op codes */
2854     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2855
2856     /* Save start of previous item, in case we have to move it up to make space
2857     for an inserted OP_ONCE for the additional '+' extension. */
2858
2859     tempcode = previous;
2860
2861     /* If the next character is '+', we have a possessive quantifier. This
2862     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2863     If the next character is '?' this is a minimizing repeat, by default,
2864     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2865     repeat type to the non-default. */
2866
2867     if (ptr[1] == '+')
2868       {
2869       repeat_type = 0;                  /* Force greedy */
2870       possessive_quantifier = TRUE;
2871       ptr++;
2872       }
2873     else if (ptr[1] == '?')
2874       {
2875       repeat_type = greedy_non_default;
2876       ptr++;
2877       }
2878     else repeat_type = greedy_default;
2879
2880     /* If previous was a recursion, we need to wrap it inside brackets so that
2881     it can be replicated if necessary. */
2882
2883     if (*previous == OP_RECURSE)
2884       {
2885       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2886       code += 1 + LINK_SIZE;
2887       *previous = OP_BRA;
2888       PUT(previous, 1, code - previous);
2889       *code = OP_KET;
2890       PUT(code, 1, code - previous);
2891       code += 1 + LINK_SIZE;
2892       }
2893
2894     /* If previous was a character match, abolish the item and generate a
2895     repeat item instead. If a char item has a minimum of more than one, ensure
2896     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2897     the first thing in a branch because the x will have gone into firstbyte
2898     instead.  */
2899
2900     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2901       {
2902       /* Deal with UTF-8 characters that take up more than one byte. It's
2903       easier to write this out separately than try to macrify it. Use c to
2904       hold the length of the character in bytes, plus 0x80 to flag that it's a
2905       length rather than a small character. */
2906
2907 #ifdef SUPPORT_UTF8
2908       if (utf8 && (code[-1] & 0x80) != 0)
2909         {
2910         uschar *lastchar = code - 1;
2911         while((*lastchar & 0xc0) == 0x80) lastchar--;
2912         c = code - lastchar;            /* Length of UTF-8 character */
2913         memcpy(utf8_char, lastchar, c); /* Save the char */
2914         c |= 0x80;                      /* Flag c as a length */
2915         }
2916       else
2917 #endif
2918
2919       /* Handle the case of a single byte - either with no UTF8 support, or
2920       with UTF-8 disabled, or for a UTF-8 character < 128. */
2921
2922         {
2923         c = code[-1];
2924         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2925         }
2926
2927       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2928       }
2929
2930     /* If previous was a single negated character ([^a] or similar), we use
2931     one of the special opcodes, replacing it. The code is shared with single-
2932     character repeats by setting op_type to add a suitable offset into
2933     repeat_type. OP_NOT is currently used only for single-byte chars. */
2934
2935     else if (*previous == OP_NOT)
2936       {
2937       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2938       c = previous[1];
2939       goto OUTPUT_SINGLE_REPEAT;
2940       }
2941
2942     /* If previous was a character type match (\d or similar), abolish it and
2943     create a suitable repeat item. The code is shared with single-character
2944     repeats by setting op_type to add a suitable offset into repeat_type. Note
2945     that the Unicode property types will be present only when SUPPORT_UCP is
2946     defined, but we don't wrap the little bits of code here because it just
2947     makes it horribly messy. */
2948
2949     else if (*previous < OP_EODN)
2950       {
2951       uschar *oldcode;
2952       int prop_type;
2953       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2954       c = *previous;
2955
2956       OUTPUT_SINGLE_REPEAT:
2957       prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2958         previous[1] : -1;
2959
2960       oldcode = code;
2961       code = previous;                  /* Usually overwrite previous item */
2962
2963       /* If the maximum is zero then the minimum must also be zero; Perl allows
2964       this case, so we do too - by simply omitting the item altogether. */
2965
2966       if (repeat_max == 0) goto END_REPEAT;
2967
2968       /* All real repeats make it impossible to handle partial matching (maybe
2969       one day we will be able to remove this restriction). */
2970
2971       if (repeat_max != 1) cd->nopartial = TRUE;
2972
2973       /* Combine the op_type with the repeat_type */
2974
2975       repeat_type += op_type;
2976
2977       /* A minimum of zero is handled either as the special case * or ?, or as
2978       an UPTO, with the maximum given. */
2979
2980       if (repeat_min == 0)
2981         {
2982         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2983           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2984         else
2985           {
2986           *code++ = OP_UPTO + repeat_type;
2987           PUT2INC(code, 0, repeat_max);
2988           }
2989         }
2990
2991       /* A repeat minimum of 1 is optimized into some special cases. If the
2992       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2993       left in place and, if the maximum is greater than 1, we use OP_UPTO with
2994       one less than the maximum. */
2995
2996       else if (repeat_min == 1)
2997         {
2998         if (repeat_max == -1)
2999           *code++ = OP_PLUS + repeat_type;
3000         else
3001           {
3002           code = oldcode;                 /* leave previous item in place */
3003           if (repeat_max == 1) goto END_REPEAT;
3004           *code++ = OP_UPTO + repeat_type;
3005           PUT2INC(code, 0, repeat_max - 1);
3006           }
3007         }
3008
3009       /* The case {n,n} is just an EXACT, while the general case {n,m} is
3010       handled as an EXACT followed by an UPTO. */
3011
3012       else
3013         {
3014         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3015         PUT2INC(code, 0, repeat_min);
3016
3017         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3018         we have to insert the character for the previous code. For a repeated
3019         Unicode property match, there is an extra byte that defines the
3020         required property. In UTF-8 mode, long characters have their length in
3021         c, with the 0x80 bit as a flag. */
3022
3023         if (repeat_max < 0)
3024           {
3025 #ifdef SUPPORT_UTF8
3026           if (utf8 && c >= 128)
3027             {
3028             memcpy(code, utf8_char, c & 7);
3029             code += c & 7;
3030             }
3031           else
3032 #endif
3033             {
3034             *code++ = c;
3035             if (prop_type >= 0) *code++ = prop_type;
3036             }
3037           *code++ = OP_STAR + repeat_type;
3038           }
3039
3040         /* Else insert an UPTO if the max is greater than the min, again
3041         preceded by the character, for the previously inserted code. */
3042
3043         else if (repeat_max != repeat_min)
3044           {
3045 #ifdef SUPPORT_UTF8
3046           if (utf8 && c >= 128)
3047             {
3048             memcpy(code, utf8_char, c & 7);
3049             code += c & 7;
3050             }
3051           else
3052 #endif
3053           *code++ = c;
3054           if (prop_type >= 0) *code++ = prop_type;
3055           repeat_max -= repeat_min;
3056           *code++ = OP_UPTO + repeat_type;
3057           PUT2INC(code, 0, repeat_max);
3058           }
3059         }
3060
3061       /* The character or character type itself comes last in all cases. */
3062
3063 #ifdef SUPPORT_UTF8
3064       if (utf8 && c >= 128)
3065         {
3066         memcpy(code, utf8_char, c & 7);
3067         code += c & 7;
3068         }
3069       else
3070 #endif
3071       *code++ = c;
3072
3073       /* For a repeated Unicode property match, there is an extra byte that
3074       defines the required property. */
3075
3076 #ifdef SUPPORT_UCP
3077       if (prop_type >= 0) *code++ = prop_type;
3078 #endif
3079       }
3080
3081     /* If previous was a character class or a back reference, we put the repeat
3082     stuff after it, but just skip the item if the repeat was {0,0}. */
3083
3084     else if (*previous == OP_CLASS ||
3085              *previous == OP_NCLASS ||
3086 #ifdef SUPPORT_UTF8
3087              *previous == OP_XCLASS ||
3088 #endif
3089              *previous == OP_REF)
3090       {
3091       if (repeat_max == 0)
3092         {
3093         code = previous;
3094         goto END_REPEAT;
3095         }
3096
3097       /* All real repeats make it impossible to handle partial matching (maybe
3098       one day we will be able to remove this restriction). */
3099
3100       if (repeat_max != 1) cd->nopartial = TRUE;
3101
3102       if (repeat_min == 0 && repeat_max == -1)
3103         *code++ = OP_CRSTAR + repeat_type;
3104       else if (repeat_min == 1 && repeat_max == -1)
3105         *code++ = OP_CRPLUS + repeat_type;
3106       else if (repeat_min == 0 && repeat_max == 1)
3107         *code++ = OP_CRQUERY + repeat_type;
3108       else
3109         {
3110         *code++ = OP_CRRANGE + repeat_type;
3111         PUT2INC(code, 0, repeat_min);
3112         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3113         PUT2INC(code, 0, repeat_max);
3114         }
3115       }
3116
3117     /* If previous was a bracket group, we may have to replicate it in certain
3118     cases. */
3119
3120     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3121              *previous == OP_COND)
3122       {
3123       register int i;
3124       int ketoffset = 0;
3125       int len = code - previous;
3126       uschar *bralink = NULL;
3127
3128       /* If the maximum repeat count is unlimited, find the end of the bracket
3129       by scanning through from the start, and compute the offset back to it
3130       from the current code pointer. There may be an OP_OPT setting following
3131       the final KET, so we can't find the end just by going back from the code
3132       pointer. */
3133
3134       if (repeat_max == -1)
3135         {
3136         register uschar *ket = previous;
3137         do ket += GET(ket, 1); while (*ket != OP_KET);
3138         ketoffset = code - ket;
3139         }
3140
3141       /* The case of a zero minimum is special because of the need to stick
3142       OP_BRAZERO in front of it, and because the group appears once in the
3143       data, whereas in other cases it appears the minimum number of times. For
3144       this reason, it is simplest to treat this case separately, as otherwise
3145       the code gets far too messy. There are several special subcases when the
3146       minimum is zero. */
3147
3148       if (repeat_min == 0)
3149         {
3150         /* If the maximum is also zero, we just omit the group from the output
3151         altogether. */
3152
3153         if (repeat_max == 0)
3154           {
3155           code = previous;
3156           goto END_REPEAT;
3157           }
3158
3159         /* If the maximum is 1 or unlimited, we just have to stick in the
3160         BRAZERO and do no more at this point. However, we do need to adjust
3161         any OP_RECURSE calls inside the group that refer to the group itself or
3162         any internal group, because the offset is from the start of the whole
3163         regex. Temporarily terminate the pattern while doing this. */
3164
3165         if (repeat_max <= 1)
3166           {
3167           *code = OP_END;
3168           adjust_recurse(previous, 1, utf8, cd);
3169           memmove(previous+1, previous, len);
3170           code++;
3171           *previous++ = OP_BRAZERO + repeat_type;
3172           }
3173
3174         /* If the maximum is greater than 1 and limited, we have to replicate
3175         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3176         The first one has to be handled carefully because it's the original
3177         copy, which has to be moved up. The remainder can be handled by code
3178         that is common with the non-zero minimum case below. We have to
3179         adjust the value of repeat_max, since one less copy is required. Once
3180         again, we may have to adjust any OP_RECURSE calls inside the group. */
3181
3182         else
3183           {
3184           int offset;
3185           *code = OP_END;
3186           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3187           memmove(previous + 2 + LINK_SIZE, previous, len);
3188           code += 2 + LINK_SIZE;
3189           *previous++ = OP_BRAZERO + repeat_type;
3190           *previous++ = OP_BRA;
3191
3192           /* We chain together the bracket offset fields that have to be
3193           filled in later when the ends of the brackets are reached. */
3194
3195           offset = (bralink == NULL)? 0 : previous - bralink;
3196           bralink = previous;
3197           PUTINC(previous, 0, offset);
3198           }
3199
3200         repeat_max--;
3201         }
3202
3203       /* If the minimum is greater than zero, replicate the group as many
3204       times as necessary, and adjust the maximum to the number of subsequent
3205       copies that we need. If we set a first char from the group, and didn't
3206       set a required char, copy the latter from the former. */
3207
3208       else
3209         {
3210         if (repeat_min > 1)
3211           {
3212           if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3213           for (i = 1; i < repeat_min; i++)
3214             {
3215             memcpy(code, previous, len);
3216             code += len;
3217             }
3218           }
3219         if (repeat_max > 0) repeat_max -= repeat_min;
3220         }
3221
3222       /* This code is common to both the zero and non-zero minimum cases. If
3223       the maximum is limited, it replicates the group in a nested fashion,
3224       remembering the bracket starts on a stack. In the case of a zero minimum,
3225       the first one was set up above. In all cases the repeat_max now specifies
3226       the number of additional copies needed. */
3227
3228       if (repeat_max >= 0)
3229         {
3230         for (i = repeat_max - 1; i >= 0; i--)
3231           {
3232           *code++ = OP_BRAZERO + repeat_type;
3233
3234           /* All but the final copy start a new nesting, maintaining the
3235           chain of brackets outstanding. */
3236
3237           if (i != 0)
3238             {
3239             int offset;
3240             *code++ = OP_BRA;
3241             offset = (bralink == NULL)? 0 : code - bralink;
3242             bralink = code;
3243             PUTINC(code, 0, offset);
3244             }
3245
3246           memcpy(code, previous, len);
3247           code += len;
3248           }
3249
3250         /* Now chain through the pending brackets, and fill in their length
3251         fields (which are holding the chain links pro tem). */
3252
3253         while (bralink != NULL)
3254           {
3255           int oldlinkoffset;
3256           int offset = code - bralink + 1;
3257           uschar *bra = code - offset;
3258           oldlinkoffset = GET(bra, 1);
3259           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3260           *code++ = OP_KET;
3261           PUTINC(code, 0, offset);
3262           PUT(bra, 1, offset);
3263           }
3264         }
3265
3266       /* If the maximum is unlimited, set a repeater in the final copy. We
3267       can't just offset backwards from the current code point, because we
3268       don't know if there's been an options resetting after the ket. The
3269       correct offset was computed above. */
3270
3271       else code[-ketoffset] = OP_KETRMAX + repeat_type;
3272       }
3273
3274     /* Else there's some kind of shambles */
3275
3276     else
3277       {
3278       *errorptr = ERR11;
3279       goto FAILED;
3280       }
3281
3282     /* If the character following a repeat is '+', we wrap the entire repeated
3283     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3284     Sun's Java package. The repeated item starts at tempcode, not at previous,
3285     which might be the first part of a string whose (former) last char we
3286     repeated. However, we don't support '+' after a greediness '?'. */
3287
3288     if (possessive_quantifier)
3289       {
3290       int len = code - tempcode;
3291       memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3292       code += 1 + LINK_SIZE;
3293       len += 1 + LINK_SIZE;
3294       tempcode[0] = OP_ONCE;
3295       *code++ = OP_KET;
3296       PUTINC(code, 0, len);
3297       PUT(tempcode, 1, len);
3298       }
3299
3300     /* In all cases we no longer have a previous item. We also set the
3301     "follows varying string" flag for subsequently encountered reqbytes if
3302     it isn't already set and we have just passed a varying length item. */
3303
3304     END_REPEAT:
3305     previous = NULL;
3306     cd->req_varyopt |= reqvary;
3307     break;
3308
3309
3310     /* Start of nested bracket sub-expression, or comment or lookahead or
3311     lookbehind or option setting or condition. First deal with special things
3312     that can come after a bracket; all are introduced by ?, and the appearance
3313     of any of them means that this is not a referencing group. They were
3314     checked for validity in the first pass over the string, so we don't have to
3315     check for syntax errors here.  */
3316
3317     case '(':
3318     newoptions = options;
3319     skipbytes = 0;
3320
3321     if (*(++ptr) == '?')
3322       {
3323       int set, unset;
3324       int *optset;
3325
3326       switch (*(++ptr))
3327         {
3328         case '#':                 /* Comment; skip to ket */
3329         ptr++;
3330         while (*ptr != ')') ptr++;
3331         continue;
3332
3333         case ':':                 /* Non-extracting bracket */
3334         bravalue = OP_BRA;
3335         ptr++;
3336         break;
3337
3338         case '(':
3339         bravalue = OP_COND;       /* Conditional group */
3340
3341         /* Condition to test for recursion */
3342
3343         if (ptr[1] == 'R')
3344           {
3345           code[1+LINK_SIZE] = OP_CREF;
3346           PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3347           skipbytes = 3;
3348           ptr += 3;
3349           }
3350
3351         /* Condition to test for a numbered subpattern match. We know that
3352         if a digit follows ( then there will just be digits until ) because
3353         the syntax was checked in the first pass. */
3354
3355         else if ((digitab[ptr[1]] && ctype_digit) != 0)
3356           {
3357           int condref;                 /* Don't amalgamate; some compilers */
3358           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
3359           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3360           if (condref == 0)
3361             {
3362             *errorptr = ERR35;
3363             goto FAILED;
3364             }
3365           ptr++;
3366           code[1+LINK_SIZE] = OP_CREF;
3367           PUT2(code, 2+LINK_SIZE, condref);
3368           skipbytes = 3;
3369           }
3370         /* For conditions that are assertions, we just fall through, having
3371         set bravalue above. */
3372         break;
3373
3374         case '=':                 /* Positive lookahead */
3375         bravalue = OP_ASSERT;
3376         ptr++;
3377         break;
3378
3379         case '!':                 /* Negative lookahead */
3380         bravalue = OP_ASSERT_NOT;
3381         ptr++;
3382         break;
3383
3384         case '<':                 /* Lookbehinds */
3385         switch (*(++ptr))
3386           {
3387           case '=':               /* Positive lookbehind */
3388           bravalue = OP_ASSERTBACK;
3389           ptr++;
3390           break;
3391
3392           case '!':               /* Negative lookbehind */
3393           bravalue = OP_ASSERTBACK_NOT;
3394           ptr++;
3395           break;
3396           }
3397         break;
3398
3399         case '>':                 /* One-time brackets */
3400         bravalue = OP_ONCE;
3401         ptr++;
3402         break;
3403
3404         case 'C':                 /* Callout - may be followed by digits; */
3405         previous_callout = code;  /* Save for later completion */
3406         after_manual_callout = 1; /* Skip one item before completing */
3407         *code++ = OP_CALLOUT;     /* Already checked that the terminating */
3408           {                       /* closing parenthesis is present. */
3409           int n = 0;
3410           while ((digitab[*(++ptr)] & ctype_digit) != 0)
3411             n = n * 10 + *ptr - '0';
3412           if (n > 255)
3413             {
3414             *errorptr = ERR38;
3415             goto FAILED;
3416             }
3417           *code++ = n;
3418           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
3419           PUT(code, LINK_SIZE, 0);                    /* Default length */
3420           code += 2 * LINK_SIZE;
3421           }
3422         previous = NULL;
3423         continue;
3424
3425         case 'P':                 /* Named subpattern handling */
3426         if (*(++ptr) == '<')      /* Definition */
3427           {
3428           int i, namelen;
3429           uschar *slot = cd->name_table;
3430           const uschar *name;     /* Don't amalgamate; some compilers */
3431           name = ++ptr;           /* grumble at autoincrement in declaration */
3432
3433           while (*ptr++ != '>');
3434           namelen = ptr - name - 1;
3435
3436           for (i = 0; i < cd->names_found; i++)
3437             {
3438             int crc = memcmp(name, slot+2, namelen);
3439             if (crc == 0)
3440               {
3441               if (slot[2+namelen] == 0)
3442                 {
3443                 *errorptr = ERR43;
3444                 goto FAILED;
3445                 }
3446               crc = -1;             /* Current name is substring */
3447               }
3448             if (crc < 0)
3449               {
3450               memmove(slot + cd->name_entry_size, slot,
3451                 (cd->names_found - i) * cd->name_entry_size);
3452               break;
3453               }
3454             slot += cd->name_entry_size;
3455             }
3456
3457           PUT2(slot, 0, *brackets + 1);
3458           memcpy(slot + 2, name, namelen);
3459           slot[2+namelen] = 0;
3460           cd->names_found++;
3461           goto NUMBERED_GROUP;
3462           }
3463
3464         if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
3465           {
3466           int i, namelen;
3467           int type = *ptr++;
3468           const uschar *name = ptr;
3469           uschar *slot = cd->name_table;
3470
3471           while (*ptr != ')') ptr++;
3472           namelen = ptr - name;
3473
3474           for (i = 0; i < cd->names_found; i++)
3475             {
3476             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3477             slot += cd->name_entry_size;
3478             }
3479           if (i >= cd->names_found)
3480             {
3481             *errorptr = ERR15;
3482             goto FAILED;
3483             }
3484
3485           recno = GET2(slot, 0);
3486
3487           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
3488
3489           /* Back reference */
3490
3491           previous = code;
3492           *code++ = OP_REF;
3493           PUT2INC(code, 0, recno);
3494           cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3495           if (recno > cd->top_backref) cd->top_backref = recno;
3496           continue;
3497           }
3498
3499         /* Should never happen */
3500         break;
3501
3502         case 'R':                 /* Pattern recursion */
3503         ptr++;                    /* Same as (?0)      */
3504         /* Fall through */
3505
3506         /* Recursion or "subroutine" call */
3507
3508         case '0': case '1': case '2': case '3': case '4':
3509         case '5': case '6': case '7': case '8': case '9':
3510           {
3511           const uschar *called;
3512           recno = 0;
3513           while((digitab[*ptr] & ctype_digit) != 0)
3514             recno = recno * 10 + *ptr++ - '0';
3515
3516           /* Come here from code above that handles a named recursion */
3517
3518           HANDLE_RECURSION:
3519
3520           previous = code;
3521
3522           /* Find the bracket that is being referenced. Temporarily end the
3523           regex in case it doesn't exist. */
3524
3525           *code = OP_END;
3526           called = (recno == 0)?
3527             cd->start_code : find_bracket(cd->start_code, utf8, recno);
3528
3529           if (called == NULL)
3530             {
3531             *errorptr = ERR15;
3532             goto FAILED;
3533             }
3534
3535           /* If the subpattern is still open, this is a recursive call. We
3536           check to see if this is a left recursion that could loop for ever,
3537           and diagnose that case. */
3538
3539           if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3540             {
3541             *errorptr = ERR40;
3542             goto FAILED;
3543             }
3544
3545           /* Insert the recursion/subroutine item */
3546
3547           *code = OP_RECURSE;
3548           PUT(code, 1, called - cd->start_code);
3549           code += 1 + LINK_SIZE;
3550           }
3551         continue;
3552
3553         /* Character after (? not specially recognized */
3554
3555         default:                  /* Option setting */
3556         set = unset = 0;
3557         optset = &set;
3558
3559         while (*ptr != ')' && *ptr != ':')
3560           {
3561           switch (*ptr++)
3562             {
3563             case '-': optset = &unset; break;
3564
3565             case 'i': *optset |= PCRE_CASELESS; break;
3566             case 'm': *optset |= PCRE_MULTILINE; break;
3567             case 's': *optset |= PCRE_DOTALL; break;
3568             case 'x': *optset |= PCRE_EXTENDED; break;
3569             case 'U': *optset |= PCRE_UNGREEDY; break;
3570             case 'X': *optset |= PCRE_EXTRA; break;
3571             }
3572           }
3573
3574         /* Set up the changed option bits, but don't change anything yet. */
3575
3576         newoptions = (options | set) & (~unset);
3577
3578         /* If the options ended with ')' this is not the start of a nested
3579         group with option changes, so the options change at this level. Compile
3580         code to change the ims options if this setting actually changes any of
3581         them. We also pass the new setting back so that it can be put at the
3582         start of any following branches, and when this group ends (if we are in
3583         a group), a resetting item can be compiled.
3584
3585         Note that if this item is right at the start of the pattern, the
3586         options will have been abstracted and made global, so there will be no
3587         change to compile. */
3588
3589         if (*ptr == ')')
3590           {
3591           if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3592             {
3593             *code++ = OP_OPT;
3594             *code++ = newoptions & PCRE_IMS;
3595             }
3596
3597           /* Change options at this level, and pass them back for use
3598           in subsequent branches. Reset the greedy defaults and the case
3599           value for firstbyte and reqbyte. */
3600
3601           *optionsptr = options = newoptions;
3602           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3603           greedy_non_default = greedy_default ^ 1;
3604           req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3605
3606           previous = NULL;       /* This item can't be repeated */
3607           continue;              /* It is complete */
3608           }
3609
3610         /* If the options ended with ':' we are heading into a nested group
3611         with possible change of options. Such groups are non-capturing and are
3612         not assertions of any kind. All we need to do is skip over the ':';
3613         the newoptions value is handled below. */
3614
3615         bravalue = OP_BRA;
3616         ptr++;
3617         }
3618       }
3619
3620     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3621     non-capturing and behave like (?:...) brackets */
3622
3623     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3624       {
3625       bravalue = OP_BRA;
3626       }
3627
3628     /* Else we have a referencing group; adjust the opcode. If the bracket
3629     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3630     arrange for the true number to follow later, in an OP_BRANUMBER item. */
3631
3632     else
3633       {
3634       NUMBERED_GROUP:
3635       if (++(*brackets) > EXTRACT_BASIC_MAX)
3636         {
3637         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3638         code[1+LINK_SIZE] = OP_BRANUMBER;
3639         PUT2(code, 2+LINK_SIZE, *brackets);
3640         skipbytes = 3;
3641         }
3642       else bravalue = OP_BRA + *brackets;
3643       }
3644
3645     /* Process nested bracketed re. Assertions may not be repeated, but other
3646     kinds can be. We copy code into a non-register variable in order to be able
3647     to pass its address because some compilers complain otherwise. Pass in a
3648     new setting for the ims options if they have changed. */
3649
3650     previous = (bravalue >= OP_ONCE)? code : NULL;
3651     *code = bravalue;
3652     tempcode = code;
3653     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
3654
3655     if (!compile_regex(
3656          newoptions,                   /* The complete new option state */
3657          options & PCRE_IMS,           /* The previous ims option state */
3658          brackets,                     /* Extracting bracket count */
3659          &tempcode,                    /* Where to put code (updated) */
3660          &ptr,                         /* Input pointer (updated) */
3661          errorptr,                     /* Where to put an error message */
3662          (bravalue == OP_ASSERTBACK ||
3663           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3664          skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
3665          &subfirstbyte,                /* For possible first char */
3666          &subreqbyte,                  /* For possible last char */
3667          bcptr,                        /* Current branch chain */
3668          cd))                          /* Tables block */
3669       goto FAILED;
3670
3671     /* At the end of compiling, code is still pointing to the start of the
3672     group, while tempcode has been updated to point past the end of the group
3673     and any option resetting that may follow it. The pattern pointer (ptr)
3674     is on the bracket. */
3675
3676     /* If this is a conditional bracket, check that there are no more than
3677     two branches in the group. */
3678
3679     else if (bravalue == OP_COND)
3680       {
3681       uschar *tc = code;
3682       condcount = 0;
3683
3684       do {
3685          condcount++;
3686          tc += GET(tc,1);
3687          }
3688       while (*tc != OP_KET);
3689
3690       if (condcount > 2)
3691         {
3692         *errorptr = ERR27;
3693         goto FAILED;
3694         }
3695
3696       /* If there is just one branch, we must not make use of its firstbyte or
3697       reqbyte, because this is equivalent to an empty second branch. */
3698
3699       if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3700       }
3701
3702     /* Handle updating of the required and first characters. Update for normal
3703     brackets of all kinds, and conditions with two branches (see code above).
3704     If the bracket is followed by a quantifier with zero repeat, we have to
3705     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3706     main loop so that they can be accessed for the back off. */
3707
3708     zeroreqbyte = reqbyte;
3709     zerofirstbyte = firstbyte;
3710     groupsetfirstbyte = FALSE;
3711
3712     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3713       {
3714       /* If we have not yet set a firstbyte in this branch, take it from the
3715       subpattern, remembering that it was set here so that a repeat of more
3716       than one can replicate it as reqbyte if necessary. If the subpattern has
3717       no firstbyte, set "none" for the whole branch. In both cases, a zero
3718       repeat forces firstbyte to "none". */
3719
3720       if (firstbyte == REQ_UNSET)
3721         {
3722         if (subfirstbyte >= 0)
3723           {
3724           firstbyte = subfirstbyte;
3725           groupsetfirstbyte = TRUE;
3726           }
3727         else firstbyte = REQ_NONE;
3728         zerofirstbyte = REQ_NONE;
3729         }
3730
3731       /* If firstbyte was previously set, convert the subpattern's firstbyte
3732       into reqbyte if there wasn't one, using the vary flag that was in
3733       existence beforehand. */
3734
3735       else if (subfirstbyte >= 0 && subreqbyte < 0)
3736         subreqbyte = subfirstbyte | tempreqvary;
3737
3738       /* If the subpattern set a required byte (or set a first byte that isn't
3739       really the first byte - see above), set it. */
3740
3741       if (subreqbyte >= 0) reqbyte = subreqbyte;
3742       }
3743
3744     /* For a forward assertion, we take the reqbyte, if set. This can be
3745     helpful if the pattern that follows the assertion doesn't set a different
3746     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3747     for an assertion, however because it leads to incorrect effect for patterns
3748     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3749     of a firstbyte. This is overcome by a scan at the end if there's no
3750     firstbyte, looking for an asserted first char. */
3751
3752     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3753
3754     /* Now update the main code pointer to the end of the group. */
3755
3756     code = tempcode;
3757
3758     /* Error if hit end of pattern */
3759
3760     if (*ptr != ')')
3761       {
3762       *errorptr = ERR14;
3763       goto FAILED;
3764       }
3765     break;
3766
3767     /* Check \ for being a real metacharacter; if not, fall through and handle
3768     it as a data character at the start of a string. Escape items are checked
3769     for validity in the pre-compiling pass. */
3770
3771     case '\\':
3772     tempptr = ptr;
3773     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3774
3775     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3776     are arranged to be the negation of the corresponding OP_values. For the
3777     back references, the values are ESC_REF plus the reference number. Only
3778     back references and those types that consume a character may be repeated.
3779     We can test for values between ESC_b and ESC_Z for the latter; this may
3780     have to change if any new ones are ever created. */
3781
3782     if (c < 0)
3783       {
3784       if (-c == ESC_Q)            /* Handle start of quoted string */
3785         {
3786         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3787           else inescq = TRUE;
3788         continue;
3789         }
3790
3791       /* For metasequences that actually match a character, we disable the
3792       setting of a first character if it hasn't already been set. */
3793
3794       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3795         firstbyte = REQ_NONE;
3796
3797       /* Set values to reset to if this is followed by a zero repeat. */
3798
3799       zerofirstbyte = firstbyte;
3800       zeroreqbyte = reqbyte;
3801
3802       /* Back references are handled specially */
3803
3804       if (-c >= ESC_REF)
3805         {
3806         int number = -c - ESC_REF;
3807         previous = code;
3808         *code++ = OP_REF;
3809         PUT2INC(code, 0, number);
3810         }
3811
3812       /* So are Unicode property matches, if supported. We know that get_ucp
3813       won't fail because it was tested in the pre-pass. */
3814
3815 #ifdef SUPPORT_UCP
3816       else if (-c == ESC_P || -c == ESC_p)
3817         {
3818         BOOL negated;
3819         int value = get_ucp(&ptr, &negated, errorptr);
3820         previous = code;
3821         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3822         *code++ = value;
3823         }
3824 #endif
3825
3826       /* For the rest, we can obtain the OP value by negating the escape
3827       value */
3828
3829       else
3830         {
3831         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3832         *code++ = -c;
3833         }
3834       continue;
3835       }
3836
3837     /* We have a data character whose value is in c. In UTF-8 mode it may have
3838     a value > 127. We set its representation in the length/buffer, and then
3839     handle it as a data character. */
3840
3841 #ifdef SUPPORT_UTF8
3842     if (utf8 && c > 127)
3843       mclength = ord2utf8(c, mcbuffer);
3844     else
3845 #endif
3846
3847      {
3848      mcbuffer[0] = c;
3849      mclength = 1;
3850      }
3851
3852     goto ONE_CHAR;
3853
3854     /* Handle a literal character. It is guaranteed not to be whitespace or #
3855     when the extended flag is set. If we are in UTF-8 mode, it may be a
3856     multi-byte literal character. */
3857
3858     default:
3859     NORMAL_CHAR:
3860     mclength = 1;
3861     mcbuffer[0] = c;
3862
3863 #ifdef SUPPORT_UTF8
3864     if (utf8 && (c & 0xc0) == 0xc0)
3865       {
3866       while ((ptr[1] & 0xc0) == 0x80)
3867         mcbuffer[mclength++] = *(++ptr);
3868       }
3869 #endif
3870
3871     /* At this point we have the character's bytes in mcbuffer, and the length
3872     in mclength. When not in UTF-8 mode, the length is always 1. */
3873
3874     ONE_CHAR:
3875     previous = code;
3876     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3877     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3878
3879     /* Set the first and required bytes appropriately. If no previous first
3880     byte, set it from this character, but revert to none on a zero repeat.
3881     Otherwise, leave the firstbyte value alone, and don't change it on a zero
3882     repeat. */
3883
3884     if (firstbyte == REQ_UNSET)
3885       {
3886       zerofirstbyte = REQ_NONE;
3887       zeroreqbyte = reqbyte;
3888
3889       /* If the character is more than one byte long, we can set firstbyte
3890       only if it is not to be matched caselessly. */
3891
3892       if (mclength == 1 || req_caseopt == 0)
3893         {
3894         firstbyte = mcbuffer[0] | req_caseopt;
3895         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3896         }
3897       else firstbyte = reqbyte = REQ_NONE;
3898       }
3899
3900     /* firstbyte was previously set; we can set reqbyte only if the length is
3901     1 or the matching is caseful. */
3902
3903     else
3904       {
3905       zerofirstbyte = firstbyte;
3906       zeroreqbyte = reqbyte;
3907       if (mclength == 1 || req_caseopt == 0)
3908         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3909       }
3910
3911     break;            /* End of literal character handling */
3912     }
3913   }                   /* end of big loop */
3914
3915 /* Control never reaches here by falling through, only by a goto for all the
3916 error states. Pass back the position in the pattern so that it can be displayed
3917 to the user for diagnosing the error. */
3918
3919 FAILED:
3920 *ptrptr = ptr;
3921 return FALSE;
3922 }
3923
3924
3925
3926
3927 /*************************************************
3928 *     Compile sequence of alternatives           *
3929 *************************************************/
3930
3931 /* On entry, ptr is pointing past the bracket character, but on return
3932 it points to the closing bracket, or vertical bar, or end of string.
3933 The code variable is pointing at the byte into which the BRA operator has been
3934 stored. If the ims options are changed at the start (for a (?ims: group) or
3935 during any branch, we need to insert an OP_OPT item at the start of every
3936 following branch to ensure they get set correctly at run time, and also pass
3937 the new options into every subsequent branch compile.
3938
3939 Argument:
3940   options        option bits, including any changes for this subpattern
3941   oldims         previous settings of ims option bits
3942   brackets       -> int containing the number of extracting brackets used
3943   codeptr        -> the address of the current code pointer
3944   ptrptr         -> the address of the current pattern pointer
3945   errorptr       -> pointer to error message
3946   lookbehind     TRUE if this is a lookbehind assertion
3947   skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3948   firstbyteptr   place to put the first required character, or a negative number
3949   reqbyteptr     place to put the last required character, or a negative number
3950   bcptr          pointer to the chain of currently open branches
3951   cd             points to the data block with tables pointers etc.
3952
3953 Returns:      TRUE on success
3954 */
3955
3956 static BOOL
3957 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3958   const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3959   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3960 {
3961 const uschar *ptr = *ptrptr;
3962 uschar *code = *codeptr;
3963 uschar *last_branch = code;
3964 uschar *start_bracket = code;
3965 uschar *reverse_count = NULL;
3966 int firstbyte, reqbyte;
3967 int branchfirstbyte, branchreqbyte;
3968 branch_chain bc;
3969
3970 bc.outer = bcptr;
3971 bc.current = code;
3972
3973 firstbyte = reqbyte = REQ_UNSET;
3974
3975 /* Offset is set zero to mark that this bracket is still open */
3976
3977 PUT(code, 1, 0);
3978 code += 1 + LINK_SIZE + skipbytes;
3979
3980 /* Loop for each alternative branch */
3981
3982 for (;;)
3983   {
3984   /* Handle a change of ims options at the start of the branch */
3985
3986   if ((options & PCRE_IMS) != oldims)
3987     {
3988     *code++ = OP_OPT;
3989     *code++ = options & PCRE_IMS;
3990     }
3991
3992   /* Set up dummy OP_REVERSE if lookbehind assertion */
3993
3994   if (lookbehind)
3995     {
3996     *code++ = OP_REVERSE;
3997     reverse_count = code;
3998     PUTINC(code, 0, 0);
3999     }
4000
4001   /* Now compile the branch */
4002
4003   if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4004         &branchfirstbyte, &branchreqbyte, &bc, cd))
4005     {
4006     *ptrptr = ptr;
4007     return FALSE;
4008     }
4009
4010   /* If this is the first branch, the firstbyte and reqbyte values for the
4011   branch become the values for the regex. */
4012
4013   if (*last_branch != OP_ALT)
4014     {
4015     firstbyte = branchfirstbyte;
4016     reqbyte = branchreqbyte;
4017     }
4018
4019   /* If this is not the first branch, the first char and reqbyte have to
4020   match the values from all the previous branches, except that if the previous
4021   value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4022   REQ_VARY for the regex. */
4023
4024   else
4025     {
4026     /* If we previously had a firstbyte, but it doesn't match the new branch,
4027     we have to abandon the firstbyte for the regex, but if there was previously
4028     no reqbyte, it takes on the value of the old firstbyte. */
4029
4030     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4031       {
4032       if (reqbyte < 0) reqbyte = firstbyte;
4033       firstbyte = REQ_NONE;
4034       }
4035
4036     /* If we (now or from before) have no firstbyte, a firstbyte from the
4037     branch becomes a reqbyte if there isn't a branch reqbyte. */
4038
4039     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4040         branchreqbyte = branchfirstbyte;
4041
4042     /* Now ensure that the reqbytes match */
4043
4044     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4045       reqbyte = REQ_NONE;
4046     else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4047     }
4048
4049   /* If lookbehind, check that this branch matches a fixed-length string,
4050   and put the length into the OP_REVERSE item. Temporarily mark the end of
4051   the branch with OP_END. */
4052
4053   if (lookbehind)
4054     {
4055     int length;
4056     *code = OP_END;
4057     length = find_fixedlength(last_branch, options);
4058     DPRINTF(("fixed length = %d\n", length));
4059     if (length < 0)
4060       {
4061       *errorptr = (length == -2)? ERR36 : ERR25;
4062       *ptrptr = ptr;
4063       return FALSE;
4064       }
4065     PUT(reverse_count, 0, length);
4066     }
4067
4068   /* Reached end of expression, either ')' or end of pattern. Go back through
4069   the alternative branches and reverse the chain of offsets, with the field in
4070   the BRA item now becoming an offset to the first alternative. If there are
4071   no alternatives, it points to the end of the group. The length in the
4072   terminating ket is always the length of the whole bracketed item. If any of
4073   the ims options were changed inside the group, compile a resetting op-code
4074   following, except at the very end of the pattern. Return leaving the pointer
4075   at the terminating char. */
4076
4077   if (*ptr != '|')
4078     {
4079     int length = code - last_branch;
4080     do
4081       {
4082       int prev_length = GET(last_branch, 1);
4083       PUT(last_branch, 1, length);
4084       length = prev_length;
4085       last_branch -= length;
4086       }
4087     while (length > 0);
4088
4089     /* Fill in the ket */
4090
4091     *code = OP_KET;
4092     PUT(code, 1, code - start_bracket);
4093     code += 1 + LINK_SIZE;
4094
4095     /* Resetting option if needed */
4096
4097     if ((options & PCRE_IMS) != oldims && *ptr == ')')
4098       {
4099       *code++ = OP_OPT;
4100       *code++ = oldims;
4101       }
4102
4103     /* Set values to pass back */
4104
4105     *codeptr = code;
4106     *ptrptr = ptr;
4107     *firstbyteptr = firstbyte;
4108     *reqbyteptr = reqbyte;
4109     return TRUE;
4110     }
4111
4112   /* Another branch follows; insert an "or" node. Its length field points back
4113   to the previous branch while the bracket remains open. At the end the chain
4114   is reversed. It's done like this so that the start of the bracket has a
4115   zero offset until it is closed, making it possible to detect recursion. */
4116
4117   *code = OP_ALT;
4118   PUT(code, 1, code - last_branch);
4119   bc.current = last_branch = code;
4120   code += 1 + LINK_SIZE;
4121   ptr++;
4122   }
4123 /* Control never reaches here */
4124 }
4125
4126
4127
4128
4129 /*************************************************
4130 *          Check for anchored expression         *
4131 *************************************************/
4132
4133 /* Try to find out if this is an anchored regular expression. Consider each
4134 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4135 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4136 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4137 counts, since OP_CIRC can match in the middle.
4138
4139 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4140 This is the code for \G, which means "match at start of match position, taking
4141 into account the match offset".
4142
4143 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4144 because that will try the rest of the pattern at all possible matching points,
4145 so there is no point trying again.... er ....
4146
4147 .... except when the .* appears inside capturing parentheses, and there is a
4148 subsequent back reference to those parentheses. We haven't enough information
4149 to catch that case precisely.
4150
4151 At first, the best we could do was to detect when .* was in capturing brackets
4152 and the highest back reference was greater than or equal to that level.
4153 However, by keeping a bitmap of the first 31 back references, we can catch some
4154 of the more common cases more precisely.
4155
4156 Arguments:
4157   code           points to start of expression (the bracket)
4158   options        points to the options setting
4159   bracket_map    a bitmap of which brackets we are inside while testing; this
4160                   handles up to substring 31; after that we just have to take
4161                   the less precise approach
4162   backref_map    the back reference bitmap
4163
4164 Returns:     TRUE or FALSE
4165 */
4166
4167 static BOOL
4168 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4169   unsigned int backref_map)
4170 {
4171 do {
4172    const uschar *scode =
4173      first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4174    register int op = *scode;
4175
4176    /* Capturing brackets */
4177
4178    if (op > OP_BRA)
4179      {
4180      int new_map;
4181      op -= OP_BRA;
4182      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4183      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4184      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4185      }
4186
4187    /* Other brackets */
4188
4189    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4190      {
4191      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4192      }
4193
4194    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4195    are or may be referenced. */
4196
4197    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4198             (*options & PCRE_DOTALL) != 0)
4199      {
4200      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4201      }
4202
4203    /* Check for explicit anchoring */
4204
4205    else if (op != OP_SOD && op != OP_SOM &&
4206            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4207      return FALSE;
4208    code += GET(code, 1);
4209    }
4210 while (*code == OP_ALT);   /* Loop for each alternative */
4211 return TRUE;
4212 }
4213
4214
4215
4216 /*************************************************
4217 *         Check for starting with ^ or .*        *
4218 *************************************************/
4219
4220 /* This is called to find out if every branch starts with ^ or .* so that
4221 "first char" processing can be done to speed things up in multiline
4222 matching and for non-DOTALL patterns that start with .* (which must start at
4223 the beginning or after \n). As in the case of is_anchored() (see above), we
4224 have to take account of back references to capturing brackets that contain .*
4225 because in that case we can't make the assumption.
4226
4227 Arguments:
4228   code           points to start of expression (the bracket)
4229   bracket_map    a bitmap of which brackets we are inside while testing; this
4230                   handles up to substring 31; after that we just have to take
4231                   the less precise approach
4232   backref_map    the back reference bitmap
4233
4234 Returns:         TRUE or FALSE
4235 */
4236
4237 static BOOL
4238 is_startline(const uschar *code, unsigned int bracket_map,
4239   unsigned int backref_map)
4240 {
4241 do {
4242    const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4243      FALSE);
4244    register int op = *scode;
4245
4246    /* Capturing brackets */
4247
4248    if (op > OP_BRA)
4249      {
4250      int new_map;
4251      op -= OP_BRA;
4252      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4253      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4254      if (!is_startline(scode, new_map, backref_map)) return FALSE;
4255      }
4256
4257    /* Other brackets */
4258
4259    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4260      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4261
4262    /* .* means "start at start or after \n" if it isn't in brackets that
4263    may be referenced. */
4264
4265    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4266      {
4267      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4268      }
4269
4270    /* Check for explicit circumflex */
4271
4272    else if (op != OP_CIRC) return FALSE;
4273
4274    /* Move on to the next alternative */
4275
4276    code += GET(code, 1);
4277    }
4278 while (*code == OP_ALT);  /* Loop for each alternative */
4279 return TRUE;
4280 }
4281
4282
4283
4284 /*************************************************
4285 *       Check for asserted fixed first char      *
4286 *************************************************/
4287
4288 /* During compilation, the "first char" settings from forward assertions are
4289 discarded, because they can cause conflicts with actual literals that follow.
4290 However, if we end up without a first char setting for an unanchored pattern,
4291 it is worth scanning the regex to see if there is an initial asserted first
4292 char. If all branches start with the same asserted char, or with a bracket all
4293 of whose alternatives start with the same asserted char (recurse ad lib), then
4294 we return that char, otherwise -1.
4295
4296 Arguments:
4297   code       points to start of expression (the bracket)
4298   options    pointer to the options (used to check casing changes)
4299   inassert   TRUE if in an assertion
4300
4301 Returns:     -1 or the fixed first char
4302 */
4303
4304 static int
4305 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4306 {
4307 register int c = -1;
4308 do {
4309    int d;
4310    const uschar *scode =
4311      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4312    register int op = *scode;
4313
4314    if (op >= OP_BRA) op = OP_BRA;
4315
4316    switch(op)
4317      {
4318      default:
4319      return -1;
4320
4321      case OP_BRA:
4322      case OP_ASSERT:
4323      case OP_ONCE:
4324      case OP_COND:
4325      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4326        return -1;
4327      if (c < 0) c = d; else if (c != d) return -1;
4328      break;
4329
4330      case OP_EXACT:       /* Fall through */
4331      scode += 2;
4332
4333      case OP_CHAR:
4334      case OP_CHARNC:
4335      case OP_PLUS:
4336      case OP_MINPLUS:
4337      if (!inassert) return -1;
4338      if (c < 0)
4339        {
4340        c = scode[1];
4341        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4342        }
4343      else if (c != scode[1]) return -1;
4344      break;
4345      }
4346
4347    code += GET(code, 1);
4348    }
4349 while (*code == OP_ALT);
4350 return c;
4351 }
4352
4353
4354
4355
4356 #ifdef SUPPORT_UTF8
4357 /*************************************************
4358 *         Validate a UTF-8 string                *
4359 *************************************************/
4360
4361 /* This function is called (optionally) at the start of compile or match, to
4362 validate that a supposed UTF-8 string is actually valid. The early check means
4363 that subsequent code can assume it is dealing with a valid string. The check
4364 can be turned off for maximum performance, but then consequences of supplying
4365 an invalid string are then undefined.
4366
4367 Arguments:
4368   string       points to the string
4369   length       length of string, or -1 if the string is zero-terminated
4370
4371 Returns:       < 0    if the string is a valid UTF-8 string
4372                >= 0   otherwise; the value is the offset of the bad byte
4373 */
4374
4375 static int
4376 valid_utf8(const uschar *string, int length)
4377 {
4378 register const uschar *p;
4379
4380 if (length < 0)
4381   {
4382   for (p = string; *p != 0; p++);
4383   length = p - string;
4384   }
4385
4386 for (p = string; length-- > 0; p++)
4387   {
4388   register int ab;
4389   register int c = *p;
4390   if (c < 128) continue;
4391   if ((c & 0xc0) != 0xc0) return p - string;
4392   ab = utf8_table4[c & 0x3f];  /* Number of additional bytes */
4393   if (length < ab) return p - string;
4394   length -= ab;
4395
4396   /* Check top bits in the second byte */
4397   if ((*(++p) & 0xc0) != 0x80) return p - string;
4398
4399   /* Check for overlong sequences for each different length */
4400   switch (ab)
4401     {
4402     /* Check for xx00 000x */
4403     case 1:
4404     if ((c & 0x3e) == 0) return p - string;
4405     continue;   /* We know there aren't any more bytes to check */
4406
4407     /* Check for 1110 0000, xx0x xxxx */
4408     case 2:
4409     if (c == 0xe0 && (*p & 0x20) == 0) return p - string;
4410     break;
4411
4412     /* Check for 1111 0000, xx00 xxxx */
4413     case 3:
4414     if (c == 0xf0 && (*p & 0x30) == 0) return p - string;
4415     break;
4416
4417     /* Check for 1111 1000, xx00 0xxx */
4418     case 4:
4419     if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
4420     break;
4421
4422     /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
4423     case 5:
4424     if (c == 0xfe || c == 0xff ||
4425        (c == 0xfc && (*p & 0x3c) == 0)) return p - string;
4426     break;
4427     }
4428
4429   /* Check for valid bytes after the 2nd, if any; all must start 10 */
4430   while (--ab > 0)
4431     {
4432     if ((*(++p) & 0xc0) != 0x80) return p - string;
4433     }
4434   }
4435
4436 return -1;
4437 }
4438 #endif
4439
4440
4441
4442 /*************************************************
4443 *        Compile a Regular Expression            *
4444 *************************************************/
4445
4446 /* This function takes a string and returns a pointer to a block of store
4447 holding a compiled version of the expression.
4448
4449 Arguments:
4450   pattern      the regular expression
4451   options      various option bits
4452   errorptr     pointer to pointer to error text
4453   erroroffset  ptr offset in pattern where error was detected
4454   tables       pointer to character tables or NULL
4455
4456 Returns:       pointer to compiled data block, or NULL on error,
4457                with errorptr and erroroffset set
4458 */
4459
4460 EXPORT pcre *
4461 pcre_compile(const char *pattern, int options, const char **errorptr,
4462   int *erroroffset, const unsigned char *tables)
4463 {
4464 real_pcre *re;
4465 int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
4466 int c, firstbyte, reqbyte;
4467 int bracount = 0;
4468 int branch_extra = 0;
4469 int branch_newextra;
4470 int item_count = -1;
4471 int name_count = 0;
4472 int max_name_size = 0;
4473 int lastitemlength = 0;
4474 #ifdef SUPPORT_UTF8
4475 BOOL utf8;
4476 BOOL class_utf8;
4477 #endif
4478 BOOL inescq = FALSE;
4479 unsigned int brastackptr = 0;
4480 size_t size;
4481 uschar *code;
4482 const uschar *codestart;
4483 const uschar *ptr;
4484 compile_data compile_block;
4485 int brastack[BRASTACK_SIZE];
4486 uschar bralenstack[BRASTACK_SIZE];
4487
4488 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4489 can do is just return NULL. */
4490
4491 if (errorptr == NULL) return NULL;
4492 *errorptr = NULL;
4493
4494 /* However, we can give a message for this error */
4495
4496 if (erroroffset == NULL)
4497   {
4498   *errorptr = ERR16;
4499   return NULL;
4500   }
4501 *erroroffset = 0;
4502
4503 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4504
4505 #ifdef SUPPORT_UTF8
4506 utf8 = (options & PCRE_UTF8) != 0;
4507 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4508      (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4509   {
4510   *errorptr = ERR44;
4511   return NULL;
4512   }
4513 #else
4514 if ((options & PCRE_UTF8) != 0)
4515   {
4516   *errorptr = ERR32;
4517   return NULL;
4518   }
4519 #endif
4520
4521 if ((options & ~PUBLIC_OPTIONS) != 0)
4522   {
4523   *errorptr = ERR17;
4524   return NULL;
4525   }
4526
4527 /* Set up pointers to the individual character tables */
4528
4529 if (tables == NULL) tables = pcre_default_tables;
4530 compile_block.lcc = tables + lcc_offset;
4531 compile_block.fcc = tables + fcc_offset;
4532 compile_block.cbits = tables + cbits_offset;
4533 compile_block.ctypes = tables + ctypes_offset;
4534
4535 /* Maximum back reference and backref bitmap. This is updated for numeric
4536 references during the first pass, but for named references during the actual
4537 compile pass. The bitmap records up to 31 back references to help in deciding
4538 whether (.*) can be treated as anchored or not. */
4539
4540 compile_block.top_backref = 0;
4541 compile_block.backref_map = 0;
4542
4543 /* Reflect pattern for debugging output */
4544
4545 DPRINTF(("------------------------------------------------------------------\n"));
4546 DPRINTF(("%s\n", pattern));
4547
4548 /* The first thing to do is to make a pass over the pattern to compute the
4549 amount of store required to hold the compiled code. This does not have to be
4550 perfect as long as errors are overestimates. At the same time we can detect any
4551 flag settings right at the start, and extract them. Make an attempt to correct
4552 for any counted white space if an "extended" flag setting appears late in the
4553 pattern. We can't be so clever for #-comments. */
4554
4555 ptr = (const uschar *)(pattern - 1);
4556 while ((c = *(++ptr)) != 0)
4557   {
4558   int min, max;
4559   int class_optcount;
4560   int bracket_length;
4561   int duplength;
4562
4563   /* If we are inside a \Q...\E sequence, all chars are literal */
4564
4565   if (inescq)
4566     {
4567     if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4568     goto NORMAL_CHAR;
4569     }
4570
4571   /* Otherwise, first check for ignored whitespace and comments */
4572
4573   if ((options & PCRE_EXTENDED) != 0)
4574     {
4575     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4576     if (c == '#')
4577       {
4578       /* The space before the ; is to avoid a warning on a silly compiler
4579       on the Macintosh. */
4580       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4581       if (c == 0) break;
4582       continue;
4583       }
4584     }
4585
4586   item_count++;    /* Is zero for the first non-comment item */
4587
4588   /* Allow space for auto callout before every item except quantifiers. */
4589
4590   if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4591        c != '*' && c != '+' && c != '?' &&
4592        (c != '{' || !is_counted_repeat(ptr + 1)))
4593     length += 2 + 2*LINK_SIZE;
4594
4595   switch(c)
4596     {
4597     /* A backslashed item may be an escaped data character or it may be a
4598     character type. */
4599
4600     case '\\':
4601     c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4602     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4603
4604     lastitemlength = 1;     /* Default length of last item for repeats */
4605
4606     if (c >= 0)             /* Data character */
4607       {
4608       length += 2;          /* For a one-byte character */
4609
4610 #ifdef SUPPORT_UTF8
4611       if (utf8 && c > 127)
4612         {
4613         int i;
4614         for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4615           if (c <= utf8_table1[i]) break;
4616         length += i;
4617         lastitemlength += i;
4618         }
4619 #endif
4620
4621       continue;
4622       }
4623
4624     /* If \Q, enter "literal" mode */
4625
4626     if (-c == ESC_Q)
4627       {
4628       inescq = TRUE;
4629       continue;
4630       }
4631
4632     /* \X is supported only if Unicode property support is compiled */
4633
4634 #ifndef SUPPORT_UCP
4635     if (-c == ESC_X)
4636       {
4637       *errorptr = ERR45;
4638       goto PCRE_ERROR_RETURN;
4639       }
4640 #endif
4641
4642     /* \P and \p are for Unicode properties, but only when the support has
4643     been compiled. Each item needs 2 bytes. */
4644
4645     else if (-c == ESC_P || -c == ESC_p)
4646       {
4647 #ifdef SUPPORT_UCP
4648       BOOL negated;
4649       length += 2;
4650       lastitemlength = 2;
4651       if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4652       continue;
4653 #else
4654       *errorptr = ERR45;
4655       goto PCRE_ERROR_RETURN;
4656 #endif
4657       }
4658
4659     /* Other escapes need one byte */
4660
4661     length++;
4662
4663     /* A back reference needs an additional 2 bytes, plus either one or 5
4664     bytes for a repeat. We also need to keep the value of the highest
4665     back reference. */
4666
4667     if (c <= -ESC_REF)
4668       {
4669       int refnum = -c - ESC_REF;
4670       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4671       if (refnum > compile_block.top_backref)
4672         compile_block.top_backref = refnum;
4673       length += 2;   /* For single back reference */
4674       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4675         {
4676         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4677         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4678         if ((min == 0 && (max == 1 || max == -1)) ||
4679           (min == 1 && max == -1))
4680             length++;
4681         else length += 5;
4682         if (ptr[1] == '?') ptr++;
4683         }
4684       }
4685     continue;
4686
4687     case '^':     /* Single-byte metacharacters */
4688     case '.':
4689     case '$':
4690     length++;
4691     lastitemlength = 1;
4692     continue;
4693
4694     case '*':            /* These repeats won't be after brackets; */
4695     case '+':            /* those are handled separately */
4696     case '?':
4697     length++;
4698     goto POSESSIVE;      /* A few lines below */
4699
4700     /* This covers the cases of braced repeats after a single char, metachar,
4701     class, or back reference. */
4702
4703     case '{':
4704     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4705     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4706     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4707
4708     /* These special cases just insert one extra opcode */
4709
4710     if ((min == 0 && (max == 1 || max == -1)) ||
4711       (min == 1 && max == -1))
4712         length++;
4713
4714     /* These cases might insert additional copies of a preceding character. */
4715
4716     else
4717       {
4718       if (min != 1)
4719         {
4720         length -= lastitemlength;   /* Uncount the original char or metachar */
4721         if (min > 0) length += 3 + lastitemlength;
4722         }
4723       length += lastitemlength + ((max > 0)? 3 : 1);
4724       }
4725
4726     if (ptr[1] == '?') ptr++;      /* Needs no extra length */
4727
4728     POSESSIVE:                     /* Test for possessive quantifier */
4729     if (ptr[1] == '+')
4730       {
4731       ptr++;
4732       length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
4733       }
4734     continue;
4735
4736     /* An alternation contains an offset to the next branch or ket. If any ims
4737     options changed in the previous branch(es), and/or if we are in a
4738     lookbehind assertion, extra space will be needed at the start of the
4739     branch. This is handled by branch_extra. */
4740
4741     case '|':
4742     length += 1 + LINK_SIZE + branch_extra;
4743     continue;
4744
4745     /* A character class uses 33 characters provided that all the character
4746     values are less than 256. Otherwise, it uses a bit map for low valued
4747     characters, and individual items for others. Don't worry about character
4748     types that aren't allowed in classes - they'll get picked up during the
4749     compile. A character class that contains only one single-byte character
4750     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4751     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4752
4753     case '[':
4754     if (*(++ptr) == '^')
4755       {
4756       class_optcount = 10;  /* Greater than one */
4757       ptr++;
4758       }
4759     else class_optcount = 0;
4760
4761 #ifdef SUPPORT_UTF8
4762     class_utf8 = FALSE;
4763 #endif
4764
4765     /* Written as a "do" so that an initial ']' is taken as data */
4766
4767     if (*ptr != 0) do
4768       {
4769       /* Inside \Q...\E everything is literal except \E */
4770
4771       if (inescq)
4772         {
4773         if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4774         inescq = FALSE;
4775         ptr += 1;
4776         continue;
4777         }
4778
4779       /* Outside \Q...\E, check for escapes */
4780
4781       if (*ptr == '\\')
4782         {
4783         c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4784         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4785
4786         /* \b is backspace inside a class; \X is literal */
4787
4788         if (-c == ESC_b) c = '\b';
4789         else if (-c == ESC_X) c = 'X';
4790
4791         /* \Q enters quoting mode */
4792
4793         else if (-c == ESC_Q)
4794           {
4795           inescq = TRUE;
4796           continue;
4797           }
4798
4799         /* Handle escapes that turn into characters */
4800
4801         if (c >= 0) goto NON_SPECIAL_CHARACTER;
4802
4803         /* Escapes that are meta-things. The normal ones just affect the
4804         bit map, but Unicode properties require an XCLASS extended item. */
4805
4806         else
4807           {
4808           class_optcount = 10;         /* \d, \s etc; make sure > 1 */
4809 #ifdef SUPPORT_UTF8
4810           if (-c == ESC_p || -c == ESC_P)
4811             {
4812             if (!class_utf8)
4813               {
4814               class_utf8 = TRUE;
4815               length += LINK_SIZE + 2;
4816               }
4817             length += 2;
4818             }
4819 #endif
4820           }
4821         }
4822
4823       /* Check the syntax for POSIX stuff. The bits we actually handle are
4824       checked during the real compile phase. */
4825
4826       else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4827         {
4828         ptr++;
4829         class_optcount = 10;    /* Make sure > 1 */
4830         }
4831
4832       /* Anything else increments the possible optimization count. We have to
4833       detect ranges here so that we can compute the number of extra ranges for
4834       caseless wide characters when UCP support is available. If there are wide
4835       characters, we are going to have to use an XCLASS, even for single
4836       characters. */
4837
4838       else
4839         {
4840         int d;
4841
4842         GET_ONE_CHARACTER:
4843
4844 #ifdef SUPPORT_UTF8
4845         if (utf8)
4846           {
4847           int extra = 0;
4848           GETCHARLEN(c, ptr, extra);
4849           ptr += extra;
4850           }
4851         else c = *ptr;
4852 #else
4853         c = *ptr;
4854 #endif
4855
4856         /* Come here from handling \ above when it escapes to a char value */
4857
4858         NON_SPECIAL_CHARACTER:
4859         class_optcount++;
4860
4861         d = -1;
4862         if (ptr[1] == '-')
4863           {
4864           uschar const *hyptr = ptr++;
4865           if (ptr[1] == '\\')
4866             {
4867             ptr++;
4868             d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4869             if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4870             if (-d == ESC_b) d = '\b';        /* backspace */
4871             else if (-d == ESC_X) d = 'X';    /* literal X in a class */
4872             }
4873           else if (ptr[1] != 0 && ptr[1] != ']')
4874             {
4875             ptr++;
4876 #ifdef SUPPORT_UTF8
4877             if (utf8)
4878               {
4879               int extra = 0;
4880               GETCHARLEN(d, ptr, extra);
4881               ptr += extra;
4882               }
4883             else
4884 #endif
4885             d = *ptr;
4886             }
4887           if (d < 0) ptr = hyptr;      /* go back to hyphen as data */
4888           }
4889
4890         /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4891         127 for caseless matching, we will need to use an XCLASS. */
4892
4893         if (d >= 0)
4894           {
4895           class_optcount = 10;     /* Ensure > 1 */
4896           if (d < c)
4897             {
4898             *errorptr = ERR8;
4899             goto PCRE_ERROR_RETURN;
4900             }
4901
4902 #ifdef SUPPORT_UTF8
4903           if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4904             {
4905             uschar buffer[6];
4906             if (!class_utf8)         /* Allow for XCLASS overhead */
4907               {
4908               class_utf8 = TRUE;
4909               length += LINK_SIZE + 2;
4910               }
4911
4912 #ifdef SUPPORT_UCP
4913             /* If we have UCP support, find out how many extra ranges are
4914             needed to map the other case of characters within this range. We
4915             have to mimic the range optimization here, because extending the
4916             range upwards might push d over a boundary that makes it use
4917             another byte in the UTF-8 representation. */
4918
4919             if ((options & PCRE_CASELESS) != 0)
4920               {
4921               int occ, ocd;
4922               int cc = c;
4923               int origd = d;
4924               while (get_othercase_range(&cc, origd, &occ, &ocd))
4925                 {
4926                 if (occ >= c && ocd <= d) continue;   /* Skip embedded */
4927
4928                 if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
4929                   {                            /* if there is overlap,   */
4930                   c = occ;                     /* noting that if occ < c */
4931                   continue;                    /* we can't have ocd > d  */
4932                   }                            /* because a subrange is  */
4933                 if (ocd > d && occ <= d + 1)   /* always shorter than    */
4934                   {                            /* the basic range.       */
4935                   d = ocd;
4936                   continue;
4937                   }
4938
4939                 /* An extra item is needed */
4940
4941                 length += 1 + ord2utf8(occ, buffer) +
4942                   ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4943                 }
4944               }
4945 #endif  /* SUPPORT_UCP */
4946
4947             /* The length of the (possibly extended) range */
4948
4949             length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4950             }
4951 #endif  /* SUPPORT_UTF8 */
4952
4953           }
4954
4955         /* We have a single character. There is nothing to be done unless we
4956         are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4957         allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4958         support. */
4959
4960         else
4961           {
4962 #ifdef SUPPORT_UTF8
4963           if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4964             {
4965             uschar buffer[6];
4966             class_optcount = 10;     /* Ensure > 1 */
4967             if (!class_utf8)         /* Allow for XCLASS overhead */
4968               {
4969               class_utf8 = TRUE;
4970               length += LINK_SIZE + 2;
4971               }
4972 #ifdef SUPPORT_UCP
4973             length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4974               (1 + ord2utf8(c, buffer));
4975 #else   /* SUPPORT_UCP */
4976             length += 1 + ord2utf8(c, buffer);
4977 #endif  /* SUPPORT_UCP */
4978             }
4979 #endif  /* SUPPORT_UTF8 */
4980           }
4981         }
4982       }
4983     while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4984
4985     if (*ptr == 0)                          /* Missing terminating ']' */
4986       {
4987       *errorptr = ERR6;
4988       goto PCRE_ERROR_RETURN;
4989       }
4990
4991     /* We can optimize when there was only one optimizable character. Repeats
4992     for positive and negated single one-byte chars are handled by the general
4993     code. Here, we handle repeats for the class opcodes. */
4994
4995     if (class_optcount == 1) length += 3; else
4996       {
4997       length += 33;
4998
4999       /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
5000       we also need extra for wrapping the whole thing in a sub-pattern. */
5001
5002       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5003         {
5004         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5005         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5006         if ((min == 0 && (max == 1 || max == -1)) ||
5007           (min == 1 && max == -1))
5008             length++;
5009         else length += 5;
5010         if (ptr[1] == '+')
5011           {
5012           ptr++;
5013           length += 2 + 2*LINK_SIZE;
5014           }
5015         else if (ptr[1] == '?') ptr++;
5016         }
5017       }
5018     continue;
5019
5020     /* Brackets may be genuine groups or special things */
5021
5022     case '(':
5023     branch_newextra = 0;
5024     bracket_length = 1 + LINK_SIZE;
5025
5026     /* Handle special forms of bracket, which all start (? */
5027
5028     if (ptr[1] == '?')
5029       {
5030       int set, unset;
5031       int *optset;
5032
5033       switch (c = ptr[2])
5034         {
5035         /* Skip over comments entirely */
5036         case '#':
5037         ptr += 3;
5038         while (*ptr != 0 && *ptr != ')') ptr++;
5039         if (*ptr == 0)
5040           {
5041           *errorptr = ERR18;
5042           goto PCRE_ERROR_RETURN;
5043           }
5044         continue;
5045
5046         /* Non-referencing groups and lookaheads just move the pointer on, and
5047         then behave like a non-special bracket, except that they don't increment
5048         the count of extracting brackets. Ditto for the "once only" bracket,
5049         which is in Perl from version 5.005. */
5050
5051         case ':':
5052         case '=':
5053         case '!':
5054         case '>':
5055         ptr += 2;
5056         break;
5057
5058         /* (?R) specifies a recursive call to the regex, which is an extension
5059         to provide the facility which can be obtained by (?p{perl-code}) in
5060         Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5061
5062         From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5063         the appropriate numbered brackets. This includes both recursive and
5064         non-recursive calls. (?R) is now synonymous with (?0). */
5065
5066         case 'R':
5067         ptr++;
5068
5069         case '0': case '1': case '2': case '3': case '4':
5070         case '5': case '6': case '7': case '8': case '9':
5071         ptr += 2;
5072         if (c != 'R')
5073           while ((digitab[*(++ptr)] & ctype_digit) != 0);
5074         if (*ptr != ')')
5075           {
5076           *errorptr = ERR29;
5077           goto PCRE_ERROR_RETURN;
5078           }
5079         length += 1 + LINK_SIZE;
5080
5081         /* If this item is quantified, it will get wrapped inside brackets so
5082         as to use the code for quantified brackets. We jump down and use the
5083         code that handles this for real brackets. */
5084
5085         if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5086           {
5087           length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
5088           duplength = 5 + 3 * LINK_SIZE;
5089           goto HANDLE_QUANTIFIED_BRACKETS;
5090           }
5091         continue;
5092
5093         /* (?C) is an extension which provides "callout" - to provide a bit of
5094         the functionality of the Perl (?{...}) feature. An optional number may
5095         follow (default is zero). */
5096
5097         case 'C':
5098         ptr += 2;
5099         while ((digitab[*(++ptr)] & ctype_digit) != 0);
5100         if (*ptr != ')')
5101           {
5102           *errorptr = ERR39;
5103           goto PCRE_ERROR_RETURN;
5104           }
5105         length += 2 + 2*LINK_SIZE;
5106         continue;
5107
5108         /* Named subpatterns are an extension copied from Python */
5109
5110         case 'P':
5111         ptr += 3;
5112         if (*ptr == '<')
5113           {
5114           const uschar *p;    /* Don't amalgamate; some compilers */
5115           p = ++ptr;          /* grumble at autoincrement in declaration */
5116           while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5117           if (*ptr != '>')
5118             {
5119             *errorptr = ERR42;
5120             goto PCRE_ERROR_RETURN;
5121             }
5122           name_count++;
5123           if (ptr - p > max_name_size) max_name_size = (ptr - p);
5124           break;
5125           }
5126
5127         if (*ptr == '=' || *ptr == '>')
5128           {
5129           while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5130           if (*ptr != ')')
5131             {
5132             *errorptr = ERR42;
5133             goto PCRE_ERROR_RETURN;
5134             }
5135           break;
5136           }
5137
5138         /* Unknown character after (?P */
5139
5140         *errorptr = ERR41;
5141         goto PCRE_ERROR_RETURN;
5142
5143         /* Lookbehinds are in Perl from version 5.005 */
5144
5145         case '<':
5146         ptr += 3;
5147         if (*ptr == '=' || *ptr == '!')
5148           {
5149           branch_newextra = 1 + LINK_SIZE;
5150           length += 1 + LINK_SIZE;         /* For the first branch */
5151           break;
5152           }
5153         *errorptr = ERR24;
5154         goto PCRE_ERROR_RETURN;
5155
5156         /* Conditionals are in Perl from version 5.005. The bracket must either
5157         be followed by a number (for bracket reference) or by an assertion
5158         group, or (a PCRE extension) by 'R' for a recursion test. */
5159
5160         case '(':
5161         if (ptr[3] == 'R' && ptr[4] == ')')
5162           {
5163           ptr += 4;
5164           length += 3;
5165           }
5166         else if ((digitab[ptr[3]] & ctype_digit) != 0)
5167           {
5168           ptr += 4;
5169           length += 3;
5170           while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5171           if (*ptr != ')')
5172             {
5173             *errorptr = ERR26;
5174             goto PCRE_ERROR_RETURN;
5175             }
5176           }
5177         else   /* An assertion must follow */
5178           {
5179           ptr++;   /* Can treat like ':' as far as spacing is concerned */
5180           if (ptr[2] != '?' ||
5181              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5182             {
5183             ptr += 2;    /* To get right offset in message */
5184             *errorptr = ERR28;
5185             goto PCRE_ERROR_RETURN;
5186             }
5187           }
5188         break;
5189
5190         /* Else loop checking valid options until ) is met. Anything else is an
5191         error. If we are without any brackets, i.e. at top level, the settings
5192         act as if specified in the options, so massage the options immediately.
5193         This is for backward compatibility with Perl 5.004. */
5194
5195         default:
5196         set = unset = 0;
5197         optset = &set;
5198         ptr += 2;
5199
5200         for (;; ptr++)
5201           {
5202           c = *ptr;
5203           switch (c)
5204             {
5205             case 'i':
5206             *optset |= PCRE_CASELESS;
5207             continue;
5208
5209             case 'm':
5210             *optset |= PCRE_MULTILINE;
5211             continue;
5212
5213             case 's':
5214             *optset |= PCRE_DOTALL;
5215             continue;
5216
5217             case 'x':
5218             *optset |= PCRE_EXTENDED;
5219             continue;
5220
5221             case 'X':
5222             *optset |= PCRE_EXTRA;
5223             continue;
5224
5225             case 'U':
5226             *optset |= PCRE_UNGREEDY;
5227             continue;
5228
5229             case '-':
5230             optset = &unset;
5231             continue;
5232
5233             /* A termination by ')' indicates an options-setting-only item; if
5234             this is at the very start of the pattern (indicated by item_count
5235             being zero), we use it to set the global options. This is helpful
5236             when analyzing the pattern for first characters, etc. Otherwise
5237             nothing is done here and it is handled during the compiling
5238             process.
5239
5240             [Historical note: Up to Perl 5.8, options settings at top level
5241             were always global settings, wherever they appeared in the pattern.
5242             That is, they were equivalent to an external setting. From 5.8
5243             onwards, they apply only to what follows (which is what you might
5244             expect).] */
5245
5246             case ')':
5247             if (item_count == 0)
5248               {
5249               options = (options | set) & (~unset);
5250               set = unset = 0;     /* To save length */
5251               item_count--;        /* To allow for several */
5252               }
5253
5254             /* Fall through */
5255
5256             /* A termination by ':' indicates the start of a nested group with
5257             the given options set. This is again handled at compile time, but
5258             we must allow for compiled space if any of the ims options are
5259             set. We also have to allow for resetting space at the end of
5260             the group, which is why 4 is added to the length and not just 2.
5261             If there are several changes of options within the same group, this
5262             will lead to an over-estimate on the length, but this shouldn't
5263             matter very much. We also have to allow for resetting options at
5264             the start of any alternations, which we do by setting
5265             branch_newextra to 2. Finally, we record whether the case-dependent
5266             flag ever changes within the regex. This is used by the "required
5267             character" code. */
5268
5269             case ':':
5270             if (((set|unset) & PCRE_IMS) != 0)
5271               {
5272               length += 4;
5273               branch_newextra = 2;
5274               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5275               }
5276             goto END_OPTIONS;
5277
5278             /* Unrecognized option character */
5279
5280             default:
5281             *errorptr = ERR12;
5282             goto PCRE_ERROR_RETURN;
5283             }
5284           }
5285
5286         /* If we hit a closing bracket, that's it - this is a freestanding
5287         option-setting. We need to ensure that branch_extra is updated if
5288         necessary. The only values branch_newextra can have here are 0 or 2.
5289         If the value is 2, then branch_extra must either be 2 or 5, depending
5290         on whether this is a lookbehind group or not. */
5291
5292         END_OPTIONS:
5293         if (c == ')')
5294           {
5295           if (branch_newextra == 2 &&
5296               (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5297             branch_extra += branch_newextra;
5298           continue;
5299           }
5300
5301         /* If options were terminated by ':' control comes here. Fall through
5302         to handle the group below. */
5303         }
5304       }
5305
5306     /* Extracting brackets must be counted so we can process escapes in a
5307     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5308     need an additional 3 bytes of store per extracting bracket. However, if
5309     PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5310     must leave the count alone (it will always be zero). */
5311
5312     else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5313       {
5314       bracount++;
5315       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5316       }
5317
5318     /* Save length for computing whole length at end if there's a repeat that
5319     requires duplication of the group. Also save the current value of
5320     branch_extra, and start the new group with the new value. If non-zero, this
5321     will be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
5322
5323     if (brastackptr >= sizeof(brastack)/sizeof(int))
5324       {
5325       *errorptr = ERR19;
5326       goto PCRE_ERROR_RETURN;
5327       }
5328
5329     bralenstack[brastackptr] = branch_extra;
5330     branch_extra = branch_newextra;
5331
5332     brastack[brastackptr++] = length;
5333     length += bracket_length;
5334     continue;
5335
5336     /* Handle ket. Look for subsequent max/min; for certain sets of values we
5337     have to replicate this bracket up to that many times. If brastackptr is
5338     0 this is an unmatched bracket which will generate an error, but take care
5339     not to try to access brastack[-1] when computing the length and restoring
5340     the branch_extra value. */
5341
5342     case ')':
5343     length += 1 + LINK_SIZE;
5344     if (brastackptr > 0)
5345       {
5346       duplength = length - brastack[--brastackptr];
5347       branch_extra = bralenstack[brastackptr];
5348       }
5349     else duplength = 0;
5350
5351     /* The following code is also used when a recursion such as (?3) is
5352     followed by a quantifier, because in that case, it has to be wrapped inside
5353     brackets so that the quantifier works. The value of duplength must be
5354     set before arrival. */
5355
5356     HANDLE_QUANTIFIED_BRACKETS:
5357
5358     /* Leave ptr at the final char; for read_repeat_counts this happens
5359     automatically; for the others we need an increment. */
5360
5361     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5362       {
5363       ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5364       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5365       }
5366     else if (c == '*') { min = 0; max = -1; ptr++; }
5367     else if (c == '+') { min = 1; max = -1; ptr++; }
5368     else if (c == '?') { min = 0; max = 1;  ptr++; }
5369     else { min = 1; max = 1; }
5370
5371     /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5372     group, and if the maximum is greater than zero, we have to replicate
5373     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5374     bracket set. */
5375
5376     if (min == 0)
5377       {
5378       length++;
5379       if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5380       }
5381
5382     /* When the minimum is greater than zero, we have to replicate up to
5383     minval-1 times, with no additions required in the copies. Then, if there
5384     is a limited maximum we have to replicate up to maxval-1 times allowing
5385     for a BRAZERO item before each optional copy and nesting brackets for all
5386     but one of the optional copies. */
5387
5388     else
5389       {
5390       length += (min - 1) * duplength;
5391       if (max > min)   /* Need this test as max=-1 means no limit */
5392         length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5393           - (2 + 2*LINK_SIZE);
5394       }
5395
5396     /* Allow space for once brackets for "possessive quantifier" */
5397
5398     if (ptr[1] == '+')
5399       {
5400       ptr++;
5401       length += 2 + 2*LINK_SIZE;
5402       }
5403     continue;
5404
5405     /* Non-special character. It won't be space or # in extended mode, so it is
5406     always a genuine character. If we are in a \Q...\E sequence, check for the
5407     end; if not, we have a literal. */
5408
5409     default:
5410     NORMAL_CHAR:
5411
5412     if (inescq && c == '\\' && ptr[1] == 'E')
5413       {
5414       inescq = FALSE;
5415       ptr++;
5416       continue;
5417       }
5418
5419     length += 2;          /* For a one-byte character */
5420     lastitemlength = 1;   /* Default length of last item for repeats */
5421
5422     /* In UTF-8 mode, check for additional bytes. */
5423
5424 #ifdef SUPPORT_UTF8
5425     if (utf8 && (c & 0xc0) == 0xc0)
5426       {
5427       while ((ptr[1] & 0xc0) == 0x80)         /* Can't flow over the end */
5428         {                                     /* because the end is marked */
5429         lastitemlength++;                     /* by a zero byte. */
5430         length++;
5431         ptr++;
5432         }
5433       }
5434 #endif
5435
5436     continue;
5437     }
5438   }
5439
5440 length += 2 + LINK_SIZE;    /* For final KET and END */
5441
5442 if ((options & PCRE_AUTO_CALLOUT) != 0)
5443   length += 2 + 2*LINK_SIZE;  /* For final callout */
5444
5445 if (length > MAX_PATTERN_SIZE)
5446   {
5447   *errorptr = ERR20;
5448   return NULL;
5449   }
5450
5451 /* Compute the size of data block needed and get it, either from malloc or
5452 externally provided function. */
5453
5454 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5455 re = (real_pcre *)(pcre_malloc)(size);
5456
5457 if (re == NULL)
5458   {
5459   *errorptr = ERR21;
5460   return NULL;
5461   }
5462
5463 /* Put in the magic number, and save the sizes, options, and character table
5464 pointer. NULL is used for the default character tables. The nullpad field is at
5465 the end; it's there to help in the case when a regex compiled on a system with
5466 4-byte pointers is run on another with 8-byte pointers. */
5467
5468 re->magic_number = MAGIC_NUMBER;
5469 re->size = size;
5470 re->options = options;
5471 re->dummy1 = re->dummy2 = 0;
5472 re->name_table_offset = sizeof(real_pcre);
5473 re->name_entry_size = max_name_size + 3;
5474 re->name_count = name_count;
5475 re->tables = (tables == pcre_default_tables)? NULL : tables;
5476 re->nullpad = NULL;
5477
5478 /* The starting points of the name/number translation table and of the code are
5479 passed around in the compile data block. */
5480
5481 compile_block.names_found = 0;
5482 compile_block.name_entry_size = max_name_size + 3;
5483 compile_block.name_table = (uschar *)re + re->name_table_offset;
5484 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5485 compile_block.start_code = codestart;
5486 compile_block.start_pattern = (const uschar *)pattern;
5487 compile_block.req_varyopt = 0;
5488 compile_block.nopartial = FALSE;
5489
5490 /* Set up a starting, non-extracting bracket, then compile the expression. On
5491 error, *errorptr will be set non-NULL, so we don't need to look at the result
5492 of the function here. */
5493
5494 ptr = (const uschar *)pattern;
5495 code = (uschar *)codestart;
5496 *code = OP_BRA;
5497 bracount = 0;
5498 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5499   errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5500 re->top_bracket = bracount;
5501 re->top_backref = compile_block.top_backref;
5502
5503 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5504
5505 /* If not reached end of pattern on success, there's an excess bracket. */
5506
5507 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5508
5509 /* Fill in the terminating state and check for disastrous overflow, but
5510 if debugging, leave the test till after things are printed out. */
5511
5512 *code++ = OP_END;
5513
5514 #ifndef DEBUG
5515 if (code - codestart > length) *errorptr = ERR23;
5516 #endif
5517
5518 /* Give an error if there's back reference to a non-existent capturing
5519 subpattern. */
5520
5521 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5522
5523 /* Failed to compile, or error while post-processing */
5524
5525 if (*errorptr != NULL)
5526   {
5527   (pcre_free)(re);
5528   PCRE_ERROR_RETURN:
5529   *erroroffset = ptr - (const uschar *)pattern;
5530   return NULL;
5531   }
5532
5533 /* If the anchored option was not passed, set the flag if we can determine that
5534 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5535 as starting with .* when DOTALL is set).
5536
5537 Otherwise, if we know what the first character has to be, save it, because that
5538 speeds up unanchored matches no end. If not, see if we can set the
5539 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5540 start with ^, and also when all branches start with .* for non-DOTALL matches.
5541 */
5542
5543 if ((options & PCRE_ANCHORED) == 0)
5544   {
5545   int temp_options = options;
5546   if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5547     re->options |= PCRE_ANCHORED;
5548   else
5549     {
5550     if (firstbyte < 0)
5551       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5552     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
5553       {
5554       int ch = firstbyte & 255;
5555       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5556          compile_block.fcc[ch] == ch)? ch : firstbyte;
5557       re->options |= PCRE_FIRSTSET;
5558       }
5559     else if (is_startline(codestart, 0, compile_block.backref_map))
5560       re->options |= PCRE_STARTLINE;
5561     }
5562   }
5563
5564 /* For an anchored pattern, we use the "required byte" only if it follows a
5565 variable length item in the regex. Remove the caseless flag for non-caseable
5566 bytes. */
5567
5568 if (reqbyte >= 0 &&
5569      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5570   {
5571   int ch = reqbyte & 255;
5572   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5573     compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5574   re->options |= PCRE_REQCHSET;
5575   }
5576
5577 /* Print out the compiled data for debugging */
5578
5579 #ifdef DEBUG
5580
5581 printf("Length = %d top_bracket = %d top_backref = %d\n",
5582   length, re->top_bracket, re->top_backref);
5583
5584 if (re->options != 0)
5585   {
5586   printf("%s%s%s%s%s%s%s%s%s%s\n",
5587     ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5588     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5589     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5590     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5591     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5592     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5593     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5594     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5595     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5596     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5597   }
5598
5599 if ((re->options & PCRE_FIRSTSET) != 0)
5600   {
5601   int ch = re->first_byte & 255;
5602   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5603   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5604     else printf("First char = \\x%02x%s\n", ch, caseless);
5605   }
5606
5607 if ((re->options & PCRE_REQCHSET) != 0)
5608   {
5609   int ch = re->req_byte & 255;
5610   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5611   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5612     else printf("Req char = \\x%02x%s\n", ch, caseless);
5613   }
5614
5615 print_internals(re, stdout);
5616
5617 /* This check is done here in the debugging case so that the code that
5618 was compiled can be seen. */
5619
5620 if (code - codestart > length)
5621   {
5622   *errorptr = ERR23;
5623   (pcre_free)(re);
5624   *erroroffset = ptr - (uschar *)pattern;
5625   return NULL;
5626   }
5627 #endif
5628
5629 return (pcre *)re;
5630 }
5631
5632
5633
5634 /*************************************************
5635 *          Match a back-reference                *
5636 *************************************************/
5637
5638 /* Compares length items at eptr with the text that starts at the subject
5639 offset md->offset_vector[offset]. If a back reference hasn't been set, the
5640 length that is passed is greater than what is left, so the match fails.
5641 Arguments:
5642   offset      index into the offset vector
5643   eptr        points into the subject
5644   length      length to be matched
5645   md          points to match data block
5646   ims         the ims flags; only PCRE_CASELESS is examined here
5647
5648 Returns:      TRUE if matched, FALSE otherwise
5649 */
5650
5651 static BOOL
5652 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5653   unsigned long int ims)
5654 {
5655 const uschar *p = md->start_subject + md->offset_vector[offset];
5656 /* p is the reference text; the DEBUG block below only traces both operands */
5657 #ifdef DEBUG
5658 if (eptr >= md->end_subject)
5659   printf("matching subject <null>");
5660 else
5661   {
5662   printf("matching subject ");
5663   pchars(eptr, length, TRUE, md);
5664   }
5665 printf(" against backref ");
5666 pchars(p, length, FALSE, md);
5667 printf("\n");
5668 #endif
5669
5670 /* Always fail if not enough characters left */
5671
5672 if (length > md->end_subject - eptr) return FALSE;
5673
5674 /* Separate the caseless case for speed; it maps both sides through md->lcc */
5675
5676 if ((ims & PCRE_CASELESS) != 0)
5677   {
5678   while (length-- > 0)
5679     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
5680   }
5681 else
5682   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5683 /* Every compared item was equal (also reached when length is not positive) */
5684 return TRUE;
5685 }
5686
5687
5688 #ifdef SUPPORT_UTF8
5689 /*************************************************
5690 *       Match character against an XCLASS        *
5691 *************************************************/
5692
5693 /* This function is called from within the XCLASS code below, to match a
5694 character against an extended class which might match values > 255. The data
5695 holds a flag byte, an optional 32-byte bit map, and items ended by XCL_END.
5696 Arguments:
5697   c           the character
5698   data        points to the flag byte of the XCLASS data
5699
5700 Returns:      TRUE if character matches, else FALSE; XCL_NOT inverts the result
5701 */
5702
5703 static BOOL
5704 match_xclass(int c, const uschar *data)
5705 {
5706 int t;                                  /* type code of the current item */
5707 BOOL negated = (*data & XCL_NOT) != 0;  /* a hit returns !negated */
5708
5709 /* Character values < 256 are matched against a bitmap, if one is present. If
5710 not, we still carry on, because there may be ranges that start below 256 in the
5711 additional data. */
5712
5713 if (c < 256)
5714   {
5715   if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
5716     return !negated;   /* char found */
5717   }
5718
5719 /* First skip the bit map if present. Then match against the list of Unicode
5720 properties or large chars or ranges that end with a large char. We won't ever
5721 encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
5722
5723 if ((*data++ & XCL_MAP) != 0) data += 32;
5724
5725 while ((t = *data++) != XCL_END)
5726   {
5727   int x, y;   /* a single value (x), or the two ends of a range (x..y) */
5728   if (t == XCL_SINGLE)
5729     {
5730     GETCHARINC(x, data);
5731     if (c == x) return !negated;
5732     }
5733   else if (t == XCL_RANGE)
5734     {
5735     GETCHARINC(x, data);
5736     GETCHARINC(y, data);
5737     if (c >= x && c <= y) return !negated;
5738     }
5739   /* A property item is one byte: >= 128 tests category, else tests chartype */
5740 #ifdef SUPPORT_UCP
5741   else  /* XCL_PROP & XCL_NOTPROP */
5742     {
5743     int chartype, othercase;
5744     int rqdtype = *data++;
5745     int category = ucp_findchar(c, &chartype, &othercase);
5746     if (rqdtype >= 128)
5747       {
5748       if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated;
5749       }
5750     else
5751       {
5752       if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
5753       }
5754     }
5755 #endif  /* SUPPORT_UCP */
5756   }
5757
5758 return negated;   /* char did not match */
5759 }
5760 #endif
5761
5762
5763 /***************************************************************************
5764 ****************************************************************************
5765                    RECURSION IN THE match() FUNCTION
5766
5767 The match() function is highly recursive. Some regular expressions can cause
5768 it to recurse thousands of times. I was writing for Unix, so I just let it
5769 call itself recursively. This uses the stack for saving everything that has
5770 to be saved for a recursive call. On Unix, the stack can be large, and this
5771 works fine.
5772
5773 It turns out that on non-Unix systems there are problems with programs that
5774 use a lot of stack. (This despite the fact that every last chip has oodles
5775 of memory these days, and techniques for extending the stack have been known
5776 for decades.) So....
5777
5778 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5779 calls by keeping local variables that need to be preserved in blocks of memory
5780 obtained from malloc instead of on the stack. Macros are used to
5781 achieve this so that the actual code doesn't look very different to what it
5782 always used to.
5783 ****************************************************************************
5784 ***************************************************************************/
5785
5786
5787 /* These versions of the macros use the stack, as normal */
5788 /* (RMATCH is a plain recursive call to match(); RRETURN is a plain return.) */
5789 #ifndef NO_RECURSE
5790 #define REGISTER register
5791 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
5792 #define RRETURN(ra) return ra
5793 #else
5794 /* Heap versions: RMATCH saves a resume point with setjmp(), fills a new frame */
5795 /* from ra,rb,rc,re,rf,rg (eptr,ecode,offset_top,ims,eptrb,flags) and restarts. */
5796 /* These versions of the macros manage a private stack on the heap. Note
5797 that the rd argument of RMATCH isn't actually used. It's the md argument of
5798 match(), which never changes. */
5799 /* REGISTER expands to nothing in this variant. */
5800 #define REGISTER
5801 /* NOTE(review): the pcre_stack_malloc result is used without a NULL check. */
5802 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
5803   {\
5804   heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
5805   if (setjmp(frame->Xwhere) == 0)\
5806     {\
5807     newframe->Xeptr = ra;\
5808     newframe->Xecode = rb;\
5809     newframe->Xoffset_top = rc;\
5810     newframe->Xims = re;\
5811     newframe->Xeptrb = rf;\
5812     newframe->Xflags = rg;\
5813     newframe->Xprevframe = frame;\
5814     frame = newframe;\
5815     DPRINTF(("restarting from line %d\n", __LINE__));\
5816     goto HEAP_RECURSE;\
5817     }\
5818   else\
5819     {\
5820     DPRINTF(("longjumped back to line %d\n", __LINE__));\
5821     frame = md->thisframe;\
5822     rx = frame->Xresult;\
5823     }\
5824   }
5825 /* RRETURN frees the frame, then longjmps to the previous frame or returns ra. */
5826 #define RRETURN(ra)\
5827   {\
5828   heapframe *newframe = frame;\
5829   frame = newframe->Xprevframe;\
5830   (pcre_stack_free)(newframe);\
5831   if (frame != NULL)\
5832     {\
5833     frame->Xresult = ra;\
5834     md->thisframe = frame;\
5835     longjmp(frame->Xwhere, 1);\
5836     }\
5837   return ra;\
5838   }
5839
5840
5841 /* Structure for remembering the local variables in a private frame */
5842 /* (one frame per simulated match() call; used only when NO_RECURSE is set) */
5843 typedef struct heapframe {
5844   struct heapframe *Xprevframe;   /* calling frame; NULL marks the top level */
5845
5846   /* Function arguments that may change */
5847
5848   const uschar *Xeptr;            /* pointer in subject */
5849   const uschar *Xecode;           /* position in code */
5850   int Xoffset_top;                /* current top pointer */
5851   long int Xims;  /* NOTE(review): signed; match()'s ims is unsigned long int */
5852   eptrblock *Xeptrb;              /* chain of eptr values at bracket starts */
5853   int Xflags;                     /* match_condassert / match_isgroup bits */
5854
5855   /* Function local variables */
5856
5857   const uschar *Xcallpat;
5858   const uschar *Xcharptr;
5859   const uschar *Xdata;
5860   const uschar *Xnext;
5861   const uschar *Xpp;
5862   const uschar *Xprev;
5863   const uschar *Xsaved_eptr;
5864
5865   recursion_info Xnew_recursive;
5866
5867   BOOL Xcur_is_word;
5868   BOOL Xcondition;
5869   BOOL Xminimize;
5870   BOOL Xprev_is_word;
5871
5872   unsigned long int Xoriginal_ims;   /* ims on entry to this frame */
5873
5874 #ifdef SUPPORT_UCP
5875   int Xprop_type;
5876   int Xprop_fail_result;
5877   int Xprop_category;
5878   int Xprop_chartype;
5879   int Xprop_othercase;
5880   int Xprop_test_against;
5881   int *Xprop_test_variable;
5882 #endif
5883
5884   int Xctype;
5885   int Xfc;      /* separate from c in this variant; see the fc/fi macros */
5886   int Xfi;
5887   int Xlength;
5888   int Xmax;
5889   int Xmin;
5890   int Xnumber;
5891   int Xoffset;
5892   int Xop;
5893   int Xsave_capture_last;
5894   int Xsave_offset1, Xsave_offset2, Xsave_offset3;
5895   int Xstacksave[REC_STACK_SAVE_MAX];
5896
5897   eptrblock Xnewptrb;   /* block linked onto the eptrb chain for a group */
5898
5899   /* Place to pass back result, and where to jump back to */
5900
5901   int  Xresult;     /* written by RRETURN, read back by RMATCH */
5902   jmp_buf Xwhere;   /* set by setjmp() in RMATCH; target of RRETURN's longjmp() */
5903
5904 } heapframe;
5905
5906 #endif
5907
5908
5909 /***************************************************************************
5910 ***************************************************************************/
5911
5912
5913
5914 /*************************************************
5915 *         Match from current position            *
5916 *************************************************/
5917
5918 /* On entry ecode points to the first opcode, and eptr to the first character
5919 in the subject string, while eptrb holds the value of eptr at the start of the
5920 last bracketed group - used for breaking infinite loops matching zero-length
5921 strings. This function is called recursively in many circumstances. Whenever it
5922 returns a negative (error) response, the outer incarnation must also return the
5923 same response.
5924
5925 Performance note: It might be tempting to extract commonly used fields from the
5926 md structure (e.g. utf8, end_subject) into individual variables to improve
5927 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5928 made performance worse.
5929
5930 Arguments:
5931    eptr        pointer in subject
5932    ecode       position in code
5933    offset_top  current top pointer
5934    md          pointer to "static" info for the match
5935    ims         current /i, /m, and /s options
5936    eptrb       pointer to chain of blocks containing eptr at start of
5937                  brackets - for testing for empty matches
5938    flags       can contain
5939                  match_condassert - this is an assertion condition
5940                  match_isgroup - this is the start of a bracketed group
5941
5942 Returns:       MATCH_MATCH if matched            )  these values are >= 0
5943                MATCH_NOMATCH if failed to match  )
5944                a negative PCRE_ERROR_xxx value if aborted by an error condition
5945                  (e.g. stopped by recursion limit)
5946 */
5947
5948 static int
5949 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5950   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5951   int flags)
5952 {
5953 /* These variables do not need to be preserved over recursion in this function,
5954 so they can be ordinary variables in all cases. Mark them with "register"
5955 because they are used a lot in loops. */
5956
5957 register int rrc;    /* Returns from recursive calls */
5958 register int i;      /* Used for loops not involving calls to RMATCH() */
5959 register int c;      /* Character values not kept over RMATCH() calls */
5960
5961 /* When recursion is not being used, all "local" variables that have to be
5962 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5963 heap storage. Set up the top-level frame here; others are obtained from the
5964 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5965
5966 #ifdef NO_RECURSE
5967 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5968 frame->Xprevframe = NULL;            /* Marks the top level */
5969
5970 /* Copy in the original argument variables */
5971
5972 frame->Xeptr = eptr;
5973 frame->Xecode = ecode;
5974 frame->Xoffset_top = offset_top;
5975 frame->Xims = ims;
5976 frame->Xeptrb = eptrb;
5977 frame->Xflags = flags;
5978
5979 /* This is where control jumps back to to effect "recursion" */
5980
5981 HEAP_RECURSE:
5982
5983 /* Macros make the argument variables come from the current frame */
5984
5985 #define eptr               frame->Xeptr
5986 #define ecode              frame->Xecode
5987 #define offset_top         frame->Xoffset_top
5988 #define ims                frame->Xims
5989 #define eptrb              frame->Xeptrb
5990 #define flags              frame->Xflags
5991
5992 /* Ditto for the local variables */
5993
5994 #ifdef SUPPORT_UTF8
5995 #define charptr            frame->Xcharptr
5996 #endif
5997 #define callpat            frame->Xcallpat
5998 #define data               frame->Xdata
5999 #define next               frame->Xnext
6000 #define pp                 frame->Xpp
6001 #define prev               frame->Xprev
6002 #define saved_eptr         frame->Xsaved_eptr
6003
6004 #define new_recursive      frame->Xnew_recursive
6005
6006 #define cur_is_word        frame->Xcur_is_word
6007 #define condition          frame->Xcondition
6008 #define minimize           frame->Xminimize
6009 #define prev_is_word       frame->Xprev_is_word
6010
6011 #define original_ims       frame->Xoriginal_ims
6012
6013 #ifdef SUPPORT_UCP
6014 #define prop_type          frame->Xprop_type
6015 #define prop_fail_result   frame->Xprop_fail_result
6016 #define prop_category      frame->Xprop_category
6017 #define prop_chartype      frame->Xprop_chartype
6018 #define prop_othercase     frame->Xprop_othercase
6019 #define prop_test_against  frame->Xprop_test_against
6020 #define prop_test_variable frame->Xprop_test_variable
6021 #endif
6022
6023 #define ctype              frame->Xctype
6024 #define fc                 frame->Xfc
6025 #define fi                 frame->Xfi
6026 #define length             frame->Xlength
6027 #define max                frame->Xmax
6028 #define min                frame->Xmin
6029 #define number             frame->Xnumber
6030 #define offset             frame->Xoffset
6031 #define op                 frame->Xop
6032 #define save_capture_last  frame->Xsave_capture_last
6033 #define save_offset1       frame->Xsave_offset1
6034 #define save_offset2       frame->Xsave_offset2
6035 #define save_offset3       frame->Xsave_offset3
6036 #define stacksave          frame->Xstacksave
6037
6038 #define newptrb            frame->Xnewptrb
6039
6040 /* When recursion is being used, local variables are allocated on the stack and
6041 get preserved during recursion in the normal way. In this environment, fi and
6042 i, and fc and c, can be the same variables. */
6043
6044 #else
6045 #define fi i
6046 #define fc c
6047
6048
6049 #ifdef SUPPORT_UTF8                /* Many of these variables are used only */
6050 const uschar *charptr;             /* small blocks of the code. My normal  */
6051 #endif                             /* style of coding would have declared  */
6052 const uschar *callpat;             /* them within each of those blocks.    */
6053 const uschar *data;                /* However, in order to accommodate the */
6054 const uschar *next;                /* version of this code that uses an    */
6055 const uschar *pp;                  /* external "stack" implemented on the  */
6056 const uschar *prev;                /* heap, it is easier to declare them   */
6057 const uschar *saved_eptr;          /* all here, so the declarations can    */
6058                                    /* be cut out in a block. The only      */
6059 recursion_info new_recursive;      /* declarations within blocks below are */
6060                                    /* for variables that do not have to    */
6061 BOOL cur_is_word;                  /* be preserved over a recursive call   */
6062 BOOL condition;                    /* to RMATCH().                         */
6063 BOOL minimize;
6064 BOOL prev_is_word;
6065
6066 unsigned long int original_ims;
6067
6068 #ifdef SUPPORT_UCP
6069 int prop_type;
6070 int prop_fail_result;
6071 int prop_category;
6072 int prop_chartype;
6073 int prop_othercase;
6074 int prop_test_against;
6075 int *prop_test_variable;
6076 #endif
6077
6078 int ctype;
6079 int length;
6080 int max;
6081 int min;
6082 int number;
6083 int offset;
6084 int op;
6085 int save_capture_last;
6086 int save_offset1, save_offset2, save_offset3;
6087 int stacksave[REC_STACK_SAVE_MAX];
6088
6089 eptrblock newptrb;
6090 #endif
6091
6092 /* These statements are here to stop the compiler complaining about
6093 uninitialized variables. */
6094
6095 #ifdef SUPPORT_UCP
6096 prop_fail_result = 0;
6097 prop_test_against = 0;
6098 prop_test_variable = NULL;
6099 #endif
6100
6101 /* OK, now we can get on with the real code of the function. Recursion is
6102 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6103 these just turn into a recursive call to match() and a "return", respectively.
6104 However, RMATCH isn't like a function call because it's quite a complicated
6105 macro. It has to be used in one particular way. This shouldn't, however, impact
6106 performance when true recursion is being used. */
6107
6108 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6109
6110 original_ims = ims;    /* Save for resetting on ')' */
6111
6112 /* At the start of a bracketed group, add the current subject pointer to the
6113 stack of such pointers, to be re-instated at the end of the group when we hit
6114 the closing ket. When match() is called in other circumstances, we don't add to
6115 this stack. */
6116
6117 if ((flags & match_isgroup) != 0)
6118   {
6119   newptrb.epb_prev = eptrb;
6120   newptrb.epb_saved_eptr = eptr;
6121   eptrb = &newptrb;
6122   }
6123
6124 /* Now start processing the operations. */
6125
6126 for (;;)
6127   {
6128   op = *ecode;
6129   minimize = FALSE;
6130
6131   /* For partial matching, remember if we ever hit the end of the subject after
6132   matching at least one subject character. */
6133
6134   if (md->partial &&
6135       eptr >= md->end_subject &&
6136       eptr > md->start_match)
6137     md->hitend = TRUE;
6138
6139   /* Opening capturing bracket. If there is space in the offset vector, save
6140   the current subject position in the working slot at the top of the vector. We
6141   mustn't change the current values of the data slot, because they may be set
6142   from a previous iteration of this group, and be referred to by a reference
6143   inside the group.
6144
6145   If the bracket fails to match, we need to restore this value and also the
6146   values of the final offsets, in case they were set by a previous iteration of
6147   the same bracket.
6148
6149   If there isn't enough space in the offset vector, treat this as if it were a
6150   non-capturing bracket. Don't worry about setting the flag for the error case
6151   here; that is handled in the code for KET. */
6152
6153   if (op > OP_BRA)
6154     {
6155     number = op - OP_BRA;
6156
6157     /* For extended extraction brackets (large number), we have to fish out the
6158     number from a dummy opcode at the start. */
6159
6160     if (number > EXTRACT_BASIC_MAX)
6161       number = GET2(ecode, 2+LINK_SIZE);
6162     offset = number << 1;
6163
6164 #ifdef DEBUG
6165     printf("start bracket %d subject=", number);
6166     pchars(eptr, 16, TRUE, md);
6167     printf("\n");
6168 #endif
6169
6170     if (offset < md->offset_max)
6171       {
6172       save_offset1 = md->offset_vector[offset];
6173       save_offset2 = md->offset_vector[offset+1];
6174       save_offset3 = md->offset_vector[md->offset_end - number];
6175       save_capture_last = md->capture_last;
6176
6177       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6178       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6179
6180       do
6181         {
6182         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6183           match_isgroup);
6184         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6185         md->capture_last = save_capture_last;
6186         ecode += GET(ecode, 1);
6187         }
6188       while (*ecode == OP_ALT);
6189
6190       DPRINTF(("bracket %d failed\n", number));
6191
6192       md->offset_vector[offset] = save_offset1;
6193       md->offset_vector[offset+1] = save_offset2;
6194       md->offset_vector[md->offset_end - number] = save_offset3;
6195
6196       RRETURN(MATCH_NOMATCH);
6197       }
6198
6199     /* Insufficient room for saving captured contents */
6200
6201     else op = OP_BRA;
6202     }
6203
6204   /* Other types of node can be handled by a switch */
6205
6206   switch(op)
6207     {
6208     case OP_BRA:     /* Non-capturing bracket: optimized */
6209     DPRINTF(("start bracket 0\n"));
6210     do
6211       {
6212       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6213         match_isgroup);
6214       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6215       ecode += GET(ecode, 1);
6216       }
6217     while (*ecode == OP_ALT);
6218     DPRINTF(("bracket 0 failed\n"));
6219     RRETURN(MATCH_NOMATCH);
6220
6221     /* Conditional group: compilation checked that there are no more than
6222     two branches. If the condition is false, skipping the first branch takes us
6223     past the end if there is only one branch, but that's OK because that is
6224     exactly what going to the ket would do. */
6225
6226     case OP_COND:
6227     if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6228       {
6229       offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
6230       condition = (offset == CREF_RECURSE * 2)?
6231         (md->recursive != NULL) :
6232         (offset < offset_top && md->offset_vector[offset] >= 0);
6233       RMATCH(rrc, eptr, ecode + (condition?
6234         (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6235         offset_top, md, ims, eptrb, match_isgroup);
6236       RRETURN(rrc);
6237       }
6238
6239     /* The condition is an assertion. Call match() to evaluate it - the
6240     match_condassert flag causes it to stop at the end of an assertion. */
6241
6242     else
6243       {
6244       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6245           match_condassert | match_isgroup);
6246       if (rrc == MATCH_MATCH)
6247         {
6248         ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6249         while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6250         }
6251       else if (rrc != MATCH_NOMATCH)
6252         {
6253         RRETURN(rrc);         /* Need braces because of following else */
6254         }
6255       else ecode += GET(ecode, 1);
6256       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6257         match_isgroup);
6258       RRETURN(rrc);
6259       }
6260     /* Control never reaches here */
6261
6262     /* Skip over conditional reference or large extraction number data if
6263     encountered. */
6264
6265     case OP_CREF:
6266     case OP_BRANUMBER:
6267     ecode += 3;
6268     break;
6269
6270     /* End of the pattern. If we are in a recursion, we should restore the
6271     offsets appropriately and continue from after the call. */
6272
6273     case OP_END:
6274     if (md->recursive != NULL && md->recursive->group_num == 0)
6275       {
6276       recursion_info *rec = md->recursive;
6277       DPRINTF(("Hit the end in a (?0) recursion\n"));
6278       md->recursive = rec->prevrec;
6279       memmove(md->offset_vector, rec->offset_save,
6280         rec->saved_max * sizeof(int));
6281       md->start_match = rec->save_start;
6282       ims = original_ims;
6283       ecode = rec->after_call;
6284       break;
6285       }
6286
6287     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6288     string - backtracking will then try other alternatives, if any. */
6289
6290     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6291     md->end_match_ptr = eptr;          /* Record where we ended */
6292     md->end_offset_top = offset_top;   /* and how many extracts were taken */
6293     RRETURN(MATCH_MATCH);
6294
6295     /* Change option settings */
6296
6297     case OP_OPT:
6298     ims = ecode[1];
6299     ecode += 2;
6300     DPRINTF(("ims set to %02lx\n", ims));
6301     break;
6302
6303     /* Assertion brackets. Check the alternative branches in turn - the
6304     matching won't pass the KET for an assertion. If any one branch matches,
6305     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6306     start of each branch to move the current point backwards, so the code at
6307     this level is identical to the lookahead case. */
6308
6309     case OP_ASSERT:
6310     case OP_ASSERTBACK:
6311     do
6312       {
6313       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6314         match_isgroup);
6315       if (rrc == MATCH_MATCH) break;
6316       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6317       ecode += GET(ecode, 1);
6318       }
6319     while (*ecode == OP_ALT);
6320     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6321
6322     /* If checking an assertion for a condition, return MATCH_MATCH. */
6323
6324     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6325
6326     /* Continue from after the assertion, updating the offsets high water
6327     mark, since extracts may have been taken during the assertion. */
6328
6329     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6330     ecode += 1 + LINK_SIZE;
6331     offset_top = md->end_offset_top;
6332     continue;
6333
6334     /* Negative assertion: all branches must fail to match */
6335
6336     case OP_ASSERT_NOT:
6337     case OP_ASSERTBACK_NOT:
6338     do
6339       {
6340       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6341         match_isgroup);
6342       if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6343       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6344       ecode += GET(ecode,1);
6345       }
6346     while (*ecode == OP_ALT);
6347
6348     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6349
6350     ecode += 1 + LINK_SIZE;
6351     continue;
6352
6353     /* Move the subject pointer back. This occurs only at the start of
6354     each branch of a lookbehind assertion. If we are too close to the start to
6355     move back, this match function fails. When working with UTF-8 we move
6356     back a number of characters, not bytes. */
6357
6358     case OP_REVERSE:
6359 #ifdef SUPPORT_UTF8
6360     if (md->utf8)
6361       {
6362       c = GET(ecode,1);
6363       for (i = 0; i < c; i++)
6364         {
6365         eptr--;
6366         if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6367         BACKCHAR(eptr)
6368         }
6369       }
6370     else
6371 #endif
6372
6373     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6374
6375       {
6376       eptr -= GET(ecode,1);
6377       if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6378       }
6379
6380     /* Skip to next op code */
6381
6382     ecode += 1 + LINK_SIZE;
6383     break;
6384
6385     /* The callout item calls an external function, if one is provided, passing
6386     details of the match so far. This is mainly for debugging, though the
6387     function is able to force a failure. */
6388
6389     case OP_CALLOUT:
6390     if (pcre_callout != NULL)
6391       {
6392       pcre_callout_block cb;
6393       cb.version          = 1;   /* Version 1 of the callout block */
6394       cb.callout_number   = ecode[1];
6395       cb.offset_vector    = md->offset_vector;
6396       cb.subject          = (const char *)md->start_subject;
6397       cb.subject_length   = md->end_subject - md->start_subject;
6398       cb.start_match      = md->start_match - md->start_subject;
6399       cb.current_position = eptr - md->start_subject;
6400       cb.pattern_position = GET(ecode, 2);
6401       cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6402       cb.capture_top      = offset_top/2;
6403       cb.capture_last     = md->capture_last;
6404       cb.callout_data     = md->callout_data;
6405       if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6406       if (rrc < 0) RRETURN(rrc);
6407       }
6408     ecode += 2 + 2*LINK_SIZE;
6409     break;
6410
6411     /* Recursion either matches the current regex, or some subexpression. The
6412     offset data is the offset to the starting bracket from the start of the
6413     whole pattern. (This is so that it works from duplicated subpatterns.)
6414
6415     If there are any capturing brackets started but not finished, we have to
6416     save their starting points and reinstate them after the recursion. However,
6417     we don't know how many such there are (offset_top records the completed
6418     total) so we just have to save all the potential data. There may be up to
6419     65535 such values, which is too large to put on the stack, but using malloc
6420     for small numbers seems expensive. As a compromise, the stack is used when
6421     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6422     is used. If the malloc fails, the saved data cannot be set up, so the
6423     recursion is not attempted and PCRE_ERROR_NOMEMORY is returned from this
6424     level of match().
6425
6426     There are also other values that have to be saved. We use a chained
6427     sequence of blocks that actually live on the stack. Thanks to Robin Houston
6428     for the original version of this logic. */
6429
6430     case OP_RECURSE:
6431       {
6432       callpat = md->start_code + GET(ecode, 1);
6433       new_recursive.group_num = *callpat - OP_BRA;
6434
6435       /* For extended extraction brackets (large number), we have to fish out
6436       the number from a dummy opcode at the start. */
6437
6438       if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6439         new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6440
6441       /* Add to "recursing stack" */
6442
6443       new_recursive.prevrec = md->recursive;
6444       md->recursive = &new_recursive;
6445
6446       /* Find where to continue from afterwards */
6447
6448       ecode += 1 + LINK_SIZE;
6449       new_recursive.after_call = ecode;
6450
6451       /* Now save the offset data. */
6452
6453       new_recursive.saved_max = md->offset_end;
6454       if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6455         new_recursive.offset_save = stacksave;
6456       else
6457         {
6458         new_recursive.offset_save =
6459           (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6460         if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6461         }
6462
6463       memcpy(new_recursive.offset_save, md->offset_vector,
6464             new_recursive.saved_max * sizeof(int));
6465       new_recursive.save_start = md->start_match;
6466       md->start_match = eptr;
6467
6468       /* OK, now we can do the recursion. For each top-level alternative we
6469       restore the offset and recursion data. */
6470
6471       DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6472       do
6473         {
6474         RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6475             eptrb, match_isgroup);
6476         if (rrc == MATCH_MATCH)
6477           {
6478           md->recursive = new_recursive.prevrec;
6479           if (new_recursive.offset_save != stacksave)
6480             (pcre_free)(new_recursive.offset_save);
6481           RRETURN(MATCH_MATCH);
6482           }
6483         else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6484
6485         md->recursive = &new_recursive;
6486         memcpy(md->offset_vector, new_recursive.offset_save,
6487             new_recursive.saved_max * sizeof(int));
6488         callpat += GET(callpat, 1);
6489         }
6490       while (*callpat == OP_ALT);
6491
6492       DPRINTF(("Recursion didn't match\n"));
6493       md->recursive = new_recursive.prevrec;
6494       if (new_recursive.offset_save != stacksave)
6495         (pcre_free)(new_recursive.offset_save);
6496       RRETURN(MATCH_NOMATCH);
6497       }
6498     /* Control never reaches here */
6499
6500     /* "Once" brackets are like assertion brackets except that after a match,
6501     the point in the subject string is not moved back. Thus there can never be
6502     a move back into the brackets. Friedl calls these "atomic" subpatterns.
6503     Check the alternative branches in turn - the matching won't pass the KET
6504     for this kind of subpattern. If any one branch matches, we carry on as at
6505     the end of a normal bracket, leaving the subject pointer. */
6506
6507     case OP_ONCE:
6508       {
6509       prev = ecode;
6510       saved_eptr = eptr;
6511
6512       do
6513         {
6514         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6515           eptrb, match_isgroup);
6516         if (rrc == MATCH_MATCH) break;
6517         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6518         ecode += GET(ecode,1);
6519         }
6520       while (*ecode == OP_ALT);
6521
6522       /* If hit the end of the group (which could be repeated), fail */
6523
6524       if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6525
6526       /* Continue from after the once group, updating the offsets high water
6527       mark, since extracts may have been taken. */
6528
6529       do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6530
6531       offset_top = md->end_offset_top;
6532       eptr = md->end_match_ptr;
6533
6534       /* For a non-repeating ket, just continue at this level. This also
6535       happens for a repeating ket if no characters were matched in the group.
6536       This is the forcible breaking of infinite loops as implemented in Perl
6537       5.005. If there is an options reset, it will get obeyed in the normal
6538       course of events. */
6539
6540       if (*ecode == OP_KET || eptr == saved_eptr)
6541         {
6542         ecode += 1+LINK_SIZE;
6543         break;
6544         }
6545
6546       /* The repeating kets try the rest of the pattern or restart from the
6547       preceding bracket, in the appropriate order. We need to reset any options
6548       that changed within the bracket before re-running it, so check the next
6549       opcode. */
6550
6551       if (ecode[1+LINK_SIZE] == OP_OPT)
6552         {
6553         ims = (ims & ~PCRE_IMS) | ecode[4];
6554         DPRINTF(("ims set to %02lx at group repeat\n", ims));
6555         }
6556
6557       if (*ecode == OP_KETRMIN)
6558         {
6559         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6560         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6561         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6562         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6563         }
6564       else  /* OP_KETRMAX */
6565         {
6566         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6567         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6568         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6569         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6570         }
6571       }
6572     RRETURN(MATCH_NOMATCH);
6573
6574     /* An alternation is the end of a branch; scan along to find the end of the
6575     bracketed group and go to there. */
6576
6577     case OP_ALT:
6578     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6579     break;
6580
6581     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6582     that it may occur zero times. It may repeat infinitely, or not at all -
6583     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6584     repeat limits are compiled as a number of copies, with the optional ones
6585     preceded by BRAZERO or BRAMINZERO. */
6586
6587     case OP_BRAZERO:
6588       {
6589       next = ecode+1;
6590       RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6591       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6592       do next += GET(next,1); while (*next == OP_ALT);
6593       ecode = next + 1+LINK_SIZE;
6594       }
6595     break;
6596
6597     case OP_BRAMINZERO:
6598       {
6599       next = ecode+1;
6600       do next += GET(next,1); while (*next == OP_ALT);
6601       RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6602         match_isgroup);
6603       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6604       ecode++;
6605       }
6606     break;
6607
6608     /* End of a group, repeated or non-repeating. If we are at the end of
6609     an assertion "group", stop matching and return MATCH_MATCH, but record the
6610     current high water mark for use by positive assertions. Do this also
6611     for the "once" (not-backing-up) groups. */
6612
6613     case OP_KET:
6614     case OP_KETRMIN:
6615     case OP_KETRMAX:
6616       {
6617       prev = ecode - GET(ecode, 1);
6618       saved_eptr = eptrb->epb_saved_eptr;
6619
6620       /* Back up the stack of bracket start pointers. */
6621
6622       eptrb = eptrb->epb_prev;
6623
6624       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6625           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6626           *prev == OP_ONCE)
6627         {
6628         md->end_match_ptr = eptr;      /* For ONCE */
6629         md->end_offset_top = offset_top;
6630         RRETURN(MATCH_MATCH);
6631         }
6632
6633       /* In all other cases except a conditional group we have to check the
6634       group number back at the start and if necessary complete handling an
6635       extraction by setting the offsets and bumping the high water mark. */
6636
6637       if (*prev != OP_COND)
6638         {
6639         number = *prev - OP_BRA;
6640
6641         /* For extended extraction brackets (large number), we have to fish out
6642         the number from a dummy opcode at the start. */
6643
6644         if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6645         offset = number << 1;
6646
6647 #ifdef DEBUG
6648         printf("end bracket %d", number);
6649         printf("\n");
6650 #endif
6651
6652         /* Test for a numbered group. This includes groups called as a result
6653         of recursion. Note that whole-pattern recursion is coded as a recurse
6654         into group 0, so it won't be picked up here. Instead, we catch it when
6655         the OP_END is reached. */
6656
6657         if (number > 0)
6658           {
6659           md->capture_last = number;
6660           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6661             {
6662             md->offset_vector[offset] =
6663               md->offset_vector[md->offset_end - number];
6664             md->offset_vector[offset+1] = eptr - md->start_subject;
6665             if (offset_top <= offset) offset_top = offset + 2;
6666             }
6667
6668           /* Handle a recursively called group. Restore the offsets
6669           appropriately and continue from after the call. */
6670
6671           if (md->recursive != NULL && md->recursive->group_num == number)
6672             {
6673             recursion_info *rec = md->recursive;
6674             DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6675             md->recursive = rec->prevrec;
6676             md->start_match = rec->save_start;
6677             memcpy(md->offset_vector, rec->offset_save,
6678               rec->saved_max * sizeof(int));
6679             ecode = rec->after_call;
6680             ims = original_ims;
6681             break;
6682             }
6683           }
6684         }
6685
6686       /* Reset the value of the ims flags, in case they got changed during
6687       the group. */
6688
6689       ims = original_ims;
6690       DPRINTF(("ims reset to %02lx\n", ims));
6691
6692       /* For a non-repeating ket, just continue at this level. This also
6693       happens for a repeating ket if no characters were matched in the group.
6694       This is the forcible breaking of infinite loops as implemented in Perl
6695       5.005. If there is an options reset, it will get obeyed in the normal
6696       course of events. */
6697
6698       if (*ecode == OP_KET || eptr == saved_eptr)
6699         {
6700         ecode += 1 + LINK_SIZE;
6701         break;
6702         }
6703
6704       /* The repeating kets try the rest of the pattern or restart from the
6705       preceding bracket, in the appropriate order. */
6706
6707       if (*ecode == OP_KETRMIN)
6708         {
6709         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6710         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6711         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6712         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6713         }
6714       else  /* OP_KETRMAX */
6715         {
6716         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6717         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6718         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6719         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6720         }
6721       }
6722
6723     RRETURN(MATCH_NOMATCH);
6724
6725     /* Start of subject unless notbol, or after internal newline if multiline */
6726
6727     case OP_CIRC:
6728     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6729     if ((ims & PCRE_MULTILINE) != 0)
6730       {
6731       if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6732         RRETURN(MATCH_NOMATCH);
6733       ecode++;
6734       break;
6735       }
6736     /* ... else fall through */
6737
6738     /* Start of subject assertion */
6739
6740     case OP_SOD:
6741     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6742     ecode++;
6743     break;
6744
6745     /* Start of match assertion */
6746
6747     case OP_SOM:
6748     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6749     ecode++;
6750     break;
6751
6752     /* Assert before internal newline if multiline, or before a terminating
6753     newline unless endonly is set, else end of subject unless noteol is set. */
6754
6755     case OP_DOLL:
6756     if ((ims & PCRE_MULTILINE) != 0)
6757       {
6758       if (eptr < md->end_subject)
6759         { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6760       else
6761         { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6762       ecode++;
6763       break;
6764       }
6765     else
6766       {
6767       if (md->noteol) RRETURN(MATCH_NOMATCH);
6768       if (!md->endonly)
6769         {
6770         if (eptr < md->end_subject - 1 ||
6771            (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6772           RRETURN(MATCH_NOMATCH);
6773         ecode++;
6774         break;
6775         }
6776       }
6777     /* ... else fall through */
6778
6779     /* End of subject assertion (\z) */
6780
6781     case OP_EOD:
6782     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6783     ecode++;
6784     break;
6785
6786     /* End of subject or ending \n assertion (\Z) */
6787
6788     case OP_EODN:
6789     if (eptr < md->end_subject - 1 ||
6790        (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6791     ecode++;
6792     break;
6793
6794     /* Word boundary assertions */
6795
6796     case OP_NOT_WORD_BOUNDARY:
6797     case OP_WORD_BOUNDARY:
6798       {
6799
6800       /* Find out if the previous and current characters are "word" characters.
6801       It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6802       be "non-word" characters. */
6803
6804 #ifdef SUPPORT_UTF8
6805       if (md->utf8)
6806         {
6807         if (eptr == md->start_subject) prev_is_word = FALSE; else
6808           {
6809           const uschar *lastptr = eptr - 1;
6810           while((*lastptr & 0xc0) == 0x80) lastptr--;
6811           GETCHAR(c, lastptr);
6812           prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6813           }
6814         if (eptr >= md->end_subject) cur_is_word = FALSE; else
6815           {
6816           GETCHAR(c, eptr);
6817           cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6818           }
6819         }
6820       else
6821 #endif
6822
6823       /* More streamlined when not in UTF-8 mode */
6824
6825         {
6826         prev_is_word = (eptr != md->start_subject) &&
6827           ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6828         cur_is_word = (eptr < md->end_subject) &&
6829           ((md->ctypes[*eptr] & ctype_word) != 0);
6830         }
6831
6832       /* Now see if the situation is what we want */
6833
6834       if ((*ecode++ == OP_WORD_BOUNDARY)?
6835            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6836         RRETURN(MATCH_NOMATCH);
6837       }
6838     break;
6839
6840     /* Match a single character type; inline for speed */
6841
6842     case OP_ANY:
6843     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6844       RRETURN(MATCH_NOMATCH);
6845     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6846 #ifdef SUPPORT_UTF8
6847     if (md->utf8)
6848       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6849 #endif
6850     ecode++;
6851     break;
6852
6853     /* Match a single byte, even in UTF-8 mode. This opcode really does match
6854     any byte, even newline, independent of the setting of PCRE_DOTALL. */
6855
6856     case OP_ANYBYTE:
6857     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6858     ecode++;
6859     break;
6860
6861     case OP_NOT_DIGIT:
6862     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6863     GETCHARINCTEST(c, eptr);
6864     if (
6865 #ifdef SUPPORT_UTF8
6866        c < 256 &&
6867 #endif
6868        (md->ctypes[c] & ctype_digit) != 0
6869        )
6870       RRETURN(MATCH_NOMATCH);
6871     ecode++;
6872     break;
6873
6874     case OP_DIGIT:
6875     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6876     GETCHARINCTEST(c, eptr);
6877     if (
6878 #ifdef SUPPORT_UTF8
6879        c >= 256 ||
6880 #endif
6881        (md->ctypes[c] & ctype_digit) == 0
6882        )
6883       RRETURN(MATCH_NOMATCH);
6884     ecode++;
6885     break;
6886
6887     case OP_NOT_WHITESPACE:
6888     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6889     GETCHARINCTEST(c, eptr);
6890     if (
6891 #ifdef SUPPORT_UTF8
6892        c < 256 &&
6893 #endif
6894        (md->ctypes[c] & ctype_space) != 0
6895        )
6896       RRETURN(MATCH_NOMATCH);
6897     ecode++;
6898     break;
6899
6900     case OP_WHITESPACE:
6901     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6902     GETCHARINCTEST(c, eptr);
6903     if (
6904 #ifdef SUPPORT_UTF8
6905        c >= 256 ||
6906 #endif
6907        (md->ctypes[c] & ctype_space) == 0
6908        )
6909       RRETURN(MATCH_NOMATCH);
6910     ecode++;
6911     break;
6912
6913     case OP_NOT_WORDCHAR:
6914     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6915     GETCHARINCTEST(c, eptr);
6916     if (
6917 #ifdef SUPPORT_UTF8
6918        c < 256 &&
6919 #endif
6920        (md->ctypes[c] & ctype_word) != 0
6921        )
6922       RRETURN(MATCH_NOMATCH);
6923     ecode++;
6924     break;
6925
6926     case OP_WORDCHAR:
6927     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6928     GETCHARINCTEST(c, eptr);
6929     if (
6930 #ifdef SUPPORT_UTF8
6931        c >= 256 ||
6932 #endif
6933        (md->ctypes[c] & ctype_word) == 0
6934        )
6935       RRETURN(MATCH_NOMATCH);
6936     ecode++;
6937     break;
6938
6939 #ifdef SUPPORT_UCP
6940     /* Check the next character by Unicode property. We will get here only
6941     if the support is in the binary; otherwise a compile-time error occurs. */
6942
6943     case OP_PROP:
6944     case OP_NOTPROP:
6945     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6946     GETCHARINCTEST(c, eptr);
6947       {
6948       int chartype, rqdtype;
6949       int othercase;
6950       int category = ucp_findchar(c, &chartype, &othercase);
6951
6952       rqdtype = *(++ecode);
6953       ecode++;
6954
6955       if (rqdtype >= 128)
6956         {
6957         if ((rqdtype - 128 != category) == (op == OP_PROP))
6958           RRETURN(MATCH_NOMATCH);
6959         }
6960       else
6961         {
6962         if ((rqdtype != chartype) == (op == OP_PROP))
6963           RRETURN(MATCH_NOMATCH);
6964         }
6965       }
6966     break;
6967
6968     /* Match an extended Unicode sequence. We will get here only if the support
6969     is in the binary; otherwise a compile-time error occurs. */
6970
6971     case OP_EXTUNI:
6972     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6973     GETCHARINCTEST(c, eptr);
6974       {
6975       int chartype;
6976       int othercase;
6977       int category = ucp_findchar(c, &chartype, &othercase);
6978       if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6979       while (eptr < md->end_subject)
6980         {
6981         int len = 1;
6982         if (!md->utf8) c = *eptr; else
6983           {
6984           GETCHARLEN(c, eptr, len);
6985           }
6986         category = ucp_findchar(c, &chartype, &othercase);
6987         if (category != ucp_M) break;
6988         eptr += len;
6989         }
6990       }
6991     ecode++;
6992     break;
6993 #endif
6994
6995
6996     /* Match a back reference, possibly repeatedly. Look past the end of the
6997     item to see if there is repeat information following. The code is similar
6998     to that for character classes, but repeated for efficiency. Then obey
6999     similar code to character type repeats - written out again for speed.
7000     However, if the referenced string is the empty string, always treat
7001     it as matched, any number of times (otherwise there could be infinite
7002     loops). */
7003
7004     case OP_REF:
7005       {
7006       offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
7007       ecode += 3;                                 /* Advance past item */
7008
7009       /* If the reference is unset, set the length to be longer than the amount
7010       of subject left; this ensures that every attempt at a match fails. We
7011       can't just fail here, because of the possibility of quantifiers with zero
7012       minima. */
7013
7014       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7015         md->end_subject - eptr + 1 :
7016         md->offset_vector[offset+1] - md->offset_vector[offset];
7017
7018       /* Set up for repetition, or handle the non-repeated case */
7019
7020       switch (*ecode)
7021         {
7022         case OP_CRSTAR:
7023         case OP_CRMINSTAR:
7024         case OP_CRPLUS:
7025         case OP_CRMINPLUS:
7026         case OP_CRQUERY:
7027         case OP_CRMINQUERY:
7028         c = *ecode++ - OP_CRSTAR;
7029         minimize = (c & 1) != 0;
7030         min = rep_min[c];                 /* Pick up values from tables; */
7031         max = rep_max[c];                 /* zero for max => infinity */
7032         if (max == 0) max = INT_MAX;
7033         break;
7034
7035         case OP_CRRANGE:
7036         case OP_CRMINRANGE:
7037         minimize = (*ecode == OP_CRMINRANGE);
7038         min = GET2(ecode, 1);
7039         max = GET2(ecode, 3);
7040         if (max == 0) max = INT_MAX;
7041         ecode += 5;
7042         break;
7043
7044         default:               /* No repeat follows */
7045         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7046         eptr += length;
7047         continue;              /* With the main loop */
7048         }
7049
7050       /* If the length of the reference is zero, just continue with the
7051       main loop. */
7052
7053       if (length == 0) continue;
7054
7055       /* First, ensure the minimum number of matches are present. We get back
7056       the length of the reference string explicitly rather than passing the
7057       address of eptr, so that eptr can be a register variable. */
7058
7059       for (i = 1; i <= min; i++)
7060         {
7061         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7062         eptr += length;
7063         }
7064
7065       /* If min = max, continue at the same level without recursion.
7066       They are not both allowed to be zero. */
7067
7068       if (min == max) continue;
7069
7070       /* If minimizing, keep trying and advancing the pointer */
7071
7072       if (minimize)
7073         {
7074         for (fi = min;; fi++)
7075           {
7076           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7077           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7078           if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7079             RRETURN(MATCH_NOMATCH);
7080           eptr += length;
7081           }
7082         /* Control never gets here */
7083         }
7084
7085       /* If maximizing, find the longest string and work backwards */
7086
7087       else
7088         {
7089         pp = eptr;
7090         for (i = min; i < max; i++)
7091           {
7092           if (!match_ref(offset, eptr, length, md, ims)) break;
7093           eptr += length;
7094           }
7095         while (eptr >= pp)
7096           {
7097           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7098           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7099           eptr -= length;
7100           }
7101         RRETURN(MATCH_NOMATCH);
7102         }
7103       }
7104     /* Control never gets here */
7105
7106
7107
7108     /* Match a bit-mapped character class, possibly repeatedly. This op code is
7109     used when all the characters in the class have values in the range 0-255,
7110     and either the matching is caseful, or the characters are in the range
7111     0-127 when UTF-8 processing is enabled. The only difference between
7112     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7113     encountered.
7114
7115     First, look past the end of the item to see if there is repeat information
7116     following. Then obey similar code to character type repeats - written out
7117     again for speed. */
7118
7119     case OP_NCLASS:   /* As OP_CLASS, except that characters > 255 match */
7120     case OP_CLASS:
7121       {
7122       data = ecode + 1;                /* Save for matching: map follows opcode */
7123       ecode += 33;                     /* Advance past the item (1 + 32 bytes) */
7124
7125       switch (*ecode)                  /* Look for a following repeat opcode */
7126         {
7127         case OP_CRSTAR:
7128         case OP_CRMINSTAR:
7129         case OP_CRPLUS:
7130         case OP_CRMINPLUS:
7131         case OP_CRQUERY:
7132         case OP_CRMINQUERY:
7133         c = *ecode++ - OP_CRSTAR;         /* Index into the repeat tables */
7134         minimize = (c & 1) != 0;          /* Odd index => minimizing repeat */
7135         min = rep_min[c];                 /* Pick up values from tables; */
7136         max = rep_max[c];                 /* zero for max => infinity */
7137         if (max == 0) max = INT_MAX;
7138         break;
7139
7140         case OP_CRRANGE:
7141         case OP_CRMINRANGE:
7142         minimize = (*ecode == OP_CRMINRANGE);
7143         min = GET2(ecode, 1);             /* Explicit minimum ... */
7144         max = GET2(ecode, 3);             /* ... and maximum; zero => infinity */
7145         if (max == 0) max = INT_MAX;
7146         ecode += 5;                       /* Opcode plus the two operands */
7147         break;
7148
7149         default:               /* No repeat follows */
7150         min = max = 1;
7151         break;
7152         }
7153
7154       /* First, ensure the minimum number of matches are present. */
7155
7156 #ifdef SUPPORT_UTF8
7157       /* UTF-8 mode */
7158       if (md->utf8)
7159         {
7160         for (i = 1; i <= min; i++)
7161           {
7162           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7163           GETCHARINC(c, eptr);
7164           if (c > 255)                    /* Beyond the range of the bit map */
7165             {
7166             if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7167             }
7168           else
7169             {
7170             if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7171             }
7172           }
7173         }
7174       else
7175 #endif
7176       /* Not UTF-8 mode */
7177         {
7178         for (i = 1; i <= min; i++)
7179           {
7180           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7181           c = *eptr++;
7182           if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7183           }
7184         }
7185
7186       /* If max == min we can continue with the main loop without the
7187       need to recurse. */
7188
7189       if (min == max) continue;
7190
7191       /* If minimizing, keep testing the rest of the expression and advancing
7192       the pointer while it matches the class. */
7193
7194       if (minimize)
7195         {
7196 #ifdef SUPPORT_UTF8
7197         /* UTF-8 mode */
7198         if (md->utf8)
7199           {
7200           for (fi = min;; fi++)
7201             {
7202             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7203             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7204             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7205             GETCHARINC(c, eptr);          /* Take one more character */
7206             if (c > 255)
7207               {
7208               if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7209               }
7210             else
7211               {
7212               if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7213               }
7214             }
7215           }
7216         else
7217 #endif
7218         /* Not UTF-8 mode */
7219           {
7220           for (fi = min;; fi++)
7221             {
7222             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7223             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7224             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7225             c = *eptr++;                  /* Take one more byte */
7226             if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7227             }
7228           }
7229         /* Control never gets here */
7230         }
7231
7232       /* If maximizing, find the longest possible run, then work backwards. */
7233
7234       else
7235         {
7236         pp = eptr;                        /* Backtracking stops here */
7237
7238 #ifdef SUPPORT_UTF8
7239         /* UTF-8 mode */
7240         if (md->utf8)
7241           {
7242           for (i = min; i < max; i++)
7243             {
7244             int len = 1;
7245             if (eptr >= md->end_subject) break;
7246             GETCHARLEN(c, eptr, len);     /* Peek; eptr is advanced below */
7247             if (c > 255)
7248               {
7249               if (op == OP_CLASS) break;
7250               }
7251             else
7252               {
7253               if ((data[c/8] & (1 << (c&7))) == 0) break;
7254               }
7255             eptr += len;
7256             }
7257           for (;;)
7258             {
7259             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7260             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7261             if (eptr-- == pp) break;        /* Stop if tried at original pos */
7262             BACKCHAR(eptr);   /* presumably backs up to the character start */
7263             }
7264           }
7265         else
7266 #endif
7267           /* Not UTF-8 mode */
7268           {
7269           for (i = min; i < max; i++)
7270             {
7271             if (eptr >= md->end_subject) break;
7272             c = *eptr;
7273             if ((data[c/8] & (1 << (c&7))) == 0) break;
7274             eptr++;
7275             }
7276           while (eptr >= pp)
7277             {
7278             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7279             eptr--;                       /* Give back one byte for next try */
7280             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7281             }
7282           }
7283
7284         RRETURN(MATCH_NOMATCH);
7285         }
7286       }
7287     /* Control never gets here */
7288
7289
7290     /* Match an extended character class. This opcode is encountered only
7291     in UTF-8 mode, because that's the only time it is compiled. */
7292
7293 #ifdef SUPPORT_UTF8
7294     case OP_XCLASS:
7295       {
7296       data = ecode + 1 + LINK_SIZE;                /* Save for matching */
7297       ecode += GET(ecode, 1);                      /* Advance past the item */
7298
7299       switch (*ecode)                   /* Look for a following repeat opcode */
7300         {
7301         case OP_CRSTAR:
7302         case OP_CRMINSTAR:
7303         case OP_CRPLUS:
7304         case OP_CRMINPLUS:
7305         case OP_CRQUERY:
7306         case OP_CRMINQUERY:
7307         c = *ecode++ - OP_CRSTAR;         /* Index into the repeat tables */
7308         minimize = (c & 1) != 0;          /* Odd index => minimizing repeat */
7309         min = rep_min[c];                 /* Pick up values from tables; */
7310         max = rep_max[c];                 /* zero for max => infinity */
7311         if (max == 0) max = INT_MAX;
7312         break;
7313
7314         case OP_CRRANGE:
7315         case OP_CRMINRANGE:
7316         minimize = (*ecode == OP_CRMINRANGE);
7317         min = GET2(ecode, 1);
7318         max = GET2(ecode, 3);
7319         if (max == 0) max = INT_MAX;      /* zero for max => infinity */
7320         ecode += 5;
7321         break;
7322
7323         default:               /* No repeat follows */
7324         min = max = 1;
7325         break;
7326         }
7327
7328       /* First, ensure the minimum number of matches are present. */
7329
7330       for (i = 1; i <= min; i++)
7331         {
7332         if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7333         GETCHARINC(c, eptr);
7334         if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7335         }
7336
7337       /* If max == min we can continue with the main loop without the
7338       need to recurse. */
7339
7340       if (min == max) continue;
7341
7342       /* If minimizing, keep testing the rest of the expression and advancing
7343       the pointer while it matches the class. */
7344
7345       if (minimize)
7346         {
7347         for (fi = min;; fi++)
7348           {
7349           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7350           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7351           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7352           GETCHARINC(c, eptr);
7353           if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7354           }
7355         /* Control never gets here */
7356         }
7357
7358       /* If maximizing, find the longest possible run, then work backwards. */
7359
7360       else
7361         {
7362         pp = eptr;                        /* Backtracking stops here */
7363         for (i = min; i < max; i++)
7364           {
7365           int len = 1;
7366           if (eptr >= md->end_subject) break;
7367           GETCHARLEN(c, eptr, len);       /* Peek; eptr is advanced below */
7368           if (!match_xclass(c, data)) break;
7369           eptr += len;
7370           }
7371         for(;;)
7372           {
7373           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7374           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7375           if (eptr-- == pp) break;        /* Stop if tried at original pos */
7376           BACKCHAR(eptr);                 /* Terminated like the other uses */
7377           }
7378         RRETURN(MATCH_NOMATCH);
7379         }
7380
7381       /* Control never gets here */
7382       }
7383 #endif    /* End of XCLASS */
7384
7385     /* Match a single character, casefully */
7386
7387     case OP_CHAR:
7388 #ifdef SUPPORT_UTF8
7389     if (md->utf8)
7390       {
7391       length = 1;
7392       ecode++;                          /* Step over the opcode byte */
7393       GETCHARLEN(fc, ecode, length);    /* presumably sets length in bytes */
7394       if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7395       while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7396       }
7397     else
7398 #endif
7399
7400     /* Non-UTF-8 mode */
7401       {
7402       if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7403       if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7404       ecode += 2;                       /* Opcode plus one character byte */
7405       }
7406     break;
7407
7408     /* Match a single character, caselessly */
7409
7410     case OP_CHARNC:
7411 #ifdef SUPPORT_UTF8
7412     if (md->utf8)
7413       {
7414       length = 1;
7415       ecode++;                          /* Step over the opcode byte */
7416       GETCHARLEN(fc, ecode, length);    /* presumably sets length in bytes */
7417
7418       if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7419
7420       /* If the pattern character's value is < 128, we have only one byte, and
7421       can use the fast lookup table. */
7422
7423       if (fc < 128)
7424         {
7425         if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7426         }
7427
7428       /* Otherwise we must pick up the subject character */
7429
7430       else
7431         {
7432         int dc;
7433         GETCHARINC(dc, eptr);
7434         ecode += length;                /* Past the pattern character */
7435
7436         /* If we have Unicode property support, we can use it to test the other
7437         case of the character, if there is one. The result of ucp_findchar() is
7438         < 0 if the char isn't found, and othercase is returned as zero if there
7439         isn't one. */
7440
7441         if (fc != dc)
7442           {
7443 #ifdef SUPPORT_UCP
7444           int chartype;
7445           int othercase;
7446           if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7447 #endif
7448             RRETURN(MATCH_NOMATCH);     /* Unconditional without SUPPORT_UCP */
7449           }
7450         }
7451       }
7452     else
7453 #endif   /* SUPPORT_UTF8 */
7454
7455     /* Non-UTF-8 mode */
7456       {
7457       if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7458       if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7459       ecode += 2;                       /* Opcode plus one character byte */
7460       }
7461     break;
7462
7463     /* Match a single character repeatedly; different opcodes share code. */
7464
7465     case OP_EXACT:
7466     min = max = GET2(ecode, 1);
7467     ecode += 3;
7468     goto REPEATCHAR;
7469
7470     case OP_UPTO:
7471     case OP_MINUPTO:
7472     min = 0;
7473     max = GET2(ecode, 1);
7474     minimize = *ecode == OP_MINUPTO;
7475     ecode += 3;
7476     goto REPEATCHAR;
7477
7478     case OP_STAR:
7479     case OP_MINSTAR:
7480     case OP_PLUS:
7481     case OP_MINPLUS:
7482     case OP_QUERY:
7483     case OP_MINQUERY:
7484     c = *ecode++ - OP_STAR;
7485     minimize = (c & 1) != 0;
7486     min = rep_min[c];                 /* Pick up values from tables; */
7487     max = rep_max[c];                 /* zero for max => infinity */
7488     if (max == 0) max = INT_MAX;
7489
7490     /* Common code for all repeated single-character matches. We can give
7491     up quickly if there are fewer than the minimum number of bytes left in
7492     the subject, because every character occupies at least one byte. */
7493
7494     REPEATCHAR:
7495 #ifdef SUPPORT_UTF8
7496     if (md->utf8)
7497       {
7498       length = 1;
7499       charptr = ecode;
7500       GETCHARLEN(fc, ecode, length);
7501       if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7502       ecode += length;
7503
7504       /* Handle multibyte character matching specially here. There is
7505       support for caseless matching if UCP support is present. The other
7506       case of a character may be encoded in a different number of bytes
7507       (oclength != length), so the space left in the subject is checked
7508       separately for each candidate before it is compared, and backtracking
7509       steps back one whole character rather than a fixed number of bytes. */
7510
7511       if (length > 1)
7512         {
7513         int oclength = 0;
7514         uschar occhars[8];
7515
7516 #ifdef SUPPORT_UCP
7517         int othercase;
7518         int chartype;
7519         if ((ims & PCRE_CASELESS) != 0 &&
7520              ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7521              othercase > 0)
7522           oclength = ord2utf8(othercase, occhars);
7523 #endif  /* SUPPORT_UCP */
7524
7525         /* First, ensure the minimum number of matches are present. */
7526
7527         for (i = 1; i <= min; i++)
7528           {
7529           if (md->end_subject - eptr >= length &&
7530               memcmp(eptr, charptr, length) == 0) eptr += length;
7531           else if (oclength > 0 && md->end_subject - eptr >= oclength &&
7532                    memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
7533           else RRETURN(MATCH_NOMATCH);
7534           }
7535
7536         if (min == max) continue;
7537
7538         /* Minimizing: advance one character per failure; else backtrack. */
7539
7540         if (minimize)
7541           {
7542           for (fi = min;; fi++)
7543             {
7544             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7545             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7546             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7547             if (md->end_subject - eptr >= length &&
7548                 memcmp(eptr, charptr, length) == 0) eptr += length;
7549             else if (oclength > 0 && md->end_subject - eptr >= oclength &&
7550                      memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
7551             else RRETURN(MATCH_NOMATCH);
7552             }
7553           /* Control never gets here */
7554           }
7555         else
7556           {
7557           pp = eptr;
7558           for (i = min; i < max; i++)
7559             {
7560             if (md->end_subject - eptr >= length &&
7561                 memcmp(eptr, charptr, length) == 0) eptr += length;
7562             else if (oclength > 0 && md->end_subject - eptr >= oclength &&
7563                      memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
7564             else break;
7565             }
7566           for(;;)
7567            {
7568            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7569            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7570            if (eptr-- == pp) break;        /* Stop if tried at original pos */
7571            BACKCHAR(eptr);
7572            }
7573           RRETURN(MATCH_NOMATCH);
7574           }
7575         /* Control never gets here */
7576         }
7577
7578       /* If the length of a UTF-8 character is 1, we fall through here, and
7579       obey the code as for non-UTF-8 characters below, though in this case the
7580       value of fc will always be < 128. */
7581       }
7582     else
7583 #endif  /* SUPPORT_UTF8 */
7584
7585     /* When not in UTF-8 mode, load a single-byte character. */
7586       {
7587       if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7588       fc = *ecode++;
7589       }
7590
7591     /* The value of fc at this point is always less than 256, though we may or
7592     may not be in UTF-8 mode. The code is duplicated for the caseless and
7593     caseful cases, for speed, since matching characters is likely to be quite
7594     common. First, ensure the minimum number of matches are present. If min =
7595     max, continue at the same level without recursing. Otherwise, if
7596     minimizing, keep trying the rest of the expression and advancing one
7597     matching character if failing, up to the maximum. Alternatively, if
7598     maximizing, find the maximum number of characters and work backwards. */
7599
7600     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7601       max, eptr));
7602
7603     if ((ims & PCRE_CASELESS) != 0)
7604       {
7605       fc = md->lcc[fc];
7606       for (i = 1; i <= min; i++)
7607         if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7608       if (min == max) continue;
7609       if (minimize)
7610         {
7611         for (fi = min;; fi++)
7612           {
7613           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7614           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7615           if (fi >= max || eptr >= md->end_subject ||
7616               fc != md->lcc[*eptr++])
7617             RRETURN(MATCH_NOMATCH);
7618           }
7619         /* Control never gets here */
7620         }
7621       else
7622         {
7623         pp = eptr;
7624         for (i = min; i < max; i++)
7625           {
7626           if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7627           eptr++;
7628           }
7629         while (eptr >= pp)
7630           {
7631           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7632           eptr--;
7633           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7634           }
7635         RRETURN(MATCH_NOMATCH);
7636         }
7637       /* Control never gets here */
7638       }
7639
7640     /* Caseful comparisons (multibyte characters were dealt with above) */
7641
7642     else
7643       {
7644       for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7645       if (min == max) continue;
7646       if (minimize)
7647         {
7648         for (fi = min;; fi++)
7649           {
7650           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7651           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7652           if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7653             RRETURN(MATCH_NOMATCH);
7654           }
7655         /* Control never gets here */
7656         }
7657       else
7658         {
7659         pp = eptr;
7660         for (i = min; i < max; i++)
7661           {
7662           if (eptr >= md->end_subject || fc != *eptr) break;
7663           eptr++;
7664           }
7665         while (eptr >= pp)
7666           {
7667           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7668           eptr--;
7669           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7670           }
7671         RRETURN(MATCH_NOMATCH);
7672         }
7673       }
7674     /* Control never gets here */
7675
7676     /* Match a negated single one-byte character. The character we are
7677     checking can be multibyte. */
7678
7679     case OP_NOT:
7680     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7681     ecode++;                            /* Step over the opcode byte */
7682     GETCHARINCTEST(c, eptr);            /* Subject character; eptr advanced */
7683     if ((ims & PCRE_CASELESS) != 0)
7684       {
7685 #ifdef SUPPORT_UTF8
7686       if (c < 256)                      /* Only these are folded via lcc */
7687 #endif
7688       c = md->lcc[c];
7689       if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7690       }
7691     else
7692       {
7693       if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7694       }
7695     break;
7696
7697     /* Match a negated single one-byte character repeatedly. This is almost a
7698     repeat of the code for a repeated single character, but I haven't found a
7699     nice way of commoning these up that doesn't require a test of the
7700     positive/negative option for each character match. Maybe that wouldn't add
7701     very much to the time taken, but character matching *is* what this is all
7702     about... */
7703
7704     case OP_NOTEXACT:
7705     min = max = GET2(ecode, 1);
7706     ecode += 3;
7707     goto REPEATNOTCHAR;
7708
7709     case OP_NOTUPTO:
7710     case OP_NOTMINUPTO:
7711     min = 0;
7712     max = GET2(ecode, 1);
7713     minimize = *ecode == OP_NOTMINUPTO;
7714     ecode += 3;
7715     goto REPEATNOTCHAR;
7716
7717     case OP_NOTSTAR:
7718     case OP_NOTMINSTAR:
7719     case OP_NOTPLUS:
7720     case OP_NOTMINPLUS:
7721     case OP_NOTQUERY:
7722     case OP_NOTMINQUERY:
7723     c = *ecode++ - OP_NOTSTAR;
7724     minimize = (c & 1) != 0;
7725     min = rep_min[c];                 /* Pick up values from tables; */
7726     max = rep_max[c];                 /* zero for max => infinity */
7727     if (max == 0) max = INT_MAX;
7728
7729     /* Common code for all repeated single-byte matches. We can give up quickly
7730     if there are fewer than the minimum number of bytes left in the
7731     subject. */
7732
7733     REPEATNOTCHAR:
7734     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7735     fc = *ecode++;
7736
7737     /* The code is duplicated for the caseless and caseful cases, for speed,
7738     since matching characters is likely to be quite common. First, ensure the
7739     minimum number of matches are present. If min = max, continue at the same
7740     level without recursing. Otherwise, if minimizing, keep trying the rest of
7741     the expression and advancing one matching character if failing, up to the
7742     maximum. Alternatively, if maximizing, find the maximum number of
7743     characters and work backwards. */
7744
7745     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7746       max, eptr));
7747
7748     if ((ims & PCRE_CASELESS) != 0)
7749       {
7750       fc = md->lcc[fc];
7751 #ifdef SUPPORT_UTF8
7752       /* UTF-8 mode */
7753       if (md->utf8)
7754         {
7755         register int d;
7756         for (i = 1; i <= min; i++)
7757           {
7758           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); /* bytes != chars */
7759           GETCHARINC(d, eptr);
7760           if (d < 256) d = md->lcc[d];
7761           if (fc == d) RRETURN(MATCH_NOMATCH);
7762           }
7763         }
7764       else
7765 #endif
7766       /* Not UTF-8 mode */
7767         {
7768         for (i = 1; i <= min; i++)
7769           if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7770         }
7771
7772       if (min == max) continue;
7773
7774       if (minimize)
7775         {
7776 #ifdef SUPPORT_UTF8
7777         /* UTF-8 mode */
7778         if (md->utf8)
7779           {
7780           register int d;
7781           for (fi = min;; fi++)
7782             {
7783             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7784             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7785             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7786             GETCHARINC(d, eptr);        /* Read only after the limit checks */
7787             if (d < 256) d = md->lcc[d];
7788             if (fc == d) RRETURN(MATCH_NOMATCH);
7789             }
7790           }
7791         else
7792 #endif
7793         /* Not UTF-8 mode */
7794           {
7795           for (fi = min;; fi++)
7796             {
7797             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7798             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7799             if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7800               RRETURN(MATCH_NOMATCH);
7801             }
7802           }
7803         /* Control never gets here */
7804         }
7805
7806       /* Maximize case */
7807
7808       else
7809         {
7810         pp = eptr;
7811
7812 #ifdef SUPPORT_UTF8
7813         /* UTF-8 mode */
7814         if (md->utf8)
7815           {
7816           register int d;
7817           for (i = min; i < max; i++)
7818             {
7819             int len = 1;
7820             if (eptr >= md->end_subject) break;
7821             GETCHARLEN(d, eptr, len);
7822             if (d < 256) d = md->lcc[d];
7823             if (fc == d) break;
7824             eptr += len;
7825             }
7826           for(;;)
7827             {
7828             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7829             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7830             if (eptr-- == pp) break;        /* Stop if tried at original pos */
7831             BACKCHAR(eptr);
7832             }
7833           }
7834         else
7835 #endif
7836         /* Not UTF-8 mode */
7837           {
7838           for (i = min; i < max; i++)
7839             {
7840             if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7841             eptr++;
7842             }
7843           while (eptr >= pp)
7844             {
7845             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7846             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7847             eptr--;
7848             }
7849           }
7850
7851         RRETURN(MATCH_NOMATCH);
7852         }
7853       /* Control never gets here */
7854       }
7855
7856     /* Caseful comparisons */
7857
7858     else
7859       {
7860 #ifdef SUPPORT_UTF8
7861       /* UTF-8 mode */
7862       if (md->utf8)
7863         {
7864         register int d;
7865         for (i = 1; i <= min; i++)
7866           {
7867           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); /* bytes != chars */
7868           GETCHARINC(d, eptr);
7869           if (fc == d) RRETURN(MATCH_NOMATCH);
7870           }
7871         }
7872       else
7873 #endif
7874       /* Not UTF-8 mode */
7875         {
7876         for (i = 1; i <= min; i++)
7877           if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7878         }
7879
7880       if (min == max) continue;
7881
7882       if (minimize)
7883         {
7884 #ifdef SUPPORT_UTF8
7885         /* UTF-8 mode */
7886         if (md->utf8)
7887           {
7888           register int d;
7889           for (fi = min;; fi++)
7890             {
7891             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7892             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7893             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7894             GETCHARINC(d, eptr);        /* Read only after the limit checks */
7895             if (fc == d) RRETURN(MATCH_NOMATCH);
7896             }
7897           }
7898         else
7899 #endif
7900         /* Not UTF-8 mode */
7901           {
7902           for (fi = min;; fi++)
7903             {
7904             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7905             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7906             if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7907               RRETURN(MATCH_NOMATCH);
7908             }
7909           }
7910         /* Control never gets here */
7911         }
7912
7913       /* Maximize case */
7914
7915       else
7916         {
7917         pp = eptr;
7918
7919 #ifdef SUPPORT_UTF8
7920         /* UTF-8 mode */
7921         if (md->utf8)
7922           {
7923           register int d;
7924           for (i = min; i < max; i++)
7925             {
7926             int len = 1;
7927             if (eptr >= md->end_subject) break;
7928             GETCHARLEN(d, eptr, len);
7929             if (fc == d) break;
7930             eptr += len;
7931             }
7932           for(;;)
7933             {
7934             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7935             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7936             if (eptr-- == pp) break;        /* Stop if tried at original pos */
7937             BACKCHAR(eptr);
7938             }
7939           }
7940         else
7941 #endif
7942         /* Not UTF-8 mode */
7943           {
7944           for (i = min; i < max; i++)
7945             {
7946             if (eptr >= md->end_subject || fc == *eptr) break;
7947             eptr++;
7948             }
7949           while (eptr >= pp)
7950             {
7951             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7952             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7953             eptr--;
7954             }
7955           }
7956
7957         RRETURN(MATCH_NOMATCH);
7958         }
7959       }
7960     /* Control never gets here */
7961
7962     /* Match a single character type repeatedly; several different opcodes
7963     share code. This is very similar to the code for single characters, but we
7964     repeat it in the interests of efficiency. */
7965
7966     case OP_TYPEEXACT:
7967     min = max = GET2(ecode, 1);
7968     minimize = TRUE;
7969     ecode += 3;
7970     goto REPEATTYPE;
7971
7972     case OP_TYPEUPTO:
7973     case OP_TYPEMINUPTO:
7974     min = 0;
7975     max = GET2(ecode, 1);
7976     minimize = *ecode == OP_TYPEMINUPTO;
7977     ecode += 3;
7978     goto REPEATTYPE;
7979
7980     case OP_TYPESTAR:
7981     case OP_TYPEMINSTAR:
7982     case OP_TYPEPLUS:
7983     case OP_TYPEMINPLUS:
7984     case OP_TYPEQUERY:
7985     case OP_TYPEMINQUERY:
7986     c = *ecode++ - OP_TYPESTAR;
7987     minimize = (c & 1) != 0;
7988     min = rep_min[c];                 /* Pick up values from tables; */
7989     max = rep_max[c];                 /* zero for max => infinity */
7990     if (max == 0) max = INT_MAX;
7991
7992     /* Common code for all repeated single character type matches. Note that
7993     in UTF-8 mode, '.' matches a character of any length, but for the other
7994     character types, the valid characters are all one-byte long. */
7995
7996     REPEATTYPE:
7997     ctype = *ecode++;      /* Code for the character type */
7998
7999 #ifdef SUPPORT_UCP
8000     if (ctype == OP_PROP || ctype == OP_NOTPROP)
8001       {
8002       prop_fail_result = ctype == OP_NOTPROP;
8003       prop_type = *ecode++;
8004       if (prop_type >= 128)
8005         {
8006         prop_test_against = prop_type - 128;
8007         prop_test_variable = &prop_category;
8008         }
8009       else
8010         {
8011         prop_test_against = prop_type;
8012         prop_test_variable = &prop_chartype;
8013         }
8014       }
8015     else prop_type = -1;
8016 #endif
8017
8018     /* First, ensure the minimum number of matches are present. Use inline
8019     code for maximizing the speed, and do the type test once at the start
8020     (i.e. keep it out of the loop). Also we can test that there are at least
8021     the minimum number of bytes before we start. This isn't as effective in
8022     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8023     is tidier. Also separate the UCP code, which can be the same for both UTF-8
8024     and single-bytes. */
8025
8026     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8027     if (min > 0)
8028       {
8029 #ifdef SUPPORT_UCP
8030       if (prop_type > 0)
8031         {
8032         for (i = 1; i <= min; i++)
8033           {
8034           GETCHARINC(c, eptr);
8035           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8036           if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8037             RRETURN(MATCH_NOMATCH);
8038           }
8039         }
8040
8041       /* Match extended Unicode sequences. We will get here only if the
8042       support is in the binary; otherwise a compile-time error occurs. */
8043
8044       else if (ctype == OP_EXTUNI)
8045         {
8046         for (i = 1; i <= min; i++)
8047           {
8048           GETCHARINCTEST(c, eptr);
8049           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8050           if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8051           while (eptr < md->end_subject)
8052             {
8053             int len = 1;
8054             if (!md->utf8) c = *eptr; else
8055               {
8056               GETCHARLEN(c, eptr, len);
8057               }
8058             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8059             if (prop_category != ucp_M) break;
8060             eptr += len;
8061             }
8062           }
8063         }
8064
8065       else
8066 #endif     /* SUPPORT_UCP */
8067
8068 /* Handle all other cases when the coding is UTF-8 */
8069
8070 #ifdef SUPPORT_UTF8
8071       if (md->utf8) switch(ctype)
8072         {
8073         case OP_ANY:
8074         for (i = 1; i <= min; i++)
8075           {
8076           if (eptr >= md->end_subject ||
8077              (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8078             RRETURN(MATCH_NOMATCH);
8079           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8080           }
8081         break;
8082
8083         case OP_ANYBYTE:
8084         eptr += min;
8085         break;
8086
8087         case OP_NOT_DIGIT:
8088         for (i = 1; i <= min; i++)
8089           {
8090           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8091           GETCHARINC(c, eptr);
8092           if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8093             RRETURN(MATCH_NOMATCH);
8094           }
8095         break;
8096
8097         case OP_DIGIT:
8098         for (i = 1; i <= min; i++)
8099           {
8100           if (eptr >= md->end_subject ||
8101              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8102             RRETURN(MATCH_NOMATCH);
8103           /* No need to skip more bytes - we know it's a 1-byte character */
8104           }
8105         break;
8106
8107         case OP_NOT_WHITESPACE:
8108         for (i = 1; i <= min; i++)
8109           {
8110           if (eptr >= md->end_subject ||
8111              (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
8112             RRETURN(MATCH_NOMATCH);
8113           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8114           }
8115         break;
8116
8117         case OP_WHITESPACE:
8118         for (i = 1; i <= min; i++)
8119           {
8120           if (eptr >= md->end_subject ||
8121              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8122             RRETURN(MATCH_NOMATCH);
8123           /* No need to skip more bytes - we know it's a 1-byte character */
8124           }
8125         break;
8126
8127         case OP_NOT_WORDCHAR:
8128         for (i = 1; i <= min; i++)
8129           {
8130           if (eptr >= md->end_subject ||
8131              (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
8132             RRETURN(MATCH_NOMATCH);
8133           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8134           }
8135         break;
8136
8137         case OP_WORDCHAR:
8138         for (i = 1; i <= min; i++)
8139           {
8140           if (eptr >= md->end_subject ||
8141              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8142             RRETURN(MATCH_NOMATCH);
8143           /* No need to skip more bytes - we know it's a 1-byte character */
8144           }
8145         break;
8146
8147         default:
8148         RRETURN(PCRE_ERROR_INTERNAL);
8149         }  /* End switch(ctype) */
8150
8151       else
8152 #endif     /* SUPPORT_UTF8 */
8153
8154       /* Code for the non-UTF-8 case for minimum matching of operators other
8155       than OP_PROP and OP_NOTPROP. */
8156
8157       switch(ctype)
8158         {
8159         case OP_ANY:
8160         if ((ims & PCRE_DOTALL) == 0)
8161           {
8162           for (i = 1; i <= min; i++)
8163             if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8164           }
8165         else eptr += min;
8166         break;
8167
8168         case OP_ANYBYTE:
8169         eptr += min;
8170         break;
8171
8172         case OP_NOT_DIGIT:
8173         for (i = 1; i <= min; i++)
8174           if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8175         break;
8176
8177         case OP_DIGIT:
8178         for (i = 1; i <= min; i++)
8179           if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8180         break;
8181
8182         case OP_NOT_WHITESPACE:
8183         for (i = 1; i <= min; i++)
8184           if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8185         break;
8186
8187         case OP_WHITESPACE:
8188         for (i = 1; i <= min; i++)
8189           if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8190         break;
8191
8192         case OP_NOT_WORDCHAR:
8193         for (i = 1; i <= min; i++)
8194           if ((md->ctypes[*eptr++] & ctype_word) != 0)
8195             RRETURN(MATCH_NOMATCH);
8196         break;
8197
8198         case OP_WORDCHAR:
8199         for (i = 1; i <= min; i++)
8200           if ((md->ctypes[*eptr++] & ctype_word) == 0)
8201             RRETURN(MATCH_NOMATCH);
8202         break;
8203
8204         default:
8205         RRETURN(PCRE_ERROR_INTERNAL);
8206         }
8207       }
8208
8209     /* If min = max, continue at the same level without recursing */
8210
8211     if (min == max) continue;
8212
8213     /* If minimizing, we have to test the rest of the pattern before each
8214     subsequent match. Again, separate the UTF-8 case for speed, and also
8215     separate the UCP cases. */
8216
8217     if (minimize)
8218       {
8219 #ifdef SUPPORT_UCP
8220       if (prop_type > 0)
8221         {
8222         for (fi = min;; fi++)
8223           {
8224           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8225           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8226           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8227           GETCHARINC(c, eptr);
8228           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8229           if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8230             RRETURN(MATCH_NOMATCH);
8231           }
8232         }
8233
8234       /* Match extended Unicode sequences. We will get here only if the
8235       support is in the binary; otherwise a compile-time error occurs. */
8236
8237       else if (ctype == OP_EXTUNI)
8238         {
8239         for (fi = min;; fi++)
8240           {
8241           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8242           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8243           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8244           GETCHARINCTEST(c, eptr);
8245           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8246           if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8247           while (eptr < md->end_subject)
8248             {
8249             int len = 1;
8250             if (!md->utf8) c = *eptr; else
8251               {
8252               GETCHARLEN(c, eptr, len);
8253               }
8254             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8255             if (prop_category != ucp_M) break;
8256             eptr += len;
8257             }
8258           }
8259         }
8260
8261       else
8262 #endif     /* SUPPORT_UCP */
8263
8264 #ifdef SUPPORT_UTF8
8265       /* UTF-8 mode */
8266       if (md->utf8)
8267         {
8268         for (fi = min;; fi++)
8269           {
8270           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8271           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8272           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8273
8274           GETCHARINC(c, eptr);
8275           switch(ctype)
8276             {
8277             case OP_ANY:
8278             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8279             break;
8280
8281             case OP_ANYBYTE:
8282             break;
8283
8284             case OP_NOT_DIGIT:
8285             if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8286               RRETURN(MATCH_NOMATCH);
8287             break;
8288
8289             case OP_DIGIT:
8290             if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8291               RRETURN(MATCH_NOMATCH);
8292             break;
8293
8294             case OP_NOT_WHITESPACE:
8295             if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8296               RRETURN(MATCH_NOMATCH);
8297             break;
8298
8299             case OP_WHITESPACE:
8300             if  (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8301               RRETURN(MATCH_NOMATCH);
8302             break;
8303
8304             case OP_NOT_WORDCHAR:
8305             if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8306               RRETURN(MATCH_NOMATCH);
8307             break;
8308
8309             case OP_WORDCHAR:
8310             if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
8311               RRETURN(MATCH_NOMATCH);
8312             break;
8313
8314             default:
8315             RRETURN(PCRE_ERROR_INTERNAL);
8316             }
8317           }
8318         }
8319       else
8320 #endif
8321       /* Not UTF-8 mode */
8322         {
8323         for (fi = min;; fi++)
8324           {
8325           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8326           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8327           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8328           c = *eptr++;
8329           switch(ctype)
8330             {
8331             case OP_ANY:
8332             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8333             break;
8334
8335             case OP_ANYBYTE:
8336             break;
8337
8338             case OP_NOT_DIGIT:
8339             if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8340             break;
8341
8342             case OP_DIGIT:
8343             if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8344             break;
8345
8346             case OP_NOT_WHITESPACE:
8347             if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8348             break;
8349
8350             case OP_WHITESPACE:
8351             if  ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8352             break;
8353
8354             case OP_NOT_WORDCHAR:
8355             if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8356             break;
8357
8358             case OP_WORDCHAR:
8359             if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8360             break;
8361
8362             default:
8363             RRETURN(PCRE_ERROR_INTERNAL);
8364             }
8365           }
8366         }
8367       /* Control never gets here */
8368       }
8369
8370     /* If maximizing it is worth using inline code for speed, doing the type
8371     test once at the start (i.e. keep it out of the loop). Again, keep the
8372     UTF-8 and UCP stuff separate. */
8373
8374     else
8375       {
8376       pp = eptr;  /* Remember where we started */
8377
8378 #ifdef SUPPORT_UCP
8379       if (prop_type > 0)
8380         {
8381         for (i = min; i < max; i++)
8382           {
8383           int len = 1;
8384           if (eptr >= md->end_subject) break;
8385           GETCHARLEN(c, eptr, len);
8386           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8387           if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8388             break;
8389           eptr+= len;
8390           }
8391
8392         /* eptr is now past the end of the maximum run */
8393
8394         for(;;)
8395           {
8396           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8397           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8398           if (eptr-- == pp) break;        /* Stop if tried at original pos */
8399           BACKCHAR(eptr);
8400           }
8401         }
8402
8403       /* Match extended Unicode sequences. We will get here only if the
8404       support is in the binary; otherwise a compile-time error occurs. */
8405
8406       else if (ctype == OP_EXTUNI)
8407         {
8408         for (i = min; i < max; i++)
8409           {
8410           if (eptr >= md->end_subject) break;
8411           GETCHARINCTEST(c, eptr);
8412           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8413           if (prop_category == ucp_M) break;
8414           while (eptr < md->end_subject)
8415             {
8416             int len = 1;
8417             if (!md->utf8) c = *eptr; else
8418               {
8419               GETCHARLEN(c, eptr, len);
8420               }
8421             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8422             if (prop_category != ucp_M) break;
8423             eptr += len;
8424             }
8425           }
8426
8427         /* eptr is now past the end of the maximum run */
8428
8429         for(;;)
8430           {
8431           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8432           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8433           if (eptr-- == pp) break;        /* Stop if tried at original pos */
8434           for (;;)                        /* Move back over one extended */
8435             {
8436             int len = 1;
8437             BACKCHAR(eptr);
8438             if (!md->utf8) c = *eptr; else
8439               {
8440               GETCHARLEN(c, eptr, len);
8441               }
8442             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8443             if (prop_category != ucp_M) break;
8444             eptr--;
8445             }
8446           }
8447         }
8448
8449       else
8450 #endif   /* SUPPORT_UCP */
8451
8452 #ifdef SUPPORT_UTF8
8453       /* UTF-8 mode */
8454
8455       if (md->utf8)
8456         {
8457         switch(ctype)
8458           {
8459           case OP_ANY:
8460
8461           /* Special code is required for UTF8, but when the maximum is unlimited
8462           we don't need it, so we repeat the non-UTF8 code. This is probably
8463           worth it, because .* is quite a common idiom. */
8464
8465           if (max < INT_MAX)
8466             {
8467             if ((ims & PCRE_DOTALL) == 0)
8468               {
8469               for (i = min; i < max; i++)
8470                 {
8471                 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8472                 eptr++;
8473                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8474                 }
8475               }
8476             else
8477               {
8478               for (i = min; i < max; i++)
8479                 {
8480                 eptr++;
8481                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8482                 }
8483               }
8484             }
8485
8486           /* Handle unlimited UTF-8 repeat */
8487
8488           else
8489             {
8490             if ((ims & PCRE_DOTALL) == 0)
8491               {
8492               for (i = min; i < max; i++)
8493                 {
8494                 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8495                 eptr++;
8496                 }
8497               break;
8498               }
8499             else
8500               {
8501               c = max - min;
8502               if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8503               eptr += c;
8504               }
8505             }
8506           break;
8507
8508           /* The byte case is the same as non-UTF8 */
8509
8510           case OP_ANYBYTE:
8511           c = max - min;
8512           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8513           eptr += c;
8514           break;
8515
8516           case OP_NOT_DIGIT:
8517           for (i = min; i < max; i++)
8518             {
8519             int len = 1;
8520             if (eptr >= md->end_subject) break;
8521             GETCHARLEN(c, eptr, len);
8522             if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8523             eptr+= len;
8524             }
8525           break;
8526
8527           case OP_DIGIT:
8528           for (i = min; i < max; i++)
8529             {
8530             int len = 1;
8531             if (eptr >= md->end_subject) break;
8532             GETCHARLEN(c, eptr, len);
8533             if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8534             eptr+= len;
8535             }
8536           break;
8537
8538           case OP_NOT_WHITESPACE:
8539           for (i = min; i < max; i++)
8540             {
8541             int len = 1;
8542             if (eptr >= md->end_subject) break;
8543             GETCHARLEN(c, eptr, len);
8544             if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8545             eptr+= len;
8546             }
8547           break;
8548
8549           case OP_WHITESPACE:
8550           for (i = min; i < max; i++)
8551             {
8552             int len = 1;
8553             if (eptr >= md->end_subject) break;
8554             GETCHARLEN(c, eptr, len);
8555             if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8556             eptr+= len;
8557             }
8558           break;
8559
8560           case OP_NOT_WORDCHAR:
8561           for (i = min; i < max; i++)
8562             {
8563             int len = 1;
8564             if (eptr >= md->end_subject) break;
8565             GETCHARLEN(c, eptr, len);
8566             if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8567             eptr+= len;
8568             }
8569           break;
8570
8571           case OP_WORDCHAR:
8572           for (i = min; i < max; i++)
8573             {
8574             int len = 1;
8575             if (eptr >= md->end_subject) break;
8576             GETCHARLEN(c, eptr, len);
8577             if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8578             eptr+= len;
8579             }
8580           break;
8581
8582           default:
8583           RRETURN(PCRE_ERROR_INTERNAL);
8584           }
8585
8586         /* eptr is now past the end of the maximum run */
8587
8588         for(;;)
8589           {
8590           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8591           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8592           if (eptr-- == pp) break;        /* Stop if tried at original pos */
8593           BACKCHAR(eptr);
8594           }
8595         }
8596       else
8597 #endif
8598
8599       /* Not UTF-8 mode */
8600         {
8601         switch(ctype)
8602           {
8603           case OP_ANY:
8604           if ((ims & PCRE_DOTALL) == 0)
8605             {
8606             for (i = min; i < max; i++)
8607               {
8608               if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8609               eptr++;
8610               }
8611             break;
8612             }
8613           /* For DOTALL case, fall through and treat as \C */
8614
8615           case OP_ANYBYTE:
8616           c = max - min;
8617           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8618           eptr += c;
8619           break;
8620
8621           case OP_NOT_DIGIT:
8622           for (i = min; i < max; i++)
8623             {
8624             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8625               break;
8626             eptr++;
8627             }
8628           break;
8629
8630           case OP_DIGIT:
8631           for (i = min; i < max; i++)
8632             {
8633             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8634               break;
8635             eptr++;
8636             }
8637           break;
8638
8639           case OP_NOT_WHITESPACE:
8640           for (i = min; i < max; i++)
8641             {
8642             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8643               break;
8644             eptr++;
8645             }
8646           break;
8647
8648           case OP_WHITESPACE:
8649           for (i = min; i < max; i++)
8650             {
8651             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8652               break;
8653             eptr++;
8654             }
8655           break;
8656
8657           case OP_NOT_WORDCHAR:
8658           for (i = min; i < max; i++)
8659             {
8660             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8661               break;
8662             eptr++;
8663             }
8664           break;
8665
8666           case OP_WORDCHAR:
8667           for (i = min; i < max; i++)
8668             {
8669             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8670               break;
8671             eptr++;
8672             }
8673           break;
8674
8675           default:
8676           RRETURN(PCRE_ERROR_INTERNAL);
8677           }
8678
8679         /* eptr is now past the end of the maximum run */
8680
8681         while (eptr >= pp)
8682           {
8683           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8684           eptr--;
8685           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8686           }
8687         }
8688
8689       /* Get here if we can't make it match with any permitted repetitions */
8690
8691       RRETURN(MATCH_NOMATCH);
8692       }
8693     /* Control never gets here */
8694
8695     /* There's been some horrible disaster. Since all codes > OP_BRA are
8696     for capturing brackets, and there shouldn't be any gaps between 0 and
8697     OP_BRA, arrival here can only mean there is something seriously wrong
8698     in the code above or the OP_xxx definitions. */
8699
8700     default:
8701     DPRINTF(("Unknown opcode %d\n", *ecode));
8702     RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8703     }
8704
8705   /* Do not stick any code in here without much thought; it is assumed
8706   that "continue" in the code above comes out to here to repeat the main
8707   loop. */
8708
8709   }             /* End of main loop */
8710 /* Control never reaches here */
8711 }
8712
8713
8714 /***************************************************************************
8715 ****************************************************************************
8716                    RECURSION IN THE match() FUNCTION
8717 Undefine all the macros that were defined above to handle this. Later code
8718 reuses some of these names (pcre_exec declares ims, length and flags). */
8719
8720 #ifdef NO_RECURSE   /* The names below are macros only in this configuration */
8721 #undef eptr
8722 #undef ecode
8723 #undef offset_top
8724 #undef ims
8725 #undef eptrb
8726 #undef flags
8727
8728 #undef callpat
8729 #undef charptr
8730 #undef data
8731 #undef next
8732 #undef pp
8733 #undef prev
8734 #undef saved_eptr
8735
8736 #undef new_recursive
8737
8738 #undef cur_is_word
8739 #undef condition
8740 #undef minimize
8741 #undef prev_is_word
8742
8743 #undef original_ims
8744
8745 #undef ctype
8746 #undef length
8747 #undef max
8748 #undef min
8749 #undef number
8750 #undef offset
8751 #undef op
8752 #undef save_capture_last
8753 #undef save_offset1
8754 #undef save_offset2
8755 #undef save_offset3
8756 #undef stacksave
8757
8758 #undef newptrb
8759
8760 #endif  /* NO_RECURSE */
8761
8762 /* fc and fi are macros whether or not NO_RECURSE is defined */
8763
8764 #undef fc
8765 #undef fi
8766
8767 /***************************************************************************
8768 ***************************************************************************/
8769
8770
8771
8772 /*************************************************
8773 *         Execute a Regular Expression           *
8774 *************************************************/
8775
8776 /* This function applies a compiled re to a subject string and picks out
8777 portions of the string if it matches. Two elements in the vector are set for
8778 each substring: the offsets to the start and end of the substring.
8779
8780 Arguments:
8781   argument_re     points to the compiled expression
8782   extra_data      points to extra data or is NULL
8783   subject         points to the subject string
8784   length          length of subject string (may contain binary zeros)
8785   start_offset    where to start in the subject string
8786   options         option bits
8787   offsets         points to a vector of ints to be filled in with offsets
8788   offsetcount     the number of elements in the vector
8789
8790 Returns:          > 0 => success; value is the number of elements filled in
8791                   = 0 => success, but offsets is not big enough
8792                    -1 => failed to match
8793                  < -1 => some kind of unexpected problem
8794 */
8795
8796 EXPORT int
8797 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8798   const char *subject, int length, int start_offset, int options, int *offsets,
8799   int offsetcount)
8800 {
8801 int rc, resetcount, ocount;
8802 int first_byte = -1;
8803 int req_byte = -1;
8804 int req_byte2 = -1;
8805 unsigned long int ims = 0;
8806 BOOL using_temporary_offsets = FALSE;
8807 BOOL anchored;
8808 BOOL startline;
8809 BOOL first_byte_caseless = FALSE;
8810 BOOL req_byte_caseless = FALSE;
8811 match_data match_block;
8812 const uschar *tables;
8813 const uschar *start_bits = NULL;
8814 const uschar *start_match = (const uschar *)subject + start_offset;
8815 const uschar *end_subject;
8816 const uschar *req_byte_ptr = start_match - 1;
8817
8818 pcre_study_data internal_study;
8819 const pcre_study_data *study;
8820
8821 real_pcre internal_re;
8822 const real_pcre *external_re = (const real_pcre *)argument_re;
8823 const real_pcre *re = external_re;
8824
8825 /* Plausibility checks */
8826
8827 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8828 if (re == NULL || subject == NULL ||
8829    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8830 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8831
8832 /* Fish out the optional data from the extra_data structure, first setting
8833 the default values. */
8834
8835 study = NULL;
8836 match_block.match_limit = MATCH_LIMIT;
8837 match_block.callout_data = NULL;
8838
8839 /* The table pointer is always in native byte order. */
8840
8841 tables = external_re->tables;
8842
8843 if (extra_data != NULL)
8844   {
8845   register unsigned int flags = extra_data->flags;
8846   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8847     study = (const pcre_study_data *)extra_data->study_data;
8848   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8849     match_block.match_limit = extra_data->match_limit;
8850   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8851     match_block.callout_data = extra_data->callout_data;
8852   if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8853   }
8854
8855 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
8856 is a feature that makes it possible to save compiled regex and re-use them
8857 in other programs later. */
8858
8859 if (tables == NULL) tables = pcre_default_tables;
8860
8861 /* Check that the first field in the block is the magic number. If it is not,
8862 test for a regex that was compiled on a host of opposite endianness. If this is
8863 the case, flipped values are put in internal_re and internal_study if there was
8864 study data too. */
8865
8866 if (re->magic_number != MAGIC_NUMBER)
8867   {
8868   re = try_flipped(re, &internal_re, study, &internal_study);
8869   if (re == NULL) return PCRE_ERROR_BADMAGIC;
8870   if (study != NULL) study = &internal_study;
8871   }
8872
8873 /* Set up other data */
8874
8875 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8876 startline = (re->options & PCRE_STARTLINE) != 0;
8877
8878 /* The code starts after the real_pcre block and the capture name table. */
8879
8880 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8881   re->name_count * re->name_entry_size;
8882
8883 match_block.start_subject = (const uschar *)subject;
8884 match_block.start_offset = start_offset;
8885 match_block.end_subject = match_block.start_subject + length;
8886 end_subject = match_block.end_subject;
8887
8888 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8889 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8890
8891 match_block.notbol = (options & PCRE_NOTBOL) != 0;
8892 match_block.noteol = (options & PCRE_NOTEOL) != 0;
8893 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8894 match_block.partial = (options & PCRE_PARTIAL) != 0;
8895 match_block.hitend = FALSE;
8896
8897 match_block.recursive = NULL;                   /* No recursion at top level */
8898
8899 match_block.lcc = tables + lcc_offset;
8900 match_block.ctypes = tables + ctypes_offset;
8901
8902 /* Partial matching is supported only for a restricted set of regexes at the
8903 moment. */
8904
8905 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8906   return PCRE_ERROR_BADPARTIAL;
8907
8908 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
8909 back the character offset. */
8910
8911 #ifdef SUPPORT_UTF8
8912 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8913   {
8914   if (valid_utf8((uschar *)subject, length) >= 0)
8915     return PCRE_ERROR_BADUTF8;
8916   if (start_offset > 0 && start_offset < length)
8917     {
8918     int tb = ((uschar *)subject)[start_offset];
8919     if (tb > 127)
8920       {
8921       tb &= 0xc0;
8922       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8923       }
8924     }
8925   }
8926 #endif
8927
8928 /* The ims options can vary during the matching as a result of the presence
8929 of (?ims) items in the pattern. They are kept in a local variable so that
8930 restoring at the exit of a group is easy. */
8931
8932 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8933
8934 /* If the expression has got more back references than the offsets supplied can
8935 hold, we get a temporary chunk of working store to use during the matching.
8936 Otherwise, we can use the vector supplied, rounding down its size to a multiple
8937 of 3. */
8938
8939 ocount = offsetcount - (offsetcount % 3);
8940
8941 if (re->top_backref > 0 && re->top_backref >= ocount/3)
8942   {
8943   ocount = re->top_backref * 3 + 3;
8944   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8945   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8946   using_temporary_offsets = TRUE;
8947   DPRINTF(("Got memory to hold back references\n"));
8948   }
8949 else match_block.offset_vector = offsets;
8950
8951 match_block.offset_end = ocount;
8952 match_block.offset_max = (2*ocount)/3;
8953 match_block.offset_overflow = FALSE;
8954 match_block.capture_last = -1;
8955
8956 /* Compute the minimum number of offsets that we need to reset each time. Doing
8957 this makes a huge difference to execution time when there aren't many brackets
8958 in the pattern. */
8959
8960 resetcount = 2 + re->top_bracket * 2;
8961 if (resetcount > offsetcount) resetcount = ocount;
8962
8963 /* Reset the working variable associated with each extraction. These should
8964 never be used unless previously set, but they get saved and restored, and so we
8965 initialize them to avoid reading uninitialized locations. */
8966
8967 if (match_block.offset_vector != NULL)
8968   {
8969   register int *iptr = match_block.offset_vector + ocount;
8970   register int *iend = iptr - resetcount/2 + 1;
8971   while (--iptr >= iend) *iptr = -1;
8972   }
8973
8974 /* Set up the first character to match, if available. The first_byte value is
8975 never set for an anchored regular expression, but the anchoring may be forced
8976 at run time, so we have to test for anchoring. The first char may be unset for
8977 an unanchored pattern, of course. If there's no first char and the pattern was
8978 studied, there may be a bitmap of possible first characters. */
8979
8980 if (!anchored)
8981   {
8982   if ((re->options & PCRE_FIRSTSET) != 0)
8983     {
8984     first_byte = re->first_byte & 255;
8985     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8986       first_byte = match_block.lcc[first_byte];
8987     }
8988   else
8989     if (!startline && study != NULL &&
8990       (study->options & PCRE_STUDY_MAPPED) != 0)
8991         start_bits = study->start_bits;
8992   }
8993
8994 /* For anchored or unanchored matches, there may be a "last known required
8995 character" set. */
8996
8997 if ((re->options & PCRE_REQCHSET) != 0)
8998   {
8999   req_byte = re->req_byte & 255;
9000   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9001   req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */
9002   }
9003
9004 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
9005 the loop runs just once. */
9006
9007 do
9008   {
9009   /* Reset the maximum number of extractions we might see. */
9010
9011   if (match_block.offset_vector != NULL)
9012     {
9013     register int *iptr = match_block.offset_vector;
9014     register int *iend = iptr + resetcount;
9015     while (iptr < iend) *iptr++ = -1;
9016     }
9017
9018   /* Advance to a unique first char if possible */
9019
9020   if (first_byte >= 0)
9021     {
9022     if (first_byte_caseless)
9023       while (start_match < end_subject &&
9024              match_block.lcc[*start_match] != first_byte)
9025         start_match++;
9026     else
9027       while (start_match < end_subject && *start_match != first_byte)
9028         start_match++;
9029     }
9030
9031   /* Or to just after \n for a multiline match if possible */
9032
9033   else if (startline)
9034     {
9035     if (start_match > match_block.start_subject + start_offset)
9036       {
9037       while (start_match < end_subject && start_match[-1] != NEWLINE)
9038         start_match++;
9039       }
9040     }
9041
9042   /* Or to a non-unique first char after study */
9043
9044   else if (start_bits != NULL)
9045     {
9046     while (start_match < end_subject)
9047       {
9048       register unsigned int c = *start_match;
9049       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9050       }
9051     }
9052
9053 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
9054   printf(">>>> Match against: ");
9055   pchars(start_match, end_subject - start_match, TRUE, &match_block);
9056   printf("\n");
9057 #endif
9058
9059   /* If req_byte is set, we know that that character must appear in the subject
9060   for the match to succeed. If the first character is set, req_byte must be
9061   later in the subject; otherwise the test starts at the match point. This
9062   optimization can save a huge amount of backtracking in patterns with nested
9063   unlimited repeats that aren't going to match. Writing separate code for
9064   cased/caseless versions makes it go faster, as does using an autoincrement
9065   and backing off on a match.
9066
9067   HOWEVER: when the subject string is very, very long, searching to its end can
9068   take a long time, and give bad performance on quite ordinary patterns. This
9069   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9070   don't do this when the string is sufficiently long.
9071
9072   ALSO: this processing is disabled when partial matching is requested.
9073   */
9074
9075   if (req_byte >= 0 &&
9076       end_subject - start_match < REQ_BYTE_MAX &&
9077       !match_block.partial)
9078     {
9079     register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9080
9081     /* We don't need to repeat the search if we haven't yet reached the
9082     place we found it at last time. */
9083
9084     if (p > req_byte_ptr)
9085       {
9086       if (req_byte_caseless)
9087         {
9088         while (p < end_subject)
9089           {
9090           register int pp = *p++;
9091           if (pp == req_byte || pp == req_byte2) { p--; break; }
9092           }
9093         }
9094       else
9095         {
9096         while (p < end_subject)
9097           {
9098           if (*p++ == req_byte) { p--; break; }
9099           }
9100         }
9101
9102       /* If we can't find the required character, break the matching loop */
9103
9104       if (p >= end_subject) break;
9105
9106       /* If we have found the required character, save the point where we
9107       found it, so that we don't search again next time round the loop if
9108       the start hasn't passed this character yet. */
9109
9110       req_byte_ptr = p;
9111       }
9112     }
9113
9114   /* When a match occurs, substrings will be set for all internal extractions;
9115   we just need to set up the whole thing as substring 0 before returning. If
9116   there were too many extractions, set the return code to zero. In the case
9117   where we had to get some local store to hold offsets for backreferences, copy
9118   those back references that we can. In this case there need not be overflow
9119   if certain parts of the pattern were not used. */
9120
9121   match_block.start_match = start_match;
9122   match_block.match_call_count = 0;
9123
9124   rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9125     match_isgroup);
9126
9127   if (rc == MATCH_NOMATCH)
9128     {
9129     start_match++;
9130 #ifdef SUPPORT_UTF8
9131     if (match_block.utf8)
9132       while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9133         start_match++;
9134 #endif
9135     continue;
9136     }
9137
9138   if (rc != MATCH_MATCH)
9139     {
9140     DPRINTF((">>>> error: returning %d\n", rc));
9141     return rc;
9142     }
9143
9144   /* We have a match! Copy the offset information from temporary store if
9145   necessary */
9146
9147   if (using_temporary_offsets)
9148     {
9149     if (offsetcount >= 4)
9150       {
9151       memcpy(offsets + 2, match_block.offset_vector + 2,
9152         (offsetcount - 2) * sizeof(int));
9153       DPRINTF(("Copied offsets from temporary memory\n"));
9154       }
9155     if (match_block.end_offset_top > offsetcount)
9156       match_block.offset_overflow = TRUE;
9157
9158     DPRINTF(("Freeing temporary memory\n"));
9159     (pcre_free)(match_block.offset_vector);
9160     }
9161
9162   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9163
9164   if (offsetcount < 2) rc = 0; else
9165     {
9166     offsets[0] = start_match - match_block.start_subject;
9167     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9168     }
9169
9170   DPRINTF((">>>> returning %d\n", rc));
9171   return rc;
9172   }
9173
9174 /* This "while" is the end of the "do" above */
9175
9176 while (!anchored && start_match <= end_subject);
9177
9178 if (using_temporary_offsets)
9179   {
9180   DPRINTF(("Freeing temporary memory\n"));
9181   (pcre_free)(match_block.offset_vector);
9182   }
9183
9184 if (match_block.partial && match_block.hitend)
9185   {
9186   DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9187   return PCRE_ERROR_PARTIAL;
9188   }
9189 else
9190   {
9191   DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9192   return PCRE_ERROR_NOMATCH;
9193   }
9194 }
9195
9196 /* End of pcre.c */