Install PCRE 6.2.
[exim.git] / src / src / pcre / pcre_compile.c
1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
11 Copyright (c) 1997-2005 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
45
46
47 #include "pcre_internal.h"
48
49
50 /*************************************************
51 * Code parameters and static tables *
52 *************************************************/
53
54 /* Maximum number of items on the nested bracket stacks at compile time. This
55 applies to the nesting of all kinds of parentheses. It does not limit
56 un-nested, non-capturing parentheses. This number can be made bigger if
57 necessary - it is used to dimension one int and one unsigned char vector at
58 compile time. */
59
60 #define BRASTACK_SIZE 200
61
62
63 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
64 are simple data values; negative values are for special things like \d and so
65 on. Zero means further processing is needed (for things like \x), or the escape
66 is invalid. */
67
68 #if !EBCDIC /* This is the "normal" table for ASCII systems */
/* check_escape() indexes this variant with (c - '0'), after rejecting any c
outside '0'..'z'. All ten digit entries are zero, so digits always reach the
"further processing" switch in check_escape(). */
69 static const short int escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
78 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
79 0, 0, -ESC_z /* x - z */
80 };
81
82 #else /* This is the "abnormal" table for EBCDIC systems */
/* check_escape() indexes this variant with (c - 0x48). The comment that starts
each row is the hexadecimal character code of the row's first entry; each row
holds eight entries. */
83 static const short int escapes[] = {
84 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
85 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
86 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
87 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
88 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
89 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
90 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
91 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
92 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
93 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
94 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
95 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
96 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
97 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
98 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
99 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
100 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
101 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
102 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
103 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
104 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
105 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
106 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
107 };
108 #endif
109
110
111 /* Tables of names of POSIX character classes and their lengths. The list is
112 terminated by a zero length entry. The first three must be alpha, lower, upper,
113 as this is assumed for handling case independence. */
114
/* POSIX class names. The order is significant: posix_name_lengths[] lists the
lengths in this same order, and the rows of posix_class_maps[] are labelled in
this same order. */
115 static const char *const posix_names[] = {
116 "alpha", "lower", "upper",
117 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
118 "print", "punct", "space", "word", "xdigit" };
119
/* Length of each posix_names[] entry, in the same order (twelve names of 5
characters, then "word" = 4 and "xdigit" = 6), followed by the terminating 0. */
120 static const uschar posix_name_lengths[] = {
121 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
122
123 /* Table of class bit maps for each POSIX class; up to three may be combined
124 to form the class. The table for [:blank:] is dynamically modified to remove
125 the vertical space characters. */
126
/* Three ints per class, one row per name in posix_names[] order. Each row
lists the cbit_xxx maps that make up the class; -1 appears to mark an unused
slot (NOTE(review): confirm against the code that consumes this table). */
127 static const int posix_class_maps[] = {
128 cbit_lower, cbit_upper, -1, /* alpha */
129 cbit_lower, -1, -1, /* lower */
130 cbit_upper, -1, -1, /* upper */
131 cbit_digit, cbit_lower, cbit_upper, /* alnum */
132 cbit_print, cbit_cntrl, -1, /* ascii */
133 cbit_space, -1, -1, /* blank - a GNU extension */
134 cbit_cntrl, -1, -1, /* cntrl */
135 cbit_digit, -1, -1, /* digit */
136 cbit_graph, -1, -1, /* graph */
137 cbit_print, -1, -1, /* print */
138 cbit_punct, -1, -1, /* punct */
139 cbit_space, -1, -1, /* space */
140 cbit_word, -1, -1, /* word - a Perl extension */
141 cbit_xdigit,-1, -1 /* xdigit */
142 };
143
144
145 /* The texts of compile-time error messages. These are "char *" because they
146 are passed to the outside world. */
147
/* One message per compile-time error number; the position in this array is
the error number. The bare numeric comments give the index of the entry that
follows them (every fifth entry). Within this file, the uses of ERR1, ERR2,
ERR3, ERR4, ERR5, ERR34, ERR37, ERR46 and ERR47 correspond to the entries at
those indexes. Entry 33 ("spare error") is a placeholder. New messages must be
appended, never inserted, so that existing numbers keep their meaning. */
148 static const char *error_texts[] = {
149 "no error",
150 "\\ at end of pattern",
151 "\\c at end of pattern",
152 "unrecognized character follows \\",
153 "numbers out of order in {} quantifier",
154 /* 5 */
155 "number too big in {} quantifier",
156 "missing terminating ] for character class",
157 "invalid escape sequence in character class",
158 "range out of order in character class",
159 "nothing to repeat",
160 /* 10 */
161 "operand of unlimited repeat could match the empty string",
162 "internal error: unexpected repeat",
163 "unrecognized character after (?",
164 "POSIX named classes are supported only within a class",
165 "missing )",
166 /* 15 */
167 "reference to non-existent subpattern",
168 "erroffset passed as NULL",
169 "unknown option bit(s) set",
170 "missing ) after comment",
171 "parentheses nested too deeply",
172 /* 20 */
173 "regular expression too large",
174 "failed to get memory",
175 "unmatched parentheses",
176 "internal error: code overflow",
177 "unrecognized character after (?<",
178 /* 25 */
179 "lookbehind assertion is not fixed length",
180 "malformed number after (?(",
181 "conditional group contains more than two branches",
182 "assertion expected after (?(",
183 "(?R or (?digits must be followed by )",
184 /* 30 */
185 "unknown POSIX class name",
186 "POSIX collating elements are not supported",
187 "this version of PCRE is not compiled with PCRE_UTF8 support",
188 "spare error",
189 "character value in \\x{...} sequence is too large",
190 /* 35 */
191 "invalid condition (?(0)",
192 "\\C not allowed in lookbehind assertion",
193 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
194 "number after (?C is > 255",
195 "closing ) for (?C expected",
196 /* 40 */
197 "recursive call could loop indefinitely",
198 "unrecognized character after (?P",
199 "syntax error after (?P",
200 "two named groups have the same name",
201 "invalid UTF-8 string",
202 /* 45 */
203 "support for \\P, \\p, and \\X has not been compiled",
204 "malformed \\P or \\p sequence",
205 "unknown property name after \\P or \\p"
206 };
207
208
209 /* Table to identify digits and hex digits. This is used when compiling
210 patterns. Note that the tables in chartables are dependent on the locale, and
211 may mark arbitrary characters as digits - but the PCRE compiling code expects
212 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
213 a private table here. It costs 256 bytes, but it is a lot faster than doing
214 character value tests (at least in some simple cases I timed), and in some
215 applications one wants PCRE to compile efficiently as well as match
216 efficiently.
217
218 For convenience, we use the same bit definitions as in chartables:
219
220 0x04 decimal digit
221 0x08 hexadecimal digit
222
223 Then we can use ctype_digit and ctype_xdigit in the code. */
224
225 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII digit table, indexed by character value (256 entries). '0'-'9' hold
0x0c (decimal digit 0x04 plus hex digit 0x08); 'A'-'F' and 'a'-'f' hold 0x08
(hex digit only); every other entry is zero. */
226 static const unsigned char digitab[] =
227 {
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
234 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
235 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
236 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
237 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
240 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
260
261 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC digit table, indexed by character value (256 entries). Codes
0xF0-0xF9 hold 0x0c (decimal plus hex digit); codes 0x81-0x86 and 0xC1-0xC6
hold 0x08 (hex digit only); every other entry is zero. The trailing hex number
in some row comments is the code of the row's first entry. */
262 static const unsigned char digitab[] =
263 {
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
280 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
288 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
294 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
295 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
296
/* EBCDIC-only character-type table, indexed by character value. In this file
it is read only by check_escape(), which masks an entry with 0x0E to decide
whether the character after a backslash is alphameric. The meaning of the
individual bits is defined by the chartables this partially duplicates and is
not visible here. */
297 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
298 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
299 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
300 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
302 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
306 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
307 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
309 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
311 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
314 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
315 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
316 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
317 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
318 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
319 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
320 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
321 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
322 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
323 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
324 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
325 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
326 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
327 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
328 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
329 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
330 #endif
331
332
333 /* Forward declaration to allow mutual recursion */
334
335 static BOOL
336 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
337 int *, int *, branch_chain *, compile_data *);
338
339
340
341 /*************************************************
342 * Handle escapes *
343 *************************************************/
344
345 /* This function is called when a \ has been encountered. It either returns a
346 positive value for a simple escape such as \n, or a negative value which
347 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
348 a positive value greater than 255 may be returned. On entry, ptr is pointing at
349 the \. On exit, it is on the final character of the escape sequence.
350
351 Arguments:
352 ptrptr points to the pattern position pointer
353 errorcodeptr points to the errorcode variable
354 bracount number of previous extracting brackets
355 options the options bits
356 isclass TRUE if inside a character class
357
358 Returns: zero or positive => a data character
359 negative => a special escape sequence
360 on error, errorptr is set
361 */
362
363 static int
364 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
365 int options, BOOL isclass)
366 {
367 const uschar *ptr = *ptrptr;
368 int c, i;
369
370 /* If backslash is at the end of the pattern, it's an error. */
371
372 c = *(++ptr);
373 if (c == 0) *errorcodeptr = ERR1;
374
375 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
376 a table. A non-zero result is something that can be returned immediately.
377 Otherwise further processing may be required. */
378
379 #if !EBCDIC /* ASCII coding */
380 else if (c < '0' || c > 'z') {} /* Not alphameric */
381 else if ((i = escapes[c - '0']) != 0) c = i;
382
383 #else /* EBCDIC coding */
384 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
385 else if ((i = escapes[c - 0x48]) != 0) c = i;
386 #endif
387
388 /* Escapes that need further processing, or are illegal. */
389
390 else
391 {
392 const uschar *oldptr;
393 switch (c)
394 {
395 /* A number of Perl escapes are not handled by PCRE. We give an explicit
396 error. */
397
398 case 'l':
399 case 'L':
400 case 'N':
401 case 'u':
402 case 'U':
403 *errorcodeptr = ERR37;
404 break;
405
406 /* The handling of escape sequences consisting of a string of digits
407 starting with one that is not zero is not straightforward. By experiment,
408 the way Perl works seems to be as follows:
409
410 Outside a character class, the digits are read as a decimal number. If the
411 number is less than 10, or if there are that many previous extracting
412 left brackets, then it is a back reference. Otherwise, up to three octal
413 digits are read to form an escaped byte. Thus \123 is likely to be octal
414 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
415 value is greater than 377, the least significant 8 bits are taken. Inside a
416 character class, \ followed by a digit is always an octal number. */
417
418 case '1': case '2': case '3': case '4': case '5':
419 case '6': case '7': case '8': case '9':
420
421 if (!isclass)
422 {
423 oldptr = ptr;
424 c -= '0';
425 while ((digitab[ptr[1]] & ctype_digit) != 0)
426 c = c * 10 + *(++ptr) - '0';
427 if (c < 10 || c <= bracount)
428 {
429 c = -(ESC_REF + c);
430 break;
431 }
432 ptr = oldptr; /* Put the pointer back and fall through */
433 }
434
435 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
436 generates a binary zero byte and treats the digit as a following literal.
437 Thus we have to pull back the pointer by one. */
438
439 if ((c = *ptr) >= '8')
440 {
441 ptr--;
442 c = 0;
443 break;
444 }
445
446 /* \0 always starts an octal number, but we may drop through to here with a
447 larger first octal digit. */
448
449 case '0':
450 c -= '0';
451 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
452 c = c * 8 + *(++ptr) - '0';
453 c &= 255; /* Take least significant 8 bits */
454 break;
455
456 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
457 which can be greater than 0xff, but only if the ddd are hex digits. */
458
459 case 'x':
460 #ifdef SUPPORT_UTF8
461 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
462 {
463 const uschar *pt = ptr + 2;
464 register int count = 0;
465 c = 0;
466 while ((digitab[*pt] & ctype_xdigit) != 0)
467 {
468 int cc = *pt++;
469 count++;
470 #if !EBCDIC /* ASCII coding */
471 if (cc >= 'a') cc -= 32; /* Convert to upper case */
472 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
473 #else /* EBCDIC coding */
474 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
475 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
476 #endif
477 }
478 if (*pt == '}')
479 {
480 if (c < 0 || count > 8) *errorcodeptr = ERR34;
481 ptr = pt;
482 break;
483 }
484 /* If the sequence of hex digits does not end with '}', then we don't
485 recognize this construct; fall through to the normal \x handling. */
486 }
487 #endif
488
489 /* Read just a single hex char */
490
491 c = 0;
492 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
493 {
494 int cc; /* Some compilers don't like ++ */
495 cc = *(++ptr); /* in initializers */
496 #if !EBCDIC /* ASCII coding */
497 if (cc >= 'a') cc -= 32; /* Convert to upper case */
498 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
499 #else /* EBCDIC coding */
500 if (cc <= 'z') cc += 64; /* Convert to upper case */
501 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
502 #endif
503 }
504 break;
505
506 /* Other special escapes not starting with a digit are straightforward */
507
508 case 'c':
509 c = *(++ptr);
510 if (c == 0)
511 {
512 *errorcodeptr = ERR2;
513 return 0;
514 }
515
516 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
517 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
518 (However, an EBCDIC equivalent has now been added.) */
519
520 #if !EBCDIC /* ASCII coding */
521 if (c >= 'a' && c <= 'z') c -= 32;
522 c ^= 0x40;
523 #else /* EBCDIC coding */
524 if (c >= 'a' && c <= 'z') c += 64;
525 c ^= 0xC0;
526 #endif
527 break;
528
529 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
530 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
531 for Perl compatibility, it is a literal. This code looks a bit odd, but
532 there used to be some cases other than the default, and there may be again
533 in future, so I haven't "optimized" it. */
534
535 default:
536 if ((options & PCRE_EXTRA) != 0) switch(c)
537 {
538 default:
539 *errorcodeptr = ERR3;
540 break;
541 }
542 break;
543 }
544 }
545
546 *ptrptr = ptr;
547 return c;
548 }
549
550
551
552 #ifdef SUPPORT_UCP
553 /*************************************************
554 * Handle \P and \p *
555 *************************************************/
556
557 /* This function is called after \P or \p has been encountered, provided that
558 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
559 pointing at the P or p. On exit, it is pointing at the final character of the
560 escape sequence.
561
562 Argument:
563 ptrptr points to the pattern position pointer
564 negptr points to a boolean that is set TRUE for negation else FALSE
565 errorcodeptr points to the error code variable
566
567 Returns: value from ucp_type_table, or -1 for an invalid type
568 */
569
570 static int
571 get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
572 {
573 int c, i, bot, top;
574 const uschar *ptr = *ptrptr;
575 char name[4];
576
577 c = *(++ptr);
578 if (c == 0) goto ERROR_RETURN;
579
580 *negptr = FALSE;
581
582 /* \P or \p can be followed by a one- or two-character name in {}, optionally
583 preceded by ^ for negation. */
584
585 if (c == '{')
586 {
587 if (ptr[1] == '^')
588 {
589 *negptr = TRUE;
590 ptr++;
591 }
592 for (i = 0; i <= 2; i++)
593 {
594 c = *(++ptr);
595 if (c == 0) goto ERROR_RETURN;
596 if (c == '}') break;
597 name[i] = c;
598 }
599 if (c !='}') /* Try to distinguish error cases */
600 {
601 while (*(++ptr) != 0 && *ptr != '}');
602 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
603 }
604 name[i] = 0;
605 }
606
607 /* Otherwise there is just one following character */
608
609 else
610 {
611 name[0] = c;
612 name[1] = 0;
613 }
614
615 *ptrptr = ptr;
616
617 /* Search for a recognized property name using binary chop */
618
619 bot = 0;
620 top = _pcre_utt_size;
621
622 while (bot < top)
623 {
624 i = (bot + top)/2;
625 c = strcmp(name, _pcre_utt[i].name);
626 if (c == 0) return _pcre_utt[i].value;
627 if (c > 0) bot = i + 1; else top = i;
628 }
629
630 UNKNOWN_RETURN:
631 *errorcodeptr = ERR47;
632 *ptrptr = ptr;
633 return -1;
634
635 ERROR_RETURN:
636 *errorcodeptr = ERR46;
637 *ptrptr = ptr;
638 return -1;
639 }
640 #endif
641
642
643
644
645 /*************************************************
646 * Check for counted repeat *
647 *************************************************/
648
649 /* This function is called when a '{' is encountered in a place where it might
650 start a quantifier. It looks ahead to see if it really is a quantifier or not.
651 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
652 where the ddds are digits.
653
654 Arguments:
655 p pointer to the first char after '{'
656
657 Returns: TRUE or FALSE
658 */
659
660 static BOOL
661 is_counted_repeat(const uschar *p)
662 {
663 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
664 while ((digitab[*p] & ctype_digit) != 0) p++;
665 if (*p == '}') return TRUE;
666
667 if (*p++ != ',') return FALSE;
668 if (*p == '}') return TRUE;
669
670 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
671 while ((digitab[*p] & ctype_digit) != 0) p++;
672
673 return (*p == '}');
674 }
675
676
677
678 /*************************************************
679 * Read repeat counts *
680 *************************************************/
681
682 /* Read an item of the form {n,m} and return the values. This is called only
683 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
684 so the syntax is guaranteed to be correct, but we need to check the values.
685
686 Arguments:
687 p pointer to first char after '{'
688 minp pointer to int for min
689 maxp pointer to int for max
690 returned as -1 if no max
691 errorcodeptr points to error code variable
692
693 Returns: pointer to '}' on success;
694 current ptr on error, with errorcodeptr set non-zero
695 */
696
697 static const uschar *
698 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
699 {
700 int min = 0;
701 int max = -1;
702
703 /* Read the minimum value and do a paranoid check: a negative value indicates
704 an integer overflow. */
705
706 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
707 if (min < 0 || min > 65535)
708 {
709 *errorcodeptr = ERR5;
710 return p;
711 }
712
713 /* Read the maximum value if there is one, and again do a paranoid on its size.
714 Also, max must not be less than min. */
715
716 if (*p == '}') max = min; else
717 {
718 if (*(++p) != '}')
719 {
720 max = 0;
721 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
722 if (max < 0 || max > 65535)
723 {
724 *errorcodeptr = ERR5;
725 return p;
726 }
727 if (max < min)
728 {
729 *errorcodeptr = ERR4;
730 return p;
731 }
732 }
733 }
734
735 /* Fill in the required variables, and pass back the pointer to the terminating
736 '}'. */
737
738 *minp = min;
739 *maxp = max;
740 return p;
741 }
742
743
744
745 /*************************************************
746 * Find first significant op code *
747 *************************************************/
748
749 /* This is called by several functions that scan a compiled expression looking
750 for a fixed first character, or an anchoring op code etc. It skips over things
751 that do not influence this. For some calls, a change of option is important.
752 For some calls, it makes sense to skip negative forward and all backward
753 assertions, and also the \b assertion; for others it does not.
754
755 Arguments:
756 code pointer to the start of the group
757 options pointer to external options
758 optbit the option bit whose changing is significant, or
759 zero if none are
760 skipassert TRUE if certain assertions are to be skipped
761
762 Returns: pointer to the first significant opcode
763 */
764
765 static const uschar*
766 first_significant_code(const uschar *code, int *options, int optbit,
767 BOOL skipassert)
768 {
769 for (;;)
770 {
771 switch ((int)*code)
772 {
773 case OP_OPT:
774 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
775 *options = (int)code[1];
776 code += 2;
777 break;
778
779 case OP_ASSERT_NOT:
780 case OP_ASSERTBACK:
781 case OP_ASSERTBACK_NOT:
782 if (!skipassert) return code;
783 do code += GET(code, 1); while (*code == OP_ALT);
784 code += _pcre_OP_lengths[*code];
785 break;
786
787 case OP_WORD_BOUNDARY:
788 case OP_NOT_WORD_BOUNDARY:
789 if (!skipassert) return code;
790 /* Fall through */
791
792 case OP_CALLOUT:
793 case OP_CREF:
794 case OP_BRANUMBER:
795 code += _pcre_OP_lengths[*code];
796 break;
797
798 default:
799 return code;
800 }
801 }
802 /* Control never reaches here */
803 }
804
805
806
807
808 /*************************************************
809 * Find the fixed length of a pattern *
810 *************************************************/
811
812 /* Scan a pattern and compute the fixed length of subject that will match it,
813 if the length is fixed. This is needed for dealing with backward assertions.
814 In UTF8 mode, the result is in characters rather than bytes.
815
816 Arguments:
817 code points to the start of the pattern (the bracket)
818 options the compiling options
819
820 Returns: the fixed length, or -1 if there is no fixed length,
821 or -2 if \C was encountered
822 */
823
824 static int
825 find_fixedlength(uschar *code, int options)
826 {
827 int length = -1;
828
829 register int branchlength = 0;
830 register uschar *cc = code + 1 + LINK_SIZE;
831
832 /* Scan along the opcodes for this branch. If we get to the end of the
833 branch, check the length against that of the other branches. */
834
835 for (;;)
836 {
837 int d;
838 register int op = *cc;
839 if (op >= OP_BRA) op = OP_BRA;
840
841 switch (op)
842 {
843 case OP_BRA:
844 case OP_ONCE:
845 case OP_COND:
846 d = find_fixedlength(cc, options);
847 if (d < 0) return d;
848 branchlength += d;
849 do cc += GET(cc, 1); while (*cc == OP_ALT);
850 cc += 1 + LINK_SIZE;
851 break;
852
853 /* Reached end of a branch; if it's a ket it is the end of a nested
854 call. If it's ALT it is an alternation in a nested call. If it is
855 END it's the end of the outer call. All can be handled by the same code. */
856
857 case OP_ALT:
858 case OP_KET:
859 case OP_KETRMAX:
860 case OP_KETRMIN:
861 case OP_END:
862 if (length < 0) length = branchlength;
863 else if (length != branchlength) return -1;
864 if (*cc != OP_ALT) return length;
865 cc += 1 + LINK_SIZE;
866 branchlength = 0;
867 break;
868
869 /* Skip over assertive subpatterns */
870
871 case OP_ASSERT:
872 case OP_ASSERT_NOT:
873 case OP_ASSERTBACK:
874 case OP_ASSERTBACK_NOT:
875 do cc += GET(cc, 1); while (*cc == OP_ALT);
876 /* Fall through */
877
878 /* Skip over things that don't match chars */
879
880 case OP_REVERSE:
881 case OP_BRANUMBER:
882 case OP_CREF:
883 case OP_OPT:
884 case OP_CALLOUT:
885 case OP_SOD:
886 case OP_SOM:
887 case OP_EOD:
888 case OP_EODN:
889 case OP_CIRC:
890 case OP_DOLL:
891 case OP_NOT_WORD_BOUNDARY:
892 case OP_WORD_BOUNDARY:
893 cc += _pcre_OP_lengths[*cc];
894 break;
895
896 /* Handle literal characters */
897
898 case OP_CHAR:
899 case OP_CHARNC:
900 branchlength++;
901 cc += 2;
902 #ifdef SUPPORT_UTF8
903 if ((options & PCRE_UTF8) != 0)
904 {
905 while ((*cc & 0xc0) == 0x80) cc++;
906 }
907 #endif
908 break;
909
910 /* Handle exact repetitions. The count is already in characters, but we
911 need to skip over a multibyte character in UTF8 mode. */
912
913 case OP_EXACT:
914 branchlength += GET2(cc,1);
915 cc += 4;
916 #ifdef SUPPORT_UTF8
917 if ((options & PCRE_UTF8) != 0)
918 {
919 while((*cc & 0x80) == 0x80) cc++;
920 }
921 #endif
922 break;
923
924 case OP_TYPEEXACT:
925 branchlength += GET2(cc,1);
926 cc += 4;
927 break;
928
929 /* Handle single-char matchers */
930
931 case OP_PROP:
932 case OP_NOTPROP:
933 cc++;
934 /* Fall through */
935
936 case OP_NOT_DIGIT:
937 case OP_DIGIT:
938 case OP_NOT_WHITESPACE:
939 case OP_WHITESPACE:
940 case OP_NOT_WORDCHAR:
941 case OP_WORDCHAR:
942 case OP_ANY:
943 branchlength++;
944 cc++;
945 break;
946
947 /* The single-byte matcher isn't allowed */
948
949 case OP_ANYBYTE:
950 return -2;
951
952 /* Check a class for variable quantification */
953
954 #ifdef SUPPORT_UTF8
955 case OP_XCLASS:
956 cc += GET(cc, 1) - 33;
957 /* Fall through */
958 #endif
959
960 case OP_CLASS:
961 case OP_NCLASS:
962 cc += 33;
963
964 switch (*cc)
965 {
966 case OP_CRSTAR:
967 case OP_CRMINSTAR:
968 case OP_CRQUERY:
969 case OP_CRMINQUERY:
970 return -1;
971
972 case OP_CRRANGE:
973 case OP_CRMINRANGE:
974 if (GET2(cc,1) != GET2(cc,3)) return -1;
975 branchlength += GET2(cc,1);
976 cc += 5;
977 break;
978
979 default:
980 branchlength++;
981 }
982 break;
983
984 /* Anything else is variable length */
985
986 default:
987 return -1;
988 }
989 }
990 /* Control never gets here */
991 }
992
993
994
995
996 /*************************************************
997 * Scan compiled regex for numbered bracket *
998 *************************************************/
999
1000 /* This little function scans through a compiled pattern until it finds a
1001 capturing bracket with the given number.
1002
1003 Arguments:
1004 code points to start of expression
1005 utf8 TRUE in UTF-8 mode
1006 number the required bracket number
1007
1008 Returns: pointer to the opcode for the bracket, or NULL if not found
1009 */
1010
1011 static const uschar *
1012 find_bracket(const uschar *code, BOOL utf8, int number)
1013 {
1014 #ifndef SUPPORT_UTF8
1015 utf8 = utf8; /* Stop pedantic compilers complaining */
1016 #endif
1017
1018 for (;;)
1019 {
1020 register int c = *code;
1021 if (c == OP_END) return NULL;
1022 else if (c > OP_BRA)
1023 {
1024 int n = c - OP_BRA;
1025 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1026 if (n == number) return (uschar *)code;
1027 code += _pcre_OP_lengths[OP_BRA];
1028 }
1029 else
1030 {
1031 code += _pcre_OP_lengths[c];
1032
1033 #ifdef SUPPORT_UTF8
1034
1035 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1036 by a multi-byte character. The length in the table is a minimum, so we have
1037 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1038 can use relatively efficient code. */
1039
1040 if (utf8) switch(c)
1041 {
1042 case OP_CHAR:
1043 case OP_CHARNC:
1044 case OP_EXACT:
1045 case OP_UPTO:
1046 case OP_MINUPTO:
1047 case OP_STAR:
1048 case OP_MINSTAR:
1049 case OP_PLUS:
1050 case OP_MINPLUS:
1051 case OP_QUERY:
1052 case OP_MINQUERY:
1053 while ((*code & 0xc0) == 0x80) code++;
1054 break;
1055
1056 /* XCLASS is used for classes that cannot be represented just by a bit
1057 map. This includes negated single high-valued characters. The length in
1058 the table is zero; the actual length is stored in the compiled code. */
1059
1060 case OP_XCLASS:
1061 code += GET(code, 1) + 1;
1062 break;
1063 }
1064 #endif
1065 }
1066 }
1067 }
1068
1069
1070
1071 /*************************************************
1072 * Scan compiled regex for recursion reference *
1073 *************************************************/
1074
1075 /* This little function scans through a compiled pattern until it finds an
1076 instance of OP_RECURSE.
1077
1078 Arguments:
1079 code points to start of expression
1080 utf8 TRUE in UTF-8 mode
1081
1082 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1083 */
1084
1085 static const uschar *
1086 find_recurse(const uschar *code, BOOL utf8)
1087 {
1088 #ifndef SUPPORT_UTF8
1089 utf8 = utf8; /* Stop pedantic compilers complaining */
1090 #endif
1091
1092 for (;;)
1093 {
1094 register int c = *code;
1095 if (c == OP_END) return NULL;
1096 else if (c == OP_RECURSE) return code;
1097 else if (c > OP_BRA)
1098 {
1099 code += _pcre_OP_lengths[OP_BRA];
1100 }
1101 else
1102 {
1103 code += _pcre_OP_lengths[c];
1104
1105 #ifdef SUPPORT_UTF8
1106
1107 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1108 by a multi-byte character. The length in the table is a minimum, so we have
1109 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1110 can use relatively efficient code. */
1111
1112 if (utf8) switch(c)
1113 {
1114 case OP_CHAR:
1115 case OP_CHARNC:
1116 case OP_EXACT:
1117 case OP_UPTO:
1118 case OP_MINUPTO:
1119 case OP_STAR:
1120 case OP_MINSTAR:
1121 case OP_PLUS:
1122 case OP_MINPLUS:
1123 case OP_QUERY:
1124 case OP_MINQUERY:
1125 while ((*code & 0xc0) == 0x80) code++;
1126 break;
1127
1128 /* XCLASS is used for classes that cannot be represented just by a bit
1129 map. This includes negated single high-valued characters. The length in
1130 the table is zero; the actual length is stored in the compiled code. */
1131
1132 case OP_XCLASS:
1133 code += GET(code, 1) + 1;
1134 break;
1135 }
1136 #endif
1137 }
1138 }
1139 }
1140
1141
1142
1143 /*************************************************
1144 * Scan compiled branch for non-emptiness *
1145 *************************************************/
1146
1147 /* This function scans through a branch of a compiled pattern to see whether it
1148 can match the empty string or not. It is called only from could_be_empty()
1149 below. Note that first_significant_code() skips over assertions. If we hit an
1150 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1151 whose current branch will already have been scanned.
1152
1153 Arguments:
1154 code points to start of search
1155 endcode points to where to stop
1156 utf8 TRUE if in UTF8 mode
1157
1158 Returns: TRUE if what is matched could be empty
1159 */
1160
1161 static BOOL
1162 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1163 {
1164 register int c;
1165 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1166 code < endcode;
1167 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1168 {
1169 const uschar *ccode;
1170
1171 c = *code;
1172
1173 if (c >= OP_BRA)
1174 {
1175 BOOL empty_branch;
1176 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1177
1178 /* Scan a closed bracket */
1179
1180 empty_branch = FALSE;
1181 do
1182 {
1183 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1184 empty_branch = TRUE;
1185 code += GET(code, 1);
1186 }
1187 while (*code == OP_ALT);
1188 if (!empty_branch) return FALSE; /* All branches are non-empty */
1189 code += 1 + LINK_SIZE;
1190 c = *code;
1191 }
1192
1193 else switch (c)
1194 {
1195 /* Check for quantifiers after a class */
1196
1197 #ifdef SUPPORT_UTF8
1198 case OP_XCLASS:
1199 ccode = code + GET(code, 1);
1200 goto CHECK_CLASS_REPEAT;
1201 #endif
1202
1203 case OP_CLASS:
1204 case OP_NCLASS:
1205 ccode = code + 33;
1206
1207 #ifdef SUPPORT_UTF8
1208 CHECK_CLASS_REPEAT:
1209 #endif
1210
1211 switch (*ccode)
1212 {
1213 case OP_CRSTAR: /* These could be empty; continue */
1214 case OP_CRMINSTAR:
1215 case OP_CRQUERY:
1216 case OP_CRMINQUERY:
1217 break;
1218
1219 default: /* Non-repeat => class must match */
1220 case OP_CRPLUS: /* These repeats aren't empty */
1221 case OP_CRMINPLUS:
1222 return FALSE;
1223
1224 case OP_CRRANGE:
1225 case OP_CRMINRANGE:
1226 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1227 break;
1228 }
1229 break;
1230
1231 /* Opcodes that must match a character */
1232
1233 case OP_PROP:
1234 case OP_NOTPROP:
1235 case OP_EXTUNI:
1236 case OP_NOT_DIGIT:
1237 case OP_DIGIT:
1238 case OP_NOT_WHITESPACE:
1239 case OP_WHITESPACE:
1240 case OP_NOT_WORDCHAR:
1241 case OP_WORDCHAR:
1242 case OP_ANY:
1243 case OP_ANYBYTE:
1244 case OP_CHAR:
1245 case OP_CHARNC:
1246 case OP_NOT:
1247 case OP_PLUS:
1248 case OP_MINPLUS:
1249 case OP_EXACT:
1250 case OP_NOTPLUS:
1251 case OP_NOTMINPLUS:
1252 case OP_NOTEXACT:
1253 case OP_TYPEPLUS:
1254 case OP_TYPEMINPLUS:
1255 case OP_TYPEEXACT:
1256 return FALSE;
1257
1258 /* End of branch */
1259
1260 case OP_KET:
1261 case OP_KETRMAX:
1262 case OP_KETRMIN:
1263 case OP_ALT:
1264 return TRUE;
1265
1266 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1267 followed by a multibyte character */
1268
1269 #ifdef SUPPORT_UTF8
1270 case OP_STAR:
1271 case OP_MINSTAR:
1272 case OP_QUERY:
1273 case OP_MINQUERY:
1274 case OP_UPTO:
1275 case OP_MINUPTO:
1276 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1277 break;
1278 #endif
1279 }
1280 }
1281
1282 return TRUE;
1283 }
1284
1285
1286
1287 /*************************************************
1288 * Scan compiled regex for non-emptiness *
1289 *************************************************/
1290
1291 /* This function is called to check for left recursive calls. We want to check
1292 the current branch of the current pattern to see if it could match the empty
1293 string. If it could, we must look outwards for branches at other levels,
1294 stopping when we pass beyond the bracket which is the subject of the recursion.
1295
1296 Arguments:
1297 code points to start of the recursion
1298 endcode points to where to stop (current RECURSE item)
1299 bcptr points to the chain of current (unclosed) branch starts
1300 utf8 TRUE if in UTF-8 mode
1301
1302 Returns: TRUE if what is matched could be empty
1303 */
1304
1305 static BOOL
1306 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1307 BOOL utf8)
1308 {
1309 while (bcptr != NULL && bcptr->current >= code)
1310 {
1311 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1312 bcptr = bcptr->outer;
1313 }
1314 return TRUE;
1315 }
1316
1317
1318
1319 /*************************************************
1320 * Check for POSIX class syntax *
1321 *************************************************/
1322
1323 /* This function is called when the sequence "[:" or "[." or "[=" is
1324 encountered in a character class. It checks whether this is followed by an
1325 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1326 ".]" or "=]".
1327
1328 Argument:
1329 ptr pointer to the initial [
1330 endptr where to return the end pointer
1331 cd pointer to compile data
1332
1333 Returns: TRUE or FALSE
1334 */
1335
1336 static BOOL
1337 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1338 {
1339 int terminator; /* Don't combine these lines; the Solaris cc */
1340 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1341 if (*(++ptr) == '^') ptr++;
1342 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1343 if (*ptr == terminator && ptr[1] == ']')
1344 {
1345 *endptr = ptr;
1346 return TRUE;
1347 }
1348 return FALSE;
1349 }
1350
1351
1352
1353
1354 /*************************************************
1355 * Check POSIX class name *
1356 *************************************************/
1357
1358 /* This function is called to check the name given in a POSIX-style class entry
1359 such as [:alnum:].
1360
1361 Arguments:
1362 ptr points to the first letter
1363 len the length of the name
1364
1365 Returns: a value representing the name, or -1 if unknown
1366 */
1367
1368 static int
1369 check_posix_name(const uschar *ptr, int len)
1370 {
1371 register int yield = 0;
1372 while (posix_name_lengths[yield] != 0)
1373 {
1374 if (len == posix_name_lengths[yield] &&
1375 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1376 yield++;
1377 }
1378 return -1;
1379 }
1380
1381
1382 /*************************************************
1383 * Adjust OP_RECURSE items in repeated group *
1384 *************************************************/
1385
1386 /* OP_RECURSE items contain an offset from the start of the regex to the group
1387 that is referenced. This means that groups can be replicated for fixed
1388 repetition simply by copying (because the recursion is allowed to refer to
1389 earlier groups that are outside the current group). However, when a group is
1390 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1391 it, after it has been compiled. This means that any OP_RECURSE items within it
1392 that refer to the group itself or any contained groups have to have their
1393 offsets adjusted. That is the job of this function. Before it is called, the
1394 partially compiled regex must be temporarily terminated with OP_END.
1395
1396 Arguments:
1397 group points to the start of the group
1398 adjust the amount by which the group is to be moved
1399 utf8 TRUE in UTF-8 mode
1400 cd contains pointers to tables etc.
1401
1402 Returns: nothing
1403 */
1404
1405 static void
1406 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1407 {
1408 uschar *ptr = group;
1409 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1410 {
1411 int offset = GET(ptr, 1);
1412 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1413 ptr += 1 + LINK_SIZE;
1414 }
1415 }
1416
1417
1418
1419 /*************************************************
1420 * Insert an automatic callout point *
1421 *************************************************/
1422
1423 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1424 callout points before each pattern item.
1425
1426 Arguments:
1427 code current code pointer
1428 ptr current pattern pointer
1429 cd pointers to tables etc
1430
1431 Returns: new code pointer
1432 */
1433
1434 static uschar *
1435 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1436 {
1437 *code++ = OP_CALLOUT;
1438 *code++ = 255;
1439 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1440 PUT(code, LINK_SIZE, 0); /* Default length */
1441 return code + 2*LINK_SIZE;
1442 }
1443
1444
1445
1446 /*************************************************
1447 * Complete a callout item *
1448 *************************************************/
1449
1450 /* A callout item contains the length of the next item in the pattern, which
1451 we can't fill in till after we have reached the relevant point. This is used
1452 for both automatic and manual callouts.
1453
1454 Arguments:
1455 previous_callout points to previous callout item
1456 ptr current pattern pointer
1457 cd pointers to tables etc
1458
1459 Returns: nothing
1460 */
1461
1462 static void
1463 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1464 {
1465 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1466 PUT(previous_callout, 2 + LINK_SIZE, length);
1467 }
1468
1469
1470
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given a class range running from *cptr to d, find the next run of
characters in it whose other-case values are consecutive. The search starts
at *cptr and looks for the first character that _pcre_ucp_findchar() reports
as ucp_L with a non-zero other case; the run is then extended for as long as
each following character is also ucp_L and its other case is exactly one more
than the previous one. Each call returns one such run and moves *cptr past
it, so repeated calls enumerate all the runs.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int c, chartype, othercase, next;

/* Find the first character that has an other case */

c = *cptr;
while (c <= d &&
       !(_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L &&
         othercase != 0))
  {
  c++;
  }

if (c > d) return FALSE;

*ocptr = othercase;
next = othercase + 1;

/* Extend the run while the other-case values stay consecutive */

c++;
while (c <= d &&
       _pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L &&
       othercase == next)
  {
  next++;
  c++;
  }

*odptr = next - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1520
1521
1522 /*************************************************
1523 * Compile one branch *
1524 *************************************************/
1525
1526 /* Scan the pattern, compiling it into the code vector. If the options are
1527 changed during the branch, the pointer is used to change the external options
1528 bits.
1529
1530 Arguments:
1531 optionsptr pointer to the option bits
1532 brackets points to number of extracting brackets used
1533 codeptr points to the pointer to the current code point
1534 ptrptr points to the current pattern pointer
1535 errorcodeptr points to error code variable
1536 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1537 reqbyteptr set to the last literal character required, else < 0
1538 bcptr points to current branch chain
1539 cd contains pointers to tables etc.
1540
1541 Returns: TRUE on success
1542 FALSE, with *errorcodeptr set non-zero on error
1543 */
1544
1545 static BOOL
1546 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1547 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1548 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1549 {
1550 int repeat_type, op_type;
1551 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1552 int bravalue = 0;
1553 int greedy_default, greedy_non_default;
1554 int firstbyte, reqbyte;
1555 int zeroreqbyte, zerofirstbyte;
1556 int req_caseopt, reqvary, tempreqvary;
1557 int condcount = 0;
1558 int options = *optionsptr;
1559 int after_manual_callout = 0;
1560 register int c;
1561 register uschar *code = *codeptr;
1562 uschar *tempcode;
1563 BOOL inescq = FALSE;
1564 BOOL groupsetfirstbyte = FALSE;
1565 const uschar *ptr = *ptrptr;
1566 const uschar *tempptr;
1567 uschar *previous = NULL;
1568 uschar *previous_callout = NULL;
1569 uschar classbits[32];
1570
1571 #ifdef SUPPORT_UTF8
1572 BOOL class_utf8;
1573 BOOL utf8 = (options & PCRE_UTF8) != 0;
1574 uschar *class_utf8data;
1575 uschar utf8_char[6];
1576 #else
1577 BOOL utf8 = FALSE;
1578 #endif
1579
1580 /* Set up the default and non-default settings for greediness */
1581
1582 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1583 greedy_non_default = greedy_default ^ 1;
1584
1585 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1586 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1587 matches a non-fixed char first char; reqbyte just remains unset if we never
1588 find one.
1589
1590 When we hit a repeat whose minimum is zero, we may have to adjust these values
1591 to take the zero repeat into account. This is implemented by setting them to
1592 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1593 item types that can be repeated set these backoff variables appropriately. */
1594
1595 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1596
1597 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1598 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1599 value > 255. It is added into the firstbyte or reqbyte variables to record the
1600 case status of the value. This is used only for ASCII characters. */
1601
1602 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1603
1604 /* Switch on next character until the end of the branch */
1605
1606 for (;; ptr++)
1607 {
1608 BOOL negate_class;
1609 BOOL possessive_quantifier;
1610 BOOL is_quantifier;
1611 int class_charcount;
1612 int class_lastchar;
1613 int newoptions;
1614 int recno;
1615 int skipbytes;
1616 int subreqbyte;
1617 int subfirstbyte;
1618 int mclength;
1619 uschar mcbuffer[8];
1620
1621 /* Next byte in the pattern */
1622
1623 c = *ptr;
1624
1625 /* If in \Q...\E, check for the end; if not, we have a literal */
1626
1627 if (inescq && c != 0)
1628 {
1629 if (c == '\\' && ptr[1] == 'E')
1630 {
1631 inescq = FALSE;
1632 ptr++;
1633 continue;
1634 }
1635 else
1636 {
1637 if (previous_callout != NULL)
1638 {
1639 complete_callout(previous_callout, ptr, cd);
1640 previous_callout = NULL;
1641 }
1642 if ((options & PCRE_AUTO_CALLOUT) != 0)
1643 {
1644 previous_callout = code;
1645 code = auto_callout(code, ptr, cd);
1646 }
1647 goto NORMAL_CHAR;
1648 }
1649 }
1650
1651 /* Fill in length of a previous callout, except when the next thing is
1652 a quantifier. */
1653
1654 is_quantifier = c == '*' || c == '+' || c == '?' ||
1655 (c == '{' && is_counted_repeat(ptr+1));
1656
1657 if (!is_quantifier && previous_callout != NULL &&
1658 after_manual_callout-- <= 0)
1659 {
1660 complete_callout(previous_callout, ptr, cd);
1661 previous_callout = NULL;
1662 }
1663
1664 /* In extended mode, skip white space and comments */
1665
1666 if ((options & PCRE_EXTENDED) != 0)
1667 {
1668 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1669 if (c == '#')
1670 {
1671 /* The space before the ; is to avoid a warning on a silly compiler
1672 on the Macintosh. */
1673 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1674 if (c != 0) continue; /* Else fall through to handle end of string */
1675 }
1676 }
1677
1678 /* No auto callout for quantifiers. */
1679
1680 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1681 {
1682 previous_callout = code;
1683 code = auto_callout(code, ptr, cd);
1684 }
1685
1686 switch(c)
1687 {
1688 /* The branch terminates at end of string, |, or ). */
1689
1690 case 0:
1691 case '|':
1692 case ')':
1693 *firstbyteptr = firstbyte;
1694 *reqbyteptr = reqbyte;
1695 *codeptr = code;
1696 *ptrptr = ptr;
1697 return TRUE;
1698
1699 /* Handle single-character metacharacters. In multiline mode, ^ disables
1700 the setting of any following char as a first character. */
1701
1702 case '^':
1703 if ((options & PCRE_MULTILINE) != 0)
1704 {
1705 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1706 }
1707 previous = NULL;
1708 *code++ = OP_CIRC;
1709 break;
1710
1711 case '$':
1712 previous = NULL;
1713 *code++ = OP_DOLL;
1714 break;
1715
1716 /* There can never be a first char if '.' is first, whatever happens about
1717 repeats. The value of reqbyte doesn't change either. */
1718
1719 case '.':
1720 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1721 zerofirstbyte = firstbyte;
1722 zeroreqbyte = reqbyte;
1723 previous = code;
1724 *code++ = OP_ANY;
1725 break;
1726
1727 /* Character classes. If the included characters are all < 255 in value, we
1728 build a 32-byte bitmap of the permitted characters, except in the special
1729 case where there is only one such character. For negated classes, we build
1730 the map as usual, then invert it at the end. However, we use a different
1731 opcode so that data characters > 255 can be handled correctly.
1732
1733 If the class contains characters outside the 0-255 range, a different
1734 opcode is compiled. It may optionally have a bit map for characters < 256,
1735 but those above are explicitly listed afterwards. A flag byte tells
1736 whether the bitmap is present, and whether this is a negated class or not.
1737 */
1738
1739 case '[':
1740 previous = code;
1741
1742 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1743 they are encountered at the top level, so we'll do that too. */
1744
1745 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1746 check_posix_syntax(ptr, &tempptr, cd))
1747 {
1748 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1749 goto FAILED;
1750 }
1751
1752 /* If the first character is '^', set the negation flag and skip it. */
1753
1754 if ((c = *(++ptr)) == '^')
1755 {
1756 negate_class = TRUE;
1757 c = *(++ptr);
1758 }
1759 else
1760 {
1761 negate_class = FALSE;
1762 }
1763
1764 /* Keep a count of chars with values < 256 so that we can optimize the case
1765 of just a single character (as long as it's < 256). For higher valued UTF-8
1766 characters, we don't yet do any optimization. */
1767
1768 class_charcount = 0;
1769 class_lastchar = -1;
1770
1771 #ifdef SUPPORT_UTF8
1772 class_utf8 = FALSE; /* No chars >= 256 */
1773 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1774 #endif
1775
1776 /* Initialize the 32-char bit map to all zeros. We have to build the
1777 map in a temporary bit of store, in case the class contains only 1
1778 character (< 256), because in that case the compiled code doesn't use the
1779 bit map. */
1780
1781 memset(classbits, 0, 32 * sizeof(uschar));
1782
1783 /* Process characters until ] is reached. By writing this as a "do" it
1784 means that an initial ] is taken as a data character. The first pass
1785 through the regex checked the overall syntax, so we don't need to be very
1786 strict here. At the start of the loop, c contains the first byte of the
1787 character. */
1788
1789 do
1790 {
1791 #ifdef SUPPORT_UTF8
1792 if (utf8 && c > 127)
1793 { /* Braces are required because the */
1794 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1795 }
1796 #endif
1797
1798 /* Inside \Q...\E everything is literal except \E */
1799
1800 if (inescq)
1801 {
1802 if (c == '\\' && ptr[1] == 'E')
1803 {
1804 inescq = FALSE;
1805 ptr++;
1806 continue;
1807 }
1808 else goto LONE_SINGLE_CHARACTER;
1809 }
1810
1811 /* Handle POSIX class names. Perl allows a negation extension of the
1812 form [:^name:]. A square bracket that doesn't match the syntax is
1813 treated as a literal. We also recognize the POSIX constructions
1814 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1815 5.6 and 5.8 do. */
1816
1817 if (c == '[' &&
1818 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1819 check_posix_syntax(ptr, &tempptr, cd))
1820 {
1821 BOOL local_negate = FALSE;
1822 int posix_class, i;
1823 register const uschar *cbits = cd->cbits;
1824
1825 if (ptr[1] != ':')
1826 {
1827 *errorcodeptr = ERR31;
1828 goto FAILED;
1829 }
1830
1831 ptr += 2;
1832 if (*ptr == '^')
1833 {
1834 local_negate = TRUE;
1835 ptr++;
1836 }
1837
1838 posix_class = check_posix_name(ptr, tempptr - ptr);
1839 if (posix_class < 0)
1840 {
1841 *errorcodeptr = ERR30;
1842 goto FAILED;
1843 }
1844
1845 /* If matching is caseless, upper and lower are converted to
1846 alpha. This relies on the fact that the class table starts with
1847 alpha, lower, upper as the first 3 entries. */
1848
1849 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1850 posix_class = 0;
1851
1852 /* Or into the map we are building up to 3 of the static class
1853 tables, or their negations. The [:blank:] class sets up the same
1854 chars as the [:space:] class (all white space). We remove the vertical
1855 white space chars afterwards. */
1856
1857 posix_class *= 3;
1858 for (i = 0; i < 3; i++)
1859 {
1860 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1861 int taboffset = posix_class_maps[posix_class + i];
1862 if (taboffset < 0) break;
1863 if (local_negate)
1864 {
1865 if (i == 0)
1866 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1867 else
1868 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1869 if (blankclass) classbits[1] |= 0x3c;
1870 }
1871 else
1872 {
1873 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1874 if (blankclass) classbits[1] &= ~0x3c;
1875 }
1876 }
1877
1878 ptr = tempptr + 1;
1879 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1880 continue; /* End of POSIX syntax handling */
1881 }
1882
1883 /* Backslash may introduce a single character, or it may introduce one
1884 of the specials, which just set a flag. Escaped items are checked for
1885 validity in the pre-compiling pass. The sequence \b is a special case.
1886 Inside a class (and only there) it is treated as backspace. Elsewhere
1887 it marks a word boundary. Other escapes have preset maps ready to
1888 or into the one we are building. We assume they have more than one
1889 character in them, so set class_charcount bigger than one. */
1890
1891 if (c == '\\')
1892 {
1893 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1894
1895 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1896 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1897 else if (-c == ESC_Q) /* Handle start of quoted string */
1898 {
1899 if (ptr[1] == '\\' && ptr[2] == 'E')
1900 {
1901 ptr += 2; /* avoid empty string */
1902 }
1903 else inescq = TRUE;
1904 continue;
1905 }
1906
1907 if (c < 0)
1908 {
1909 register const uschar *cbits = cd->cbits;
1910 class_charcount += 2; /* Greater than 1 is what matters */
1911 switch (-c)
1912 {
1913 case ESC_d:
1914 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1915 continue;
1916
1917 case ESC_D:
1918 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1919 continue;
1920
1921 case ESC_w:
1922 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1923 continue;
1924
1925 case ESC_W:
1926 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1927 continue;
1928
1929 case ESC_s:
1930 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1931 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1932 continue;
1933
1934 case ESC_S:
1935 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1936 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1937 continue;
1938
1939 #ifdef SUPPORT_UCP
1940 case ESC_p:
1941 case ESC_P:
1942 {
1943 BOOL negated;
1944 int property = get_ucp(&ptr, &negated, errorcodeptr);
1945 if (property < 0) goto FAILED;
1946 class_utf8 = TRUE;
1947 *class_utf8data++ = ((-c == ESC_p) != negated)?
1948 XCL_PROP : XCL_NOTPROP;
1949 *class_utf8data++ = property;
1950 class_charcount -= 2; /* Not a < 256 character */
1951 }
1952 continue;
1953 #endif
1954
1955 /* Unrecognized escapes are faulted if PCRE is running in its
1956 strict mode. By default, for compatibility with Perl, they are
1957 treated as literals. */
1958
1959 default:
1960 if ((options & PCRE_EXTRA) != 0)
1961 {
1962 *errorcodeptr = ERR7;
1963 goto FAILED;
1964 }
1965 c = *ptr; /* The final character */
1966 class_charcount -= 2; /* Undo the default count from above */
1967 }
1968 }
1969
1970 /* Fall through if we have a single character (c >= 0). This may be
1971 > 256 in UTF-8 mode. */
1972
1973 } /* End of backslash handling */
1974
1975 /* A single character may be followed by '-' to form a range. However,
1976 Perl does not permit ']' to be the end of the range. A '-' character
1977 here is treated as a literal. */
1978
1979 if (ptr[1] == '-' && ptr[2] != ']')
1980 {
1981 int d;
1982 ptr += 2;
1983
1984 #ifdef SUPPORT_UTF8
1985 if (utf8)
1986 { /* Braces are required because the */
1987 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1988 }
1989 else
1990 #endif
1991 d = *ptr; /* Not UTF-8 mode */
1992
1993 /* The second part of a range can be a single-character escape, but
1994 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1995 in such circumstances. */
1996
1997 if (d == '\\')
1998 {
1999 const uschar *oldptr = ptr;
2000 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2001
2002 /* \b is backspace; \X is literal X; any other special means the '-'
2003 was literal */
2004
2005 if (d < 0)
2006 {
2007 if (d == -ESC_b) d = '\b';
2008 else if (d == -ESC_X) d = 'X'; else
2009 {
2010 ptr = oldptr - 2;
2011 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2012 }
2013 }
2014 }
2015
2016 /* The check that the two values are in the correct order happens in
2017 the pre-pass. Optimize one-character ranges */
2018
2019 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2020
2021 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2022 matching, we have to use an XCLASS with extra data items. Caseless
2023 matching for characters > 127 is available only if UCP support is
2024 available. */
2025
2026 #ifdef SUPPORT_UTF8
2027 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2028 {
2029 class_utf8 = TRUE;
2030
2031 /* With UCP support, we can find the other case equivalents of
2032 the relevant characters. There may be several ranges. Optimize how
2033 they fit with the basic range. */
2034
2035 #ifdef SUPPORT_UCP
2036 if ((options & PCRE_CASELESS) != 0)
2037 {
2038 int occ, ocd;
2039 int cc = c;
2040 int origd = d;
2041 while (get_othercase_range(&cc, origd, &occ, &ocd))
2042 {
2043 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2044
2045 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2046 { /* if there is overlap, */
2047 c = occ; /* noting that if occ < c */
2048 continue; /* we can't have ocd > d */
2049 } /* because a subrange is */
2050 if (ocd > d && occ <= d + 1) /* always shorter than */
2051 { /* the basic range. */
2052 d = ocd;
2053 continue;
2054 }
2055
2056 if (occ == ocd)
2057 {
2058 *class_utf8data++ = XCL_SINGLE;
2059 }
2060 else
2061 {
2062 *class_utf8data++ = XCL_RANGE;
2063 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2064 }
2065 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2066 }
2067 }
2068 #endif /* SUPPORT_UCP */
2069
2070 /* Now record the original range, possibly modified for UCP caseless
2071 overlapping ranges. */
2072
2073 *class_utf8data++ = XCL_RANGE;
2074 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2075 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2076
2077 /* With UCP support, we are done. Without UCP support, there is no
2078 caseless matching for UTF-8 characters > 127; we can use the bit map
2079 for the smaller ones. */
2080
2081 #ifdef SUPPORT_UCP
2082 continue; /* With next character in the class */
2083 #else
2084 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2085
2086 /* Adjust upper limit and fall through to set up the map */
2087
2088 d = 127;
2089
2090 #endif /* SUPPORT_UCP */
2091 }
2092 #endif /* SUPPORT_UTF8 */
2093
2094 /* We use the bit map for all cases when not in UTF-8 mode; else
2095 ranges that lie entirely within 0-127 when there is UCP support; else
2096 for partial ranges without UCP support. */
2097
2098 for (; c <= d; c++)
2099 {
2100 classbits[c/8] |= (1 << (c&7));
2101 if ((options & PCRE_CASELESS) != 0)
2102 {
2103 int uc = cd->fcc[c]; /* flip case */
2104 classbits[uc/8] |= (1 << (uc&7));
2105 }
2106 class_charcount++; /* in case a one-char range */
2107 class_lastchar = c;
2108 }
2109
2110 continue; /* Go get the next char in the class */
2111 }
2112
2113 /* Handle a lone single character - we can get here for a normal
2114 non-escape char, or after \ that introduces a single character or for an
2115 apparent range that isn't. */
2116
2117 LONE_SINGLE_CHARACTER:
2118
2119 /* Handle a character that cannot go in the bit map */
2120
2121 #ifdef SUPPORT_UTF8
2122 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2123 {
2124 class_utf8 = TRUE;
2125 *class_utf8data++ = XCL_SINGLE;
2126 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2127
2128 #ifdef SUPPORT_UCP
2129 if ((options & PCRE_CASELESS) != 0)
2130 {
2131 int chartype;
2132 int othercase;
2133 if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2134 othercase > 0)
2135 {
2136 *class_utf8data++ = XCL_SINGLE;
2137 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2138 }
2139 }
2140 #endif /* SUPPORT_UCP */
2141
2142 }
2143 else
2144 #endif /* SUPPORT_UTF8 */
2145
2146 /* Handle a single-byte character */
2147 {
2148 classbits[c/8] |= (1 << (c&7));
2149 if ((options & PCRE_CASELESS) != 0)
2150 {
2151 c = cd->fcc[c]; /* flip case */
2152 classbits[c/8] |= (1 << (c&7));
2153 }
2154 class_charcount++;
2155 class_lastchar = c;
2156 }
2157 }
2158
2159 /* Loop until ']' reached; the check for end of string happens inside the
2160 loop. This "while" is the end of the "do" above. */
2161
2162 while ((c = *(++ptr)) != ']' || inescq);
2163
2164 /* If class_charcount is 1, we saw precisely one character whose value is
2165 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2166 can optimize the negative case only if there were no characters >= 128
2167 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2168 single-bytes only. This is an historical hangover. Maybe one day we can
2169 tidy these opcodes to handle multi-byte characters.
2170
2171 The optimization throws away the bit map. We turn the item into a
2172 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2173 that OP_NOT does not support multibyte characters. In the positive case, it
2174 can cause firstbyte to be set. Otherwise, there can be no first char if
2175 this item is first, whatever repeat count may follow. In the case of
2176 reqbyte, save the previous value for reinstating. */
2177
2178 #ifdef SUPPORT_UTF8
2179 if (class_charcount == 1 &&
2180 (!utf8 ||
2181 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2182
2183 #else
2184 if (class_charcount == 1)
2185 #endif
2186 {
2187 zeroreqbyte = reqbyte;
2188
2189 /* The OP_NOT opcode works on one-byte characters only. */
2190
2191 if (negate_class)
2192 {
2193 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2194 zerofirstbyte = firstbyte;
2195 *code++ = OP_NOT;
2196 *code++ = class_lastchar;
2197 break;
2198 }
2199
2200 /* For a single, positive character, get the value into mcbuffer, and
2201 then we can handle this with the normal one-character code. */
2202
2203 #ifdef SUPPORT_UTF8
2204 if (utf8 && class_lastchar > 127)
2205 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2206 else
2207 #endif
2208 {
2209 mcbuffer[0] = class_lastchar;
2210 mclength = 1;
2211 }
2212 goto ONE_CHAR;
2213 } /* End of 1-char optimization */
2214
2215 /* The general case - not the one-char optimization. If this is the first
2216 thing in the branch, there can be no first char setting, whatever the
2217 repeat count. Any reqbyte setting must remain unchanged after any kind of
2218 repeat. */
2219
2220 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2221 zerofirstbyte = firstbyte;
2222 zeroreqbyte = reqbyte;
2223
2224 /* If there are characters with values > 255, we have to compile an
2225 extended class, with its own opcode. If there are no characters < 256,
2226 we can omit the bitmap. */
2227
2228 #ifdef SUPPORT_UTF8
2229 if (class_utf8)
2230 {
2231 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2232 *code++ = OP_XCLASS;
2233 code += LINK_SIZE;
2234 *code = negate_class? XCL_NOT : 0;
2235
2236 /* If the map is required, install it, and move on to the end of
2237 the extra data */
2238
2239 if (class_charcount > 0)
2240 {
2241 *code++ |= XCL_MAP;
2242 memcpy(code, classbits, 32);
2243 code = class_utf8data;
2244 }
2245
2246 /* If the map is not required, slide down the extra data. */
2247
2248 else
2249 {
2250 int len = class_utf8data - (code + 33);
2251 memmove(code + 1, code + 33, len);
2252 code += len + 1;
2253 }
2254
2255 /* Now fill in the complete length of the item */
2256
2257 PUT(previous, 1, code - previous);
2258 break; /* End of class handling */
2259 }
2260 #endif
2261
2262 /* If there are no characters > 255, negate the 32-byte map if necessary,
2263 and copy it into the code vector. If this is the first thing in the branch,
2264 there can be no first char setting, whatever the repeat count. Any reqbyte
2265 setting must remain unchanged after any kind of repeat. */
2266
2267 if (negate_class)
2268 {
2269 *code++ = OP_NCLASS;
2270 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2271 }
2272 else
2273 {
2274 *code++ = OP_CLASS;
2275 memcpy(code, classbits, 32);
2276 }
2277 code += 32;
2278 break;
2279
2280 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2281 has been tested above. */
2282
2283 case '{':
2284 if (!is_quantifier) goto NORMAL_CHAR;
2285 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2286 if (*errorcodeptr != 0) goto FAILED;
2287 goto REPEAT;
2288
2289 case '*':
2290 repeat_min = 0;
2291 repeat_max = -1;
2292 goto REPEAT;
2293
2294 case '+':
2295 repeat_min = 1;
2296 repeat_max = -1;
2297 goto REPEAT;
2298
2299 case '?':
2300 repeat_min = 0;
2301 repeat_max = 1;
2302
2303 REPEAT:
2304 if (previous == NULL)
2305 {
2306 *errorcodeptr = ERR9;
2307 goto FAILED;
2308 }
2309
2310 if (repeat_min == 0)
2311 {
2312 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2313 reqbyte = zeroreqbyte; /* Ditto */
2314 }
2315
2316 /* Remember whether this is a variable length repeat */
2317
2318 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2319
2320 op_type = 0; /* Default single-char op codes */
2321 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2322
2323 /* Save start of previous item, in case we have to move it up to make space
2324 for an inserted OP_ONCE for the additional '+' extension. */
2325
2326 tempcode = previous;
2327
2328 /* If the next character is '+', we have a possessive quantifier. This
2329 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2330 If the next character is '?' this is a minimizing repeat, by default,
2331 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2332 repeat type to the non-default. */
2333
2334 if (ptr[1] == '+')
2335 {
2336 repeat_type = 0; /* Force greedy */
2337 possessive_quantifier = TRUE;
2338 ptr++;
2339 }
2340 else if (ptr[1] == '?')
2341 {
2342 repeat_type = greedy_non_default;
2343 ptr++;
2344 }
2345 else repeat_type = greedy_default;
2346
2347 /* If previous was a recursion, we need to wrap it inside brackets so that
2348 it can be replicated if necessary. */
2349
2350 if (*previous == OP_RECURSE)
2351 {
2352 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2353 code += 1 + LINK_SIZE;
2354 *previous = OP_BRA;
2355 PUT(previous, 1, code - previous);
2356 *code = OP_KET;
2357 PUT(code, 1, code - previous);
2358 code += 1 + LINK_SIZE;
2359 }
2360
2361 /* If previous was a character match, abolish the item and generate a
2362 repeat item instead. If a char item has a minimum of more than one, ensure
2363 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2364 the first thing in a branch because the x will have gone into firstbyte
2365 instead. */
2366
2367 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2368 {
2369 /* Deal with UTF-8 characters that take up more than one byte. It's
2370 easier to write this out separately than try to macrify it. Use c to
2371 hold the length of the character in bytes, plus 0x80 to flag that it's a
2372 length rather than a small character. */
2373
2374 #ifdef SUPPORT_UTF8
2375 if (utf8 && (code[-1] & 0x80) != 0)
2376 {
2377 uschar *lastchar = code - 1;
2378 while((*lastchar & 0xc0) == 0x80) lastchar--;
2379 c = code - lastchar; /* Length of UTF-8 character */
2380 memcpy(utf8_char, lastchar, c); /* Save the char */
2381 c |= 0x80; /* Flag c as a length */
2382 }
2383 else
2384 #endif
2385
2386 /* Handle the case of a single byte - either with no UTF8 support, or
2387 with UTF-8 disabled, or for a UTF-8 character < 128. */
2388
2389 {
2390 c = code[-1];
2391 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2392 }
2393
2394 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2395 }
2396
2397 /* If previous was a single negated character ([^a] or similar), we use
2398 one of the special opcodes, replacing it. The code is shared with single-
2399 character repeats by setting op_type to add a suitable offset into
2400 repeat_type. OP_NOT is currently used only for single-byte chars. */
2401
2402 else if (*previous == OP_NOT)
2403 {
2404 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2405 c = previous[1];
2406 goto OUTPUT_SINGLE_REPEAT;
2407 }
2408
2409 /* If previous was a character type match (\d or similar), abolish it and
2410 create a suitable repeat item. The code is shared with single-character
2411 repeats by setting op_type to add a suitable offset into repeat_type. Note
2412 that the Unicode property types will be present only when SUPPORT_UCP is
2413 defined, but we don't wrap the little bits of code here because it just
2414 makes it horribly messy. */
2415
2416 else if (*previous < OP_EODN)
2417 {
2418 uschar *oldcode;
2419 int prop_type;
2420 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2421 c = *previous;
2422
2423 OUTPUT_SINGLE_REPEAT:
2424 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2425 previous[1] : -1;
2426
2427 oldcode = code;
2428 code = previous; /* Usually overwrite previous item */
2429
2430 /* If the maximum is zero then the minimum must also be zero; Perl allows
2431 this case, so we do too - by simply omitting the item altogether. */
2432
2433 if (repeat_max == 0) goto END_REPEAT;
2434
2435 /* All real repeats make it impossible to handle partial matching (maybe
2436 one day we will be able to remove this restriction). */
2437
2438 if (repeat_max != 1) cd->nopartial = TRUE;
2439
2440 /* Combine the op_type with the repeat_type */
2441
2442 repeat_type += op_type;
2443
2444 /* A minimum of zero is handled either as the special case * or ?, or as
2445 an UPTO, with the maximum given. */
2446
2447 if (repeat_min == 0)
2448 {
2449 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2450 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2451 else
2452 {
2453 *code++ = OP_UPTO + repeat_type;
2454 PUT2INC(code, 0, repeat_max);
2455 }
2456 }
2457
2458 /* A repeat minimum of 1 is optimized into some special cases. If the
2459 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2460 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2461 one less than the maximum. */
2462
2463 else if (repeat_min == 1)
2464 {
2465 if (repeat_max == -1)
2466 *code++ = OP_PLUS + repeat_type;
2467 else
2468 {
2469 code = oldcode; /* leave previous item in place */
2470 if (repeat_max == 1) goto END_REPEAT;
2471 *code++ = OP_UPTO + repeat_type;
2472 PUT2INC(code, 0, repeat_max - 1);
2473 }
2474 }
2475
2476 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2477 handled as an EXACT followed by an UPTO. */
2478
2479 else
2480 {
2481 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2482 PUT2INC(code, 0, repeat_min);
2483
2484 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2485 we have to insert the character for the previous code. For a repeated
2486 Unicode property match, there is an extra byte that defines the
2487 required property. In UTF-8 mode, long characters have their length in
2488 c, with the 0x80 bit as a flag. */
2489
2490 if (repeat_max < 0)
2491 {
2492 #ifdef SUPPORT_UTF8
2493 if (utf8 && c >= 128)
2494 {
2495 memcpy(code, utf8_char, c & 7);
2496 code += c & 7;
2497 }
2498 else
2499 #endif
2500 {
2501 *code++ = c;
2502 if (prop_type >= 0) *code++ = prop_type;
2503 }
2504 *code++ = OP_STAR + repeat_type;
2505 }
2506
2507 /* Else insert an UPTO if the max is greater than the min, again
2508 preceded by the character, for the previously inserted code. */
2509
2510 else if (repeat_max != repeat_min)
2511 {
2512 #ifdef SUPPORT_UTF8
2513 if (utf8 && c >= 128)
2514 {
2515 memcpy(code, utf8_char, c & 7);
2516 code += c & 7;
2517 }
2518 else
2519 #endif
2520 *code++ = c;
2521 if (prop_type >= 0) *code++ = prop_type;
2522 repeat_max -= repeat_min;
2523 *code++ = OP_UPTO + repeat_type;
2524 PUT2INC(code, 0, repeat_max);
2525 }
2526 }
2527
2528 /* The character or character type itself comes last in all cases. */
2529
2530 #ifdef SUPPORT_UTF8
2531 if (utf8 && c >= 128)
2532 {
2533 memcpy(code, utf8_char, c & 7);
2534 code += c & 7;
2535 }
2536 else
2537 #endif
2538 *code++ = c;
2539
2540 /* For a repeated Unicode property match, there is an extra byte that
2541 defines the required property. */
2542
2543 #ifdef SUPPORT_UCP
2544 if (prop_type >= 0) *code++ = prop_type;
2545 #endif
2546 }
2547
2548 /* If previous was a character class or a back reference, we put the repeat
2549 stuff after it, but just skip the item if the repeat was {0,0}. */
2550
2551 else if (*previous == OP_CLASS ||
2552 *previous == OP_NCLASS ||
2553 #ifdef SUPPORT_UTF8
2554 *previous == OP_XCLASS ||
2555 #endif
2556 *previous == OP_REF)
2557 {
2558 if (repeat_max == 0)
2559 {
2560 code = previous;
2561 goto END_REPEAT;
2562 }
2563
2564 /* All real repeats make it impossible to handle partial matching (maybe
2565 one day we will be able to remove this restriction). */
2566
2567 if (repeat_max != 1) cd->nopartial = TRUE;
2568
2569 if (repeat_min == 0 && repeat_max == -1)
2570 *code++ = OP_CRSTAR + repeat_type;
2571 else if (repeat_min == 1 && repeat_max == -1)
2572 *code++ = OP_CRPLUS + repeat_type;
2573 else if (repeat_min == 0 && repeat_max == 1)
2574 *code++ = OP_CRQUERY + repeat_type;
2575 else
2576 {
2577 *code++ = OP_CRRANGE + repeat_type;
2578 PUT2INC(code, 0, repeat_min);
2579 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2580 PUT2INC(code, 0, repeat_max);
2581 }
2582 }
2583
2584 /* If previous was a bracket group, we may have to replicate it in certain
2585 cases. */
2586
2587 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2588 *previous == OP_COND)
2589 {
2590 register int i;
2591 int ketoffset = 0;
2592 int len = code - previous;
2593 uschar *bralink = NULL;
2594
2595 /* If the maximum repeat count is unlimited, find the end of the bracket
2596 by scanning through from the start, and compute the offset back to it
2597 from the current code pointer. There may be an OP_OPT setting following
2598 the final KET, so we can't find the end just by going back from the code
2599 pointer. */
2600
2601 if (repeat_max == -1)
2602 {
2603 register uschar *ket = previous;
2604 do ket += GET(ket, 1); while (*ket != OP_KET);
2605 ketoffset = code - ket;
2606 }
2607
2608 /* The case of a zero minimum is special because of the need to stick
2609 OP_BRAZERO in front of it, and because the group appears once in the
2610 data, whereas in other cases it appears the minimum number of times. For
2611 this reason, it is simplest to treat this case separately, as otherwise
2612 the code gets far too messy. There are several special subcases when the
2613 minimum is zero. */
2614
2615 if (repeat_min == 0)
2616 {
2617 /* If the maximum is also zero, we just omit the group from the output
2618 altogether. */
2619
2620 if (repeat_max == 0)
2621 {
2622 code = previous;
2623 goto END_REPEAT;
2624 }
2625
2626 /* If the maximum is 1 or unlimited, we just have to stick in the
2627 BRAZERO and do no more at this point. However, we do need to adjust
2628 any OP_RECURSE calls inside the group that refer to the group itself or
2629 any internal group, because the offset is from the start of the whole
2630 regex. Temporarily terminate the pattern while doing this. */
2631
2632 if (repeat_max <= 1)
2633 {
2634 *code = OP_END;
2635 adjust_recurse(previous, 1, utf8, cd);
2636 memmove(previous+1, previous, len);
2637 code++;
2638 *previous++ = OP_BRAZERO + repeat_type;
2639 }
2640
2641 /* If the maximum is greater than 1 and limited, we have to replicate
2642 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2643 The first one has to be handled carefully because it's the original
2644 copy, which has to be moved up. The remainder can be handled by code
2645 that is common with the non-zero minimum case below. We have to
2646 adjust the value of repeat_max, since one less copy is required. Once
2647 again, we may have to adjust any OP_RECURSE calls inside the group. */
2648
2649 else
2650 {
2651 int offset;
2652 *code = OP_END;
2653 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2654 memmove(previous + 2 + LINK_SIZE, previous, len);
2655 code += 2 + LINK_SIZE;
2656 *previous++ = OP_BRAZERO + repeat_type;
2657 *previous++ = OP_BRA;
2658
2659 /* We chain together the bracket offset fields that have to be
2660 filled in later when the ends of the brackets are reached. */
2661
2662 offset = (bralink == NULL)? 0 : previous - bralink;
2663 bralink = previous;
2664 PUTINC(previous, 0, offset);
2665 }
2666
2667 repeat_max--;
2668 }
2669
2670 /* If the minimum is greater than zero, replicate the group as many
2671 times as necessary, and adjust the maximum to the number of subsequent
2672 copies that we need. If we set a first char from the group, and didn't
2673 set a required char, copy the latter from the former. */
2674
2675 else
2676 {
2677 if (repeat_min > 1)
2678 {
2679 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2680 for (i = 1; i < repeat_min; i++)
2681 {
2682 memcpy(code, previous, len);
2683 code += len;
2684 }
2685 }
2686 if (repeat_max > 0) repeat_max -= repeat_min;
2687 }
2688
2689 /* This code is common to both the zero and non-zero minimum cases. If
2690 the maximum is limited, it replicates the group in a nested fashion,
2691 remembering the bracket starts on a stack. In the case of a zero minimum,
2692 the first one was set up above. In all cases the repeat_max now specifies
2693 the number of additional copies needed. */
2694
2695 if (repeat_max >= 0)
2696 {
2697 for (i = repeat_max - 1; i >= 0; i--)
2698 {
2699 *code++ = OP_BRAZERO + repeat_type;
2700
2701 /* All but the final copy start a new nesting, maintaining the
2702 chain of brackets outstanding. */
2703
2704 if (i != 0)
2705 {
2706 int offset;
2707 *code++ = OP_BRA;
2708 offset = (bralink == NULL)? 0 : code - bralink;
2709 bralink = code;
2710 PUTINC(code, 0, offset);
2711 }
2712
2713 memcpy(code, previous, len);
2714 code += len;
2715 }
2716
2717 /* Now chain through the pending brackets, and fill in their length
2718 fields (which are holding the chain links pro tem). */
2719
2720 while (bralink != NULL)
2721 {
2722 int oldlinkoffset;
2723 int offset = code - bralink + 1;
2724 uschar *bra = code - offset;
2725 oldlinkoffset = GET(bra, 1);
2726 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2727 *code++ = OP_KET;
2728 PUTINC(code, 0, offset);
2729 PUT(bra, 1, offset);
2730 }
2731 }
2732
2733 /* If the maximum is unlimited, set a repeater in the final copy. We
2734 can't just offset backwards from the current code point, because we
2735 don't know if there's been an options resetting after the ket. The
2736 correct offset was computed above. */
2737
2738 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2739 }
2740
2741 /* Else there's some kind of shambles */
2742
2743 else
2744 {
2745 *errorcodeptr = ERR11;
2746 goto FAILED;
2747 }
2748
2749 /* If the character following a repeat is '+', we wrap the entire repeated
2750 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2751 Sun's Java package. The repeated item starts at tempcode, not at previous,
2752 which might be the first part of a string whose (former) last char we
2753 repeated. However, we don't support '+' after a greediness '?'. */
2754
2755 if (possessive_quantifier)
2756 {
2757 int len = code - tempcode;
2758 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2759 code += 1 + LINK_SIZE;
2760 len += 1 + LINK_SIZE;
2761 tempcode[0] = OP_ONCE;
2762 *code++ = OP_KET;
2763 PUTINC(code, 0, len);
2764 PUT(tempcode, 1, len);
2765 }
2766
2767 /* In all cases we no longer have a previous item. We also set the
2768 "follows varying string" flag for subsequently encountered reqbytes if
2769 it isn't already set and we have just passed a varying length item. */
2770
2771 END_REPEAT:
2772 previous = NULL;
2773 cd->req_varyopt |= reqvary;
2774 break;
2775
2776
2777 /* Start of nested bracket sub-expression, or comment or lookahead or
2778 lookbehind or option setting or condition. First deal with special things
2779 that can come after a bracket; all are introduced by ?, and the appearance
2780 of any of them means that this is not a referencing group. They were
2781 checked for validity in the first pass over the string, so we don't have to
2782 check for syntax errors here. */
2783
2784 case '(':
2785 newoptions = options;
2786 skipbytes = 0;
2787
2788 if (*(++ptr) == '?')
2789 {
2790 int set, unset;
2791 int *optset;
2792
2793 switch (*(++ptr))
2794 {
2795 case '#': /* Comment; skip to ket */
2796 ptr++;
2797 while (*ptr != ')') ptr++;
2798 continue;
2799
2800 case ':': /* Non-extracting bracket */
2801 bravalue = OP_BRA;
2802 ptr++;
2803 break;
2804
2805 case '(':
2806 bravalue = OP_COND; /* Conditional group */
2807
2808 /* Condition to test for recursion */
2809
2810 if (ptr[1] == 'R')
2811 {
2812 code[1+LINK_SIZE] = OP_CREF;
2813 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2814 skipbytes = 3;
2815 ptr += 3;
2816 }
2817
2818 /* Condition to test for a numbered subpattern match. We know that
2819 if a digit follows ( then there will just be digits until ) because
2820 the syntax was checked in the first pass. */
2821
2822 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2823 {
2824 int condref; /* Don't amalgamate; some compilers */
2825 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2826 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2827 if (condref == 0)
2828 {
2829 *errorcodeptr = ERR35;
2830 goto FAILED;
2831 }
2832 ptr++;
2833 code[1+LINK_SIZE] = OP_CREF;
2834 PUT2(code, 2+LINK_SIZE, condref);
2835 skipbytes = 3;
2836 }
2837 /* For conditions that are assertions, we just fall through, having
2838 set bravalue above. */
2839 break;
2840
2841 case '=': /* Positive lookahead */
2842 bravalue = OP_ASSERT;
2843 ptr++;
2844 break;
2845
2846 case '!': /* Negative lookahead */
2847 bravalue = OP_ASSERT_NOT;
2848 ptr++;
2849 break;
2850
2851 case '<': /* Lookbehinds */
2852 switch (*(++ptr))
2853 {
2854 case '=': /* Positive lookbehind */
2855 bravalue = OP_ASSERTBACK;
2856 ptr++;
2857 break;
2858
2859 case '!': /* Negative lookbehind */
2860 bravalue = OP_ASSERTBACK_NOT;
2861 ptr++;
2862 break;
2863 }
2864 break;
2865
2866 case '>': /* One-time brackets */
2867 bravalue = OP_ONCE;
2868 ptr++;
2869 break;
2870
2871 case 'C': /* Callout - may be followed by digits; */
2872 previous_callout = code; /* Save for later completion */
2873 after_manual_callout = 1; /* Skip one item before completing */
2874 *code++ = OP_CALLOUT; /* Already checked that the terminating */
2875 { /* closing parenthesis is present. */
2876 int n = 0;
2877 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2878 n = n * 10 + *ptr - '0';
2879 if (n > 255)
2880 {
2881 *errorcodeptr = ERR38;
2882 goto FAILED;
2883 }
2884 *code++ = n;
2885 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2886 PUT(code, LINK_SIZE, 0); /* Default length */
2887 code += 2 * LINK_SIZE;
2888 }
2889 previous = NULL;
2890 continue;
2891
2892 case 'P': /* Named subpattern handling */
2893 if (*(++ptr) == '<') /* Definition */
2894 {
2895 int i, namelen;
2896 uschar *slot = cd->name_table;
2897 const uschar *name; /* Don't amalgamate; some compilers */
2898 name = ++ptr; /* grumble at autoincrement in declaration */
2899
2900 while (*ptr++ != '>');
2901 namelen = ptr - name - 1;
2902
2903 for (i = 0; i < cd->names_found; i++)
2904 {
2905 int crc = memcmp(name, slot+2, namelen);
2906 if (crc == 0)
2907 {
2908 if (slot[2+namelen] == 0)
2909 {
2910 *errorcodeptr = ERR43;
2911 goto FAILED;
2912 }
2913 crc = -1; /* Current name is substring */
2914 }
2915 if (crc < 0)
2916 {
2917 memmove(slot + cd->name_entry_size, slot,
2918 (cd->names_found - i) * cd->name_entry_size);
2919 break;
2920 }
2921 slot += cd->name_entry_size;
2922 }
2923
2924 PUT2(slot, 0, *brackets + 1);
2925 memcpy(slot + 2, name, namelen);
2926 slot[2+namelen] = 0;
2927 cd->names_found++;
2928 goto NUMBERED_GROUP;
2929 }
2930
2931 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2932 {
2933 int i, namelen;
2934 int type = *ptr++;
2935 const uschar *name = ptr;
2936 uschar *slot = cd->name_table;
2937
2938 while (*ptr != ')') ptr++;
2939 namelen = ptr - name;
2940
2941 for (i = 0; i < cd->names_found; i++)
2942 {
2943 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2944 slot += cd->name_entry_size;
2945 }
2946 if (i >= cd->names_found)
2947 {
2948 *errorcodeptr = ERR15;
2949 goto FAILED;
2950 }
2951
2952 recno = GET2(slot, 0);
2953
2954 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2955
2956 /* Back reference */
2957
2958 previous = code;
2959 *code++ = OP_REF;
2960 PUT2INC(code, 0, recno);
2961 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2962 if (recno > cd->top_backref) cd->top_backref = recno;
2963 continue;
2964 }
2965
2966 /* Should never happen */
2967 break;
2968
2969 case 'R': /* Pattern recursion */
2970 ptr++; /* Same as (?0) */
2971 /* Fall through */
2972
2973 /* Recursion or "subroutine" call */
2974
2975 case '0': case '1': case '2': case '3': case '4':
2976 case '5': case '6': case '7': case '8': case '9':
2977 {
2978 const uschar *called;
2979 recno = 0;
2980 while((digitab[*ptr] & ctype_digit) != 0)
2981 recno = recno * 10 + *ptr++ - '0';
2982
2983 /* Come here from code above that handles a named recursion */
2984
2985 HANDLE_RECURSION:
2986
2987 previous = code;
2988
2989 /* Find the bracket that is being referenced. Temporarily end the
2990 regex in case it doesn't exist. */
2991
2992 *code = OP_END;
2993 called = (recno == 0)?
2994 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2995
2996 if (called == NULL)
2997 {
2998 *errorcodeptr = ERR15;
2999 goto FAILED;
3000 }
3001
3002 /* If the subpattern is still open, this is a recursive call. We
3003 check to see if this is a left recursion that could loop for ever,
3004 and diagnose that case. */
3005
3006 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3007 {
3008 *errorcodeptr = ERR40;
3009 goto FAILED;
3010 }
3011
3012 /* Insert the recursion/subroutine item */
3013
3014 *code = OP_RECURSE;
3015 PUT(code, 1, called - cd->start_code);
3016 code += 1 + LINK_SIZE;
3017 }
3018 continue;
3019
3020 /* Character after (? not specially recognized */
3021
3022 default: /* Option setting */
3023 set = unset = 0;
3024 optset = &set;
3025
3026 while (*ptr != ')' && *ptr != ':')
3027 {
3028 switch (*ptr++)
3029 {
3030 case '-': optset = &unset; break;
3031
3032 case 'i': *optset |= PCRE_CASELESS; break;
3033 case 'm': *optset |= PCRE_MULTILINE; break;
3034 case 's': *optset |= PCRE_DOTALL; break;
3035 case 'x': *optset |= PCRE_EXTENDED; break;
3036 case 'U': *optset |= PCRE_UNGREEDY; break;
3037 case 'X': *optset |= PCRE_EXTRA; break;
3038 }
3039 }
3040
3041 /* Set up the changed option bits, but don't change anything yet. */
3042
3043 newoptions = (options | set) & (~unset);
3044
3045 /* If the options ended with ')' this is not the start of a nested
3046 group with option changes, so the options change at this level. Compile
3047 code to change the ims options if this setting actually changes any of
3048 them. We also pass the new setting back so that it can be put at the
3049 start of any following branches, and when this group ends (if we are in
3050 a group), a resetting item can be compiled.
3051
3052 Note that if this item is right at the start of the pattern, the
3053 options will have been abstracted and made global, so there will be no
3054 change to compile. */
3055
3056 if (*ptr == ')')
3057 {
3058 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3059 {
3060 *code++ = OP_OPT;
3061 *code++ = newoptions & PCRE_IMS;
3062 }
3063
3064 /* Change options at this level, and pass them back for use
3065 in subsequent branches. Reset the greedy defaults and the case
3066 value for firstbyte and reqbyte. */
3067
3068 *optionsptr = options = newoptions;
3069 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3070 greedy_non_default = greedy_default ^ 1;
3071 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3072
3073 previous = NULL; /* This item can't be repeated */
3074 continue; /* It is complete */
3075 }
3076
3077 /* If the options ended with ':' we are heading into a nested group
3078 with possible change of options. Such groups are non-capturing and are
3079 not assertions of any kind. All we need to do is skip over the ':';
3080 the newoptions value is handled below. */
3081
3082 bravalue = OP_BRA;
3083 ptr++;
3084 }
3085 }
3086
3087 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3088 non-capturing and behave like (?:...) brackets */
3089
3090 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3091 {
3092 bravalue = OP_BRA;
3093 }
3094
3095 /* Else we have a referencing group; adjust the opcode. If the bracket
3096 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3097 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3098
3099 else
3100 {
3101 NUMBERED_GROUP:
3102 if (++(*brackets) > EXTRACT_BASIC_MAX)
3103 {
3104 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3105 code[1+LINK_SIZE] = OP_BRANUMBER;
3106 PUT2(code, 2+LINK_SIZE, *brackets);
3107 skipbytes = 3;
3108 }
3109 else bravalue = OP_BRA + *brackets;
3110 }
3111
3112 /* Process nested bracketed re. Assertions may not be repeated, but other
3113 kinds can be. We copy code into a non-register variable in order to be able
3114 to pass its address because some compilers complain otherwise. Pass in a
3115 new setting for the ims options if they have changed. */
3116
3117 previous = (bravalue >= OP_ONCE)? code : NULL;
3118 *code = bravalue;
3119 tempcode = code;
3120 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3121
3122 if (!compile_regex(
3123 newoptions, /* The complete new option state */
3124 options & PCRE_IMS, /* The previous ims option state */
3125 brackets, /* Extracting bracket count */
3126 &tempcode, /* Where to put code (updated) */
3127 &ptr, /* Input pointer (updated) */
3128 errorcodeptr, /* Where to put an error message */
3129 (bravalue == OP_ASSERTBACK ||
3130 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3131 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3132 &subfirstbyte, /* For possible first char */
3133 &subreqbyte, /* For possible last char */
3134 bcptr, /* Current branch chain */
3135 cd)) /* Tables block */
3136 goto FAILED;
3137
3138 /* At the end of compiling, code is still pointing to the start of the
3139 group, while tempcode has been updated to point past the end of the group
3140 and any option resetting that may follow it. The pattern pointer (ptr)
3141 is on the bracket. */
3142
3143 /* If this is a conditional bracket, check that there are no more than
3144 two branches in the group. */
3145
3146 else if (bravalue == OP_COND)
3147 {
3148 uschar *tc = code;
3149 condcount = 0;
3150
3151 do {
3152 condcount++;
3153 tc += GET(tc,1);
3154 }
3155 while (*tc != OP_KET);
3156
3157 if (condcount > 2)
3158 {
3159 *errorcodeptr = ERR27;
3160 goto FAILED;
3161 }
3162
3163 /* If there is just one branch, we must not make use of its firstbyte or
3164 reqbyte, because this is equivalent to an empty second branch. */
3165
3166 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3167 }
3168
3169 /* Handle updating of the required and first characters. Update for normal
3170 brackets of all kinds, and conditions with two branches (see code above).
3171 If the bracket is followed by a quantifier with zero repeat, we have to
3172 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3173 main loop so that they can be accessed for the back off. */
3174
3175 zeroreqbyte = reqbyte;
3176 zerofirstbyte = firstbyte;
3177 groupsetfirstbyte = FALSE;
3178
3179 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3180 {
3181 /* If we have not yet set a firstbyte in this branch, take it from the
3182 subpattern, remembering that it was set here so that a repeat of more
3183 than one can replicate it as reqbyte if necessary. If the subpattern has
3184 no firstbyte, set "none" for the whole branch. In both cases, a zero
3185 repeat forces firstbyte to "none". */
3186
3187 if (firstbyte == REQ_UNSET)
3188 {
3189 if (subfirstbyte >= 0)
3190 {
3191 firstbyte = subfirstbyte;
3192 groupsetfirstbyte = TRUE;
3193 }
3194 else firstbyte = REQ_NONE;
3195 zerofirstbyte = REQ_NONE;
3196 }
3197
3198 /* If firstbyte was previously set, convert the subpattern's firstbyte
3199 into reqbyte if there wasn't one, using the vary flag that was in
3200 existence beforehand. */
3201
3202 else if (subfirstbyte >= 0 && subreqbyte < 0)
3203 subreqbyte = subfirstbyte | tempreqvary;
3204
3205 /* If the subpattern set a required byte (or set a first byte that isn't
3206 really the first byte - see above), set it. */
3207
3208 if (subreqbyte >= 0) reqbyte = subreqbyte;
3209 }
3210
3211 /* For a forward assertion, we take the reqbyte, if set. This can be
3212 helpful if the pattern that follows the assertion doesn't set a different
3213 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3214 for an assertion, however because it leads to incorrect effect for patterns
3215 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3216 of a firstbyte. This is overcome by a scan at the end if there's no
3217 firstbyte, looking for an asserted first char. */
3218
3219 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3220
3221 /* Now update the main code pointer to the end of the group. */
3222
3223 code = tempcode;
3224
3225 /* Error if hit end of pattern */
3226
3227 if (*ptr != ')')
3228 {
3229 *errorcodeptr = ERR14;
3230 goto FAILED;
3231 }
3232 break;
3233
3234 /* Check \ for being a real metacharacter; if not, fall through and handle
3235 it as a data character at the start of a string. Escape items are checked
3236 for validity in the pre-compiling pass. */
3237
3238 case '\\':
3239 tempptr = ptr;
3240 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3241
3242 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3243 are arranged to be the negation of the corresponding OP_values. For the
3244 back references, the values are ESC_REF plus the reference number. Only
3245 back references and those types that consume a character may be repeated.
3246 We can test for values between ESC_b and ESC_Z for the latter; this may
3247 have to change if any new ones are ever created. */
3248
3249 if (c < 0)
3250 {
3251 if (-c == ESC_Q) /* Handle start of quoted string */
3252 {
3253 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3254 else inescq = TRUE;
3255 continue;
3256 }
3257
3258 /* For metasequences that actually match a character, we disable the
3259 setting of a first character if it hasn't already been set. */
3260
3261 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3262 firstbyte = REQ_NONE;
3263
3264 /* Set values to reset to if this is followed by a zero repeat. */
3265
3266 zerofirstbyte = firstbyte;
3267 zeroreqbyte = reqbyte;
3268
3269 /* Back references are handled specially */
3270
3271 if (-c >= ESC_REF)
3272 {
3273 int number = -c - ESC_REF;
3274 previous = code;
3275 *code++ = OP_REF;
3276 PUT2INC(code, 0, number);
3277 }
3278
3279 /* So are Unicode property matches, if supported. We know that get_ucp
3280 won't fail because it was tested in the pre-pass. */
3281
3282 #ifdef SUPPORT_UCP
3283 else if (-c == ESC_P || -c == ESC_p)
3284 {
3285 BOOL negated;
3286 int value = get_ucp(&ptr, &negated, errorcodeptr);
3287 previous = code;
3288 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3289 *code++ = value;
3290 }
3291 #endif
3292
3293 /* For the rest, we can obtain the OP value by negating the escape
3294 value */
3295
3296 else
3297 {
3298 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3299 *code++ = -c;
3300 }
3301 continue;
3302 }
3303
3304 /* We have a data character whose value is in c. In UTF-8 mode it may have
3305 a value > 127. We set its representation in the length/buffer, and then
3306 handle it as a data character. */
3307
3308 #ifdef SUPPORT_UTF8
3309 if (utf8 && c > 127)
3310 mclength = _pcre_ord2utf8(c, mcbuffer);
3311 else
3312 #endif
3313
3314 {
3315 mcbuffer[0] = c;
3316 mclength = 1;
3317 }
3318
3319 goto ONE_CHAR;
3320
3321 /* Handle a literal character. It is guaranteed not to be whitespace or #
3322 when the extended flag is set. If we are in UTF-8 mode, it may be a
3323 multi-byte literal character. */
3324
3325 default:
3326 NORMAL_CHAR:
3327 mclength = 1;
3328 mcbuffer[0] = c;
3329
3330 #ifdef SUPPORT_UTF8
3331 if (utf8 && (c & 0xc0) == 0xc0)
3332 {
3333 while ((ptr[1] & 0xc0) == 0x80)
3334 mcbuffer[mclength++] = *(++ptr);
3335 }
3336 #endif
3337
3338 /* At this point we have the character's bytes in mcbuffer, and the length
3339 in mclength. When not in UTF-8 mode, the length is always 1. */
3340
3341 ONE_CHAR:
3342 previous = code;
3343 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3344 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3345
3346 /* Set the first and required bytes appropriately. If no previous first
3347 byte, set it from this character, but revert to none on a zero repeat.
3348 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3349 repeat. */
3350
3351 if (firstbyte == REQ_UNSET)
3352 {
3353 zerofirstbyte = REQ_NONE;
3354 zeroreqbyte = reqbyte;
3355
3356 /* If the character is more than one byte long, we can set firstbyte
3357 only if it is not to be matched caselessly. */
3358
3359 if (mclength == 1 || req_caseopt == 0)
3360 {
3361 firstbyte = mcbuffer[0] | req_caseopt;
3362 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3363 }
3364 else firstbyte = reqbyte = REQ_NONE;
3365 }
3366
3367 /* firstbyte was previously set; we can set reqbyte only if the length is
3368 1 or the matching is caseful. */
3369
3370 else
3371 {
3372 zerofirstbyte = firstbyte;
3373 zeroreqbyte = reqbyte;
3374 if (mclength == 1 || req_caseopt == 0)
3375 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3376 }
3377
3378 break; /* End of literal character handling */
3379 }
3380 } /* end of big loop */
3381
3382 /* Control never reaches here by falling through, only by a goto for all the
3383 error states. Pass back the position in the pattern so that it can be displayed
3384 to the user for diagnosing the error. */
3385
3386 FAILED:
3387 *ptrptr = ptr;
3388 return FALSE;
3389 }
3390
3391
3392
3393
3394 /*************************************************
3395 * Compile sequence of alternatives *
3396 *************************************************/
3397
3398 /* On entry, ptr is pointing past the bracket character, but on return
3399 it points to the closing bracket, or vertical bar, or end of string.
3400 The code variable is pointing at the byte into which the BRA operator has been
3401 stored. If the ims options are changed at the start (for a (?ims: group) or
3402 during any branch, we need to insert an OP_OPT item at the start of every
3403 following branch to ensure they get set correctly at run time, and also pass
3404 the new options into every subsequent branch compile.
3405
3406 Argument:
3407 options option bits, including any changes for this subpattern
3408 oldims previous settings of ims option bits
3409 brackets -> int containing the number of extracting brackets used
3410 codeptr -> the address of the current code pointer
3411 ptrptr -> the address of the current pattern pointer
3412 errorcodeptr -> pointer to error code variable
3413 lookbehind TRUE if this is a lookbehind assertion
3414 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3415 firstbyteptr place to put the first required character, or a negative number
3416 reqbyteptr place to put the last required character, or a negative number
3417 bcptr pointer to the chain of currently open branches
3418 cd points to the data block with tables pointers etc.
3419
3420 Returns: TRUE on success
3421 */
3422
3423 static BOOL
3424 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3425 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3426 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3427 {
3428 const uschar *ptr = *ptrptr;
3429 uschar *code = *codeptr;
3430 uschar *last_branch = code;
3431 uschar *start_bracket = code;
3432 uschar *reverse_count = NULL;
3433 int firstbyte, reqbyte;
3434 int branchfirstbyte, branchreqbyte;
3435 branch_chain bc;
3436
3437 bc.outer = bcptr;
3438 bc.current = code;
3439
3440 firstbyte = reqbyte = REQ_UNSET;
3441
3442 /* Offset is set zero to mark that this bracket is still open */
3443
3444 PUT(code, 1, 0);
3445 code += 1 + LINK_SIZE + skipbytes;
3446
3447 /* Loop for each alternative branch */
3448
3449 for (;;)
3450 {
3451 /* Handle a change of ims options at the start of the branch */
3452
3453 if ((options & PCRE_IMS) != oldims)
3454 {
3455 *code++ = OP_OPT;
3456 *code++ = options & PCRE_IMS;
3457 }
3458
3459 /* Set up dummy OP_REVERSE if lookbehind assertion */
3460
3461 if (lookbehind)
3462 {
3463 *code++ = OP_REVERSE;
3464 reverse_count = code;
3465 PUTINC(code, 0, 0);
3466 }
3467
3468 /* Now compile the branch */
3469
3470 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3471 &branchfirstbyte, &branchreqbyte, &bc, cd))
3472 {
3473 *ptrptr = ptr;
3474 return FALSE;
3475 }
3476
3477 /* If this is the first branch, the firstbyte and reqbyte values for the
3478 branch become the values for the regex. */
3479
3480 if (*last_branch != OP_ALT)
3481 {
3482 firstbyte = branchfirstbyte;
3483 reqbyte = branchreqbyte;
3484 }
3485
3486 /* If this is not the first branch, the first char and reqbyte have to
3487 match the values from all the previous branches, except that if the previous
3488 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3489 REQ_VARY for the regex. */
3490
3491 else
3492 {
3493 /* If we previously had a firstbyte, but it doesn't match the new branch,
3494 we have to abandon the firstbyte for the regex, but if there was previously
3495 no reqbyte, it takes on the value of the old firstbyte. */
3496
3497 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3498 {
3499 if (reqbyte < 0) reqbyte = firstbyte;
3500 firstbyte = REQ_NONE;
3501 }
3502
3503 /* If we (now or from before) have no firstbyte, a firstbyte from the
3504 branch becomes a reqbyte if there isn't a branch reqbyte. */
3505
3506 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3507 branchreqbyte = branchfirstbyte;
3508
3509 /* Now ensure that the reqbytes match */
3510
3511 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3512 reqbyte = REQ_NONE;
3513 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3514 }
3515
3516 /* If lookbehind, check that this branch matches a fixed-length string,
3517 and put the length into the OP_REVERSE item. Temporarily mark the end of
3518 the branch with OP_END. */
3519
3520 if (lookbehind)
3521 {
3522 int length;
3523 *code = OP_END;
3524 length = find_fixedlength(last_branch, options);
3525 DPRINTF(("fixed length = %d\n", length));
3526 if (length < 0)
3527 {
3528 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3529 *ptrptr = ptr;
3530 return FALSE;
3531 }
3532 PUT(reverse_count, 0, length);
3533 }
3534
3535 /* Reached end of expression, either ')' or end of pattern. Go back through
3536 the alternative branches and reverse the chain of offsets, with the field in
3537 the BRA item now becoming an offset to the first alternative. If there are
3538 no alternatives, it points to the end of the group. The length in the
3539 terminating ket is always the length of the whole bracketed item. If any of
3540 the ims options were changed inside the group, compile a resetting op-code
3541 following, except at the very end of the pattern. Return leaving the pointer
3542 at the terminating char. */
3543
3544 if (*ptr != '|')
3545 {
3546 int length = code - last_branch;
3547 do
3548 {
3549 int prev_length = GET(last_branch, 1);
3550 PUT(last_branch, 1, length);
3551 length = prev_length;
3552 last_branch -= length;
3553 }
3554 while (length > 0);
3555
3556 /* Fill in the ket */
3557
3558 *code = OP_KET;
3559 PUT(code, 1, code - start_bracket);
3560 code += 1 + LINK_SIZE;
3561
3562 /* Resetting option if needed */
3563
3564 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3565 {
3566 *code++ = OP_OPT;
3567 *code++ = oldims;
3568 }
3569
3570 /* Set values to pass back */
3571
3572 *codeptr = code;
3573 *ptrptr = ptr;
3574 *firstbyteptr = firstbyte;
3575 *reqbyteptr = reqbyte;
3576 return TRUE;
3577 }
3578
3579 /* Another branch follows; insert an "or" node. Its length field points back
3580 to the previous branch while the bracket remains open. At the end the chain
3581 is reversed. It's done like this so that the start of the bracket has a
3582 zero offset until it is closed, making it possible to detect recursion. */
3583
3584 *code = OP_ALT;
3585 PUT(code, 1, code - last_branch);
3586 bc.current = last_branch = code;
3587 code += 1 + LINK_SIZE;
3588 ptr++;
3589 }
3590 /* Control never reaches here */
3591 }
3592
3593
3594
3595
3596 /*************************************************
3597 * Check for anchored expression *
3598 *************************************************/
3599
3600 /* Try to find out if this is an anchored regular expression. Consider each
3601 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3602 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3603 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3604 counts, since OP_CIRC can match in the middle.
3605
3606 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3607 This is the code for \G, which means "match at start of match position, taking
3608 into account the match offset".
3609
3610 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3611 because that will try the rest of the pattern at all possible matching points,
3612 so there is no point trying again.... er ....
3613
3614 .... except when the .* appears inside capturing parentheses, and there is a
3615 subsequent back reference to those parentheses. We haven't enough information
3616 to catch that case precisely.
3617
3618 At first, the best we could do was to detect when .* was in capturing brackets
3619 and the highest back reference was greater than or equal to that level.
3620 However, by keeping a bitmap of the first 31 back references, we can catch some
3621 of the more common cases more precisely.
3622
3623 Arguments:
3624 code points to start of expression (the bracket)
3625 options points to the options setting
3626 bracket_map a bitmap of which brackets we are inside while testing; this
3627 handles up to substring 31; after that we just have to take
3628 the less precise approach
3629 backref_map the back reference bitmap
3630
3631 Returns: TRUE or FALSE
3632 */
3633
3634 static BOOL
3635 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3636 unsigned int backref_map)
3637 {
3638 do {
3639 const uschar *scode =
3640 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3641 register int op = *scode;
3642
3643 /* Capturing brackets */
3644
3645 if (op > OP_BRA)
3646 {
3647 int new_map;
3648 op -= OP_BRA;
3649 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3650 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3651 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3652 }
3653
3654 /* Other brackets */
3655
3656 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3657 {
3658 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3659 }
3660
3661 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3662 are or may be referenced. */
3663
3664 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3665 (*options & PCRE_DOTALL) != 0)
3666 {
3667 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3668 }
3669
3670 /* Check for explicit anchoring */
3671
3672 else if (op != OP_SOD && op != OP_SOM &&
3673 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3674 return FALSE;
3675 code += GET(code, 1);
3676 }
3677 while (*code == OP_ALT); /* Loop for each alternative */
3678 return TRUE;
3679 }
3680
3681
3682
3683 /*************************************************
3684 * Check for starting with ^ or .* *
3685 *************************************************/
3686
3687 /* This is called to find out if every branch starts with ^ or .* so that
3688 "first char" processing can be done to speed things up in multiline
3689 matching and for non-DOTALL patterns that start with .* (which must start at
3690 the beginning or after \n). As in the case of is_anchored() (see above), we
3691 have to take account of back references to capturing brackets that contain .*
3692 because in that case we can't make the assumption.
3693
3694 Arguments:
3695 code points to start of expression (the bracket)
3696 bracket_map a bitmap of which brackets we are inside while testing; this
3697 handles up to substring 31; after that we just have to take
3698 the less precise approach
3699 backref_map the back reference bitmap
3700
3701 Returns: TRUE or FALSE
3702 */
3703
3704 static BOOL
3705 is_startline(const uschar *code, unsigned int bracket_map,
3706 unsigned int backref_map)
3707 {
3708 do {
3709 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3710 FALSE);
3711 register int op = *scode;
3712
3713 /* Capturing brackets */
3714
3715 if (op > OP_BRA)
3716 {
3717 int new_map;
3718 op -= OP_BRA;
3719 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3720 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3721 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3722 }
3723
3724 /* Other brackets */
3725
3726 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3727 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3728
3729 /* .* means "start at start or after \n" if it isn't in brackets that
3730 may be referenced. */
3731
3732 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3733 {
3734 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3735 }
3736
3737 /* Check for explicit circumflex */
3738
3739 else if (op != OP_CIRC) return FALSE;
3740
3741 /* Move on to the next alternative */
3742
3743 code += GET(code, 1);
3744 }
3745 while (*code == OP_ALT); /* Loop for each alternative */
3746 return TRUE;
3747 }
3748
3749
3750
3751 /*************************************************
3752 * Check for asserted fixed first char *
3753 *************************************************/
3754
3755 /* During compilation, the "first char" settings from forward assertions are
3756 discarded, because they can cause conflicts with actual literals that follow.
3757 However, if we end up without a first char setting for an unanchored pattern,
3758 it is worth scanning the regex to see if there is an initial asserted first
3759 char. If all branches start with the same asserted char, or with a bracket all
3760 of whose alternatives start with the same asserted char (recurse ad lib), then
3761 we return that char, otherwise -1.
3762
3763 Arguments:
3764 code points to start of expression (the bracket)
3765 options pointer to the options (used to check casing changes)
3766 inassert TRUE if in an assertion
3767
3768 Returns: -1 or the fixed first char
3769 */
3770
3771 static int
3772 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3773 {
3774 register int c = -1;
3775 do {
3776 int d;
3777 const uschar *scode =
3778 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3779 register int op = *scode;
3780
3781 if (op >= OP_BRA) op = OP_BRA;
3782
3783 switch(op)
3784 {
3785 default:
3786 return -1;
3787
3788 case OP_BRA:
3789 case OP_ASSERT:
3790 case OP_ONCE:
3791 case OP_COND:
3792 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3793 return -1;
3794 if (c < 0) c = d; else if (c != d) return -1;
3795 break;
3796
3797 case OP_EXACT: /* Fall through */
3798 scode += 2;
3799
3800 case OP_CHAR:
3801 case OP_CHARNC:
3802 case OP_PLUS:
3803 case OP_MINPLUS:
3804 if (!inassert) return -1;
3805 if (c < 0)
3806 {
3807 c = scode[1];
3808 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3809 }
3810 else if (c != scode[1]) return -1;
3811 break;
3812 }
3813
3814 code += GET(code, 1);
3815 }
3816 while (*code == OP_ALT);
3817 return c;
3818 }
3819
3820
3821
3822 /*************************************************
3823 * Compile a Regular Expression *
3824 *************************************************/
3825
3826 /* This function takes a string and returns a pointer to a block of store
3827 holding a compiled version of the expression. The original API for this
3828 function had no error code return variable; it is retained for backwards
3829 compatibility. The new function is given a new name.
3830
3831 Arguments:
3832 pattern the regular expression
3833 options various option bits
3834 errorcodeptr pointer to error code variable (pcre_compile2() only)
3835 can be NULL if you don't want a code value
3836 errorptr pointer to pointer to error text
3837 erroroffset ptr offset in pattern where error was detected
3838 tables pointer to character tables or NULL
3839
3840 Returns: pointer to compiled data block, or NULL on error,
3841 with errorptr and erroroffset set
3842 */
3843
3844 EXPORT pcre *
3845 pcre_compile(const char *pattern, int options, const char **errorptr,
3846 int *erroroffset, const unsigned char *tables)
3847 {
3848 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3849 }
3850
3851
3852 EXPORT pcre *
3853 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3854 const char **errorptr, int *erroroffset, const unsigned char *tables)
3855 {
3856 real_pcre *re;
3857 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3858 int c, firstbyte, reqbyte;
3859 int bracount = 0;
3860 int branch_extra = 0;
3861 int branch_newextra;
3862 int item_count = -1;
3863 int name_count = 0;
3864 int max_name_size = 0;
3865 int lastitemlength = 0;
3866 int errorcode = 0;
3867 #ifdef SUPPORT_UTF8
3868 BOOL utf8;
3869 BOOL class_utf8;
3870 #endif
3871 BOOL inescq = FALSE;
3872 BOOL capturing;
3873 unsigned int brastackptr = 0;
3874 size_t size;
3875 uschar *code;
3876 const uschar *codestart;
3877 const uschar *ptr;
3878 compile_data compile_block;
3879 int brastack[BRASTACK_SIZE];
3880 uschar bralenstack[BRASTACK_SIZE];
3881
3882 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3883 can do is just return NULL, but we can set a code value if there is a code
3884 pointer. */
3885
3886 if (errorptr == NULL)
3887 {
3888 if (errorcodeptr != NULL) *errorcodeptr = 99;
3889 return NULL;
3890 }
3891
3892 *errorptr = NULL;
3893 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3894
3895 /* However, we can give a message for this error */
3896
3897 if (erroroffset == NULL)
3898 {
3899 errorcode = ERR16;
3900 goto PCRE_EARLY_ERROR_RETURN;
3901 }
3902
3903 *erroroffset = 0;
3904
3905 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3906
3907 #ifdef SUPPORT_UTF8
3908 utf8 = (options & PCRE_UTF8) != 0;
3909 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3910 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3911 {
3912 errorcode = ERR44;
3913 goto PCRE_EARLY_ERROR_RETURN;
3914 }
3915 #else
3916 if ((options & PCRE_UTF8) != 0)
3917 {
3918 errorcode = ERR32;
3919 goto PCRE_EARLY_ERROR_RETURN;
3920 }
3921 #endif
3922
3923 if ((options & ~PUBLIC_OPTIONS) != 0)
3924 {
3925 errorcode = ERR17;
3926 goto PCRE_EARLY_ERROR_RETURN;
3927 }
3928
3929 /* Set up pointers to the individual character tables */
3930
3931 if (tables == NULL) tables = _pcre_default_tables;
3932 compile_block.lcc = tables + lcc_offset;
3933 compile_block.fcc = tables + fcc_offset;
3934 compile_block.cbits = tables + cbits_offset;
3935 compile_block.ctypes = tables + ctypes_offset;
3936
3937 /* Maximum back reference and backref bitmap. This is updated for numeric
3938 references during the first pass, but for named references during the actual
3939 compile pass. The bitmap records up to 31 back references to help in deciding
3940 whether (.*) can be treated as anchored or not. */
3941
3942 compile_block.top_backref = 0;
3943 compile_block.backref_map = 0;
3944
3945 /* Reflect pattern for debugging output */
3946
3947 DPRINTF(("------------------------------------------------------------------\n"));
3948 DPRINTF(("%s\n", pattern));
3949
3950 /* The first thing to do is to make a pass over the pattern to compute the
3951 amount of store required to hold the compiled code. This does not have to be
3952 perfect as long as errors are overestimates. At the same time we can detect any
3953 flag settings right at the start, and extract them. Make an attempt to correct
3954 for any counted white space if an "extended" flag setting appears late in the
3955 pattern. We can't be so clever for #-comments. */
3956
3957 ptr = (const uschar *)(pattern - 1);
3958 while ((c = *(++ptr)) != 0)
3959 {
3960 int min, max;
3961 int class_optcount;
3962 int bracket_length;
3963 int duplength;
3964
3965 /* If we are inside a \Q...\E sequence, all chars are literal */
3966
3967 if (inescq)
3968 {
3969 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3970 goto NORMAL_CHAR;
3971 }
3972
3973 /* Otherwise, first check for ignored whitespace and comments */
3974
3975 if ((options & PCRE_EXTENDED) != 0)
3976 {
3977 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3978 if (c == '#')
3979 {
3980 /* The space before the ; is to avoid a warning on a silly compiler
3981 on the Macintosh. */
3982 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3983 if (c == 0) break;
3984 continue;
3985 }
3986 }
3987
3988 item_count++; /* Is zero for the first non-comment item */
3989
3990 /* Allow space for auto callout before every item except quantifiers. */
3991
3992 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3993 c != '*' && c != '+' && c != '?' &&
3994 (c != '{' || !is_counted_repeat(ptr + 1)))
3995 length += 2 + 2*LINK_SIZE;
3996
3997 switch(c)
3998 {
3999 /* A backslashed item may be an escaped data character or it may be a
4000 character type. */
4001
4002 case '\\':
4003 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4004 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4005
4006 lastitemlength = 1; /* Default length of last item for repeats */
4007
4008 if (c >= 0) /* Data character */
4009 {
4010 length += 2; /* For a one-byte character */
4011
4012 #ifdef SUPPORT_UTF8
4013 if (utf8 && c > 127)
4014 {
4015 int i;
4016 for (i = 0; i < _pcre_utf8_table1_size; i++)
4017 if (c <= _pcre_utf8_table1[i]) break;
4018 length += i;
4019 lastitemlength += i;
4020 }
4021 #endif
4022
4023 continue;
4024 }
4025
4026 /* If \Q, enter "literal" mode */
4027
4028 if (-c == ESC_Q)
4029 {
4030 inescq = TRUE;
4031 continue;
4032 }
4033
4034 /* \X is supported only if Unicode property support is compiled */
4035
4036 #ifndef SUPPORT_UCP
4037 if (-c == ESC_X)
4038 {
4039 errorcode = ERR45;
4040 goto PCRE_ERROR_RETURN;
4041 }
4042 #endif
4043
4044 /* \P and \p are for Unicode properties, but only when the support has
4045 been compiled. Each item needs 2 bytes. */
4046
4047 else if (-c == ESC_P || -c == ESC_p)
4048 {
4049 #ifdef SUPPORT_UCP
4050 BOOL negated;
4051 length += 2;
4052 lastitemlength = 2;
4053 if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4054 continue;
4055 #else
4056 errorcode = ERR45;
4057 goto PCRE_ERROR_RETURN;
4058 #endif
4059 }
4060
4061 /* Other escapes need one byte */
4062
4063 length++;
4064
4065 /* A back reference needs an additional 2 bytes, plus either one or 5
4066 bytes for a repeat. We also need to keep the value of the highest
4067 back reference. */
4068
4069 if (c <= -ESC_REF)
4070 {
4071 int refnum = -c - ESC_REF;
4072 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4073 if (refnum > compile_block.top_backref)
4074 compile_block.top_backref = refnum;
4075 length += 2; /* For single back reference */
4076 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4077 {
4078 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4079 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4080 if ((min == 0 && (max == 1 || max == -1)) ||
4081 (min == 1 && max == -1))
4082 length++;
4083 else length += 5;
4084 if (ptr[1] == '?') ptr++;
4085 }
4086 }
4087 continue;
4088
4089 case '^': /* Single-byte metacharacters */
4090 case '.':
4091 case '$':
4092 length++;
4093 lastitemlength = 1;
4094 continue;
4095
4096 case '*': /* These repeats won't be after brackets; */
4097 case '+': /* those are handled separately */
4098 case '?':
4099 length++;
4100 goto POSESSIVE; /* A few lines below */
4101
4102 /* This covers the cases of braced repeats after a single char, metachar,
4103 class, or back reference. */
4104
4105 case '{':
4106 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4107 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4108 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4109
4110 /* These special cases just insert one extra opcode */
4111
4112 if ((min == 0 && (max == 1 || max == -1)) ||
4113 (min == 1 && max == -1))
4114 length++;
4115
4116 /* These cases might insert additional copies of a preceding character. */
4117
4118 else
4119 {
4120 if (min != 1)
4121 {
4122 length -= lastitemlength; /* Uncount the original char or metachar */
4123 if (min > 0) length += 3 + lastitemlength;
4124 }
4125 length += lastitemlength + ((max > 0)? 3 : 1);
4126 }
4127
4128 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4129
4130 POSESSIVE: /* Test for possessive quantifier */
4131 if (ptr[1] == '+')
4132 {
4133 ptr++;
4134 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4135 }
4136 continue;
4137
4138 /* An alternation contains an offset to the next branch or ket. If any ims
4139 options changed in the previous branch(es), and/or if we are in a
4140 lookbehind assertion, extra space will be needed at the start of the
4141 branch. This is handled by branch_extra. */
4142
4143 case '|':
4144 length += 1 + LINK_SIZE + branch_extra;
4145 continue;
4146
4147 /* A character class uses 33 characters provided that all the character
4148 values are less than 256. Otherwise, it uses a bit map for low valued
4149 characters, and individual items for others. Don't worry about character
4150 types that aren't allowed in classes - they'll get picked up during the
4151 compile. A character class that contains only one single-byte character
4152 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4153 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4154
4155 case '[':
4156 if (*(++ptr) == '^')
4157 {
4158 class_optcount = 10; /* Greater than one */
4159 ptr++;
4160 }
4161 else class_optcount = 0;
4162
4163 #ifdef SUPPORT_UTF8
4164 class_utf8 = FALSE;
4165 #endif
4166
4167 /* Written as a "do" so that an initial ']' is taken as data */
4168
4169 if (*ptr != 0) do
4170 {
4171 /* Inside \Q...\E everything is literal except \E */
4172
4173 if (inescq)
4174 {
4175 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4176 inescq = FALSE;
4177 ptr += 1;
4178 continue;
4179 }
4180
4181 /* Outside \Q...\E, check for escapes */
4182
4183 if (*ptr == '\\')
4184 {
4185 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4186 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4187
4188 /* \b is backspace inside a class; \X is literal */
4189
4190 if (-c == ESC_b) c = '\b';
4191 else if (-c == ESC_X) c = 'X';
4192
4193 /* \Q enters quoting mode */
4194
4195 else if (-c == ESC_Q)
4196 {
4197 inescq = TRUE;
4198 continue;
4199 }
4200
4201 /* Handle escapes that turn into characters */
4202
4203 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4204
4205 /* Escapes that are meta-things. The normal ones just affect the
4206 bit map, but Unicode properties require an XCLASS extended item. */
4207
4208 else
4209 {
4210 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4211 #ifdef SUPPORT_UTF8
4212 if (-c == ESC_p || -c == ESC_P)
4213 {
4214 if (!class_utf8)
4215 {
4216 class_utf8 = TRUE;
4217 length += LINK_SIZE + 2;
4218 }
4219 length += 2;
4220 }
4221 #endif
4222 }
4223 }
4224
4225 /* Check the syntax for POSIX stuff. The bits we actually handle are
4226 checked during the real compile phase. */
4227
4228 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4229 {
4230 ptr++;
4231 class_optcount = 10; /* Make sure > 1 */
4232 }
4233
4234 /* Anything else increments the possible optimization count. We have to
4235 detect ranges here so that we can compute the number of extra ranges for
4236 caseless wide characters when UCP support is available. If there are wide
4237 characters, we are going to have to use an XCLASS, even for single
4238 characters. */
4239
4240 else
4241 {
4242 int d;
4243
4244 GET_ONE_CHARACTER:
4245
4246 #ifdef SUPPORT_UTF8
4247 if (utf8)
4248 {
4249 int extra = 0;
4250 GETCHARLEN(c, ptr, extra);
4251 ptr += extra;
4252 }
4253 else c = *ptr;
4254 #else
4255 c = *ptr;
4256 #endif
4257
4258 /* Come here from handling \ above when it escapes to a char value */
4259
4260 NON_SPECIAL_CHARACTER:
4261 class_optcount++;
4262
4263 d = -1;
4264 if (ptr[1] == '-')
4265 {
4266 uschar const *hyptr = ptr++;
4267 if (ptr[1] == '\\')
4268 {
4269 ptr++;
4270 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4271 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4272 if (-d == ESC_b) d = '\b'; /* backspace */
4273 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4274 }
4275 else if (ptr[1] != 0 && ptr[1] != ']')
4276 {
4277 ptr++;
4278 #ifdef SUPPORT_UTF8
4279 if (utf8)
4280 {
4281 int extra = 0;
4282 GETCHARLEN(d, ptr, extra);
4283 ptr += extra;
4284 }
4285 else
4286 #endif
4287 d = *ptr;
4288 }
4289 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4290 }
4291
4292 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4293 127 for caseless matching, we will need to use an XCLASS. */
4294
4295 if (d >= 0)
4296 {
4297 class_optcount = 10; /* Ensure > 1 */
4298 if (d < c)
4299 {
4300 errorcode = ERR8;
4301 goto PCRE_ERROR_RETURN;
4302 }
4303
4304 #ifdef SUPPORT_UTF8
4305 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4306 {
4307 uschar buffer[6];
4308 if (!class_utf8) /* Allow for XCLASS overhead */
4309 {
4310 class_utf8 = TRUE;
4311 length += LINK_SIZE + 2;
4312 }
4313
4314 #ifdef SUPPORT_UCP
4315 /* If we have UCP support, find out how many extra ranges are
4316 needed to map the other case of characters within this range. We
4317 have to mimic the range optimization here, because extending the
4318 range upwards might push d over a boundary that makes it use
4319 another byte in the UTF-8 representation. */
4320
4321 if ((options & PCRE_CASELESS) != 0)
4322 {
4323 int occ, ocd;
4324 int cc = c;
4325 int origd = d;
4326 while (get_othercase_range(&cc, origd, &occ, &ocd))
4327 {
4328 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4329
4330 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4331 { /* if there is overlap, */
4332 c = occ; /* noting that if occ < c */
4333 continue; /* we can't have ocd > d */
4334 } /* because a subrange is */
4335 if (ocd > d && occ <= d + 1) /* always shorter than */
4336 { /* the basic range. */
4337 d = ocd;
4338 continue;
4339 }
4340
4341 /* An extra item is needed */
4342
4343 length += 1 + _pcre_ord2utf8(occ, buffer) +
4344 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4345 }
4346 }
4347 #endif /* SUPPORT_UCP */
4348
4349 /* The length of the (possibly extended) range */
4350
4351 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4352 }
4353 #endif /* SUPPORT_UTF8 */
4354
4355 }
4356
4357 /* We have a single character. There is nothing to be done unless we
4358 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4359 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4360 support. */
4361
4362 else
4363 {
4364 #ifdef SUPPORT_UTF8
4365 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4366 {
4367 uschar buffer[6];
4368 class_optcount = 10; /* Ensure > 1 */
4369 if (!class_utf8) /* Allow for XCLASS overhead */
4370 {
4371 class_utf8 = TRUE;
4372 length += LINK_SIZE + 2;
4373 }
4374 #ifdef SUPPORT_UCP
4375 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4376 (1 + _pcre_ord2utf8(c, buffer));
4377 #else /* SUPPORT_UCP */
4378 length += 1 + _pcre_ord2utf8(c, buffer);
4379 #endif /* SUPPORT_UCP */
4380 }
4381 #endif /* SUPPORT_UTF8 */
4382 }
4383 }
4384 }
4385 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4386
4387 if (*ptr == 0) /* Missing terminating ']' */
4388 {
4389 errorcode = ERR6;
4390 goto PCRE_ERROR_RETURN;
4391 }
4392
4393 /* We can optimize when there was only one optimizable character. Repeats
4394 for positive and negated single one-byte chars are handled by the general
4395 code. Here, we handle repeats for the class opcodes. */
4396
4397 if (class_optcount == 1) length += 3; else
4398 {
4399 length += 33;
4400
4401 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4402 we also need extra for wrapping the whole thing in a sub-pattern. */
4403
4404 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4405 {
4406 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4407 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4408 if ((min == 0 && (max == 1 || max == -1)) ||
4409 (min == 1 && max == -1))
4410 length++;
4411 else length += 5;
4412 if (ptr[1] == '+')
4413 {
4414 ptr++;
4415 length += 2 + 2*LINK_SIZE;
4416 }
4417 else if (ptr[1] == '?') ptr++;
4418 }
4419 }
4420 continue;
4421
4422 /* Brackets may be genuine groups or special things */
4423
4424 case '(':
4425 branch_newextra = 0;
4426 bracket_length = 1 + LINK_SIZE;
4427 capturing = FALSE;
4428
4429 /* Handle special forms of bracket, which all start (? */
4430
4431 if (ptr[1] == '?')
4432 {
4433 int set, unset;
4434 int *optset;
4435
4436 switch (c = ptr[2])
4437 {
4438 /* Skip over comments entirely */
4439 case '#':
4440 ptr += 3;
4441 while (*ptr != 0 && *ptr != ')') ptr++;
4442 if (*ptr == 0)
4443 {
4444 errorcode = ERR18;
4445 goto PCRE_ERROR_RETURN;
4446 }
4447 continue;
4448
4449 /* Non-referencing groups and lookaheads just move the pointer on, and
4450 then behave like a non-special bracket, except that they don't increment
4451 the count of extracting brackets. Ditto for the "once only" bracket,
4452 which is in Perl from version 5.005. */
4453
4454 case ':':
4455 case '=':
4456 case '!':
4457 case '>':
4458 ptr += 2;
4459 break;
4460
4461 /* (?R) specifies a recursive call to the regex, which is an extension
4462 to provide the facility which can be obtained by (?p{perl-code}) in
4463 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4464
4465 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4466 the appropriate numbered brackets. This includes both recursive and
4467 non-recursive calls. (?R) is now synonymous with (?0). */
4468
4469 case 'R':
4470 ptr++;
4471
4472 case '0': case '1': case '2': case '3': case '4':
4473 case '5': case '6': case '7': case '8': case '9':
4474 ptr += 2;
4475 if (c != 'R')
4476 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4477 if (*ptr != ')')
4478 {
4479 errorcode = ERR29;
4480 goto PCRE_ERROR_RETURN;
4481 }
4482 length += 1 + LINK_SIZE;
4483
4484 /* If this item is quantified, it will get wrapped inside brackets so
4485 as to use the code for quantified brackets. We jump down and use the
4486 code that handles this for real brackets. */
4487
4488 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4489 {
4490 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4491 duplength = 5 + 3 * LINK_SIZE;
4492 goto HANDLE_QUANTIFIED_BRACKETS;
4493 }
4494 continue;
4495
4496 /* (?C) is an extension which provides "callout" - to provide a bit of
4497 the functionality of the Perl (?{...}) feature. An optional number may
4498 follow (default is zero). */
4499
4500 case 'C':
4501 ptr += 2;
4502 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4503 if (*ptr != ')')
4504 {
4505 errorcode = ERR39;
4506 goto PCRE_ERROR_RETURN;
4507 }
4508 length += 2 + 2*LINK_SIZE;
4509 continue;
4510
4511 /* Named subpatterns are an extension copied from Python */
4512
4513 case 'P':
4514 ptr += 3;
4515
4516 /* Handle the definition of a named subpattern */
4517
4518 if (*ptr == '<')
4519 {
4520 const uschar *p; /* Don't amalgamate; some compilers */
4521 p = ++ptr; /* grumble at autoincrement in declaration */
4522 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4523 if (*ptr != '>')
4524 {
4525 errorcode = ERR42;
4526 goto PCRE_ERROR_RETURN;
4527 }
4528 name_count++;
4529 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4530 capturing = TRUE; /* Named parentheses are always capturing */
4531 break;
4532 }
4533
4534 /* Handle back references and recursive calls to named subpatterns */
4535
4536 if (*ptr == '=' || *ptr == '>')
4537 {
4538 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4539 if (*ptr != ')')
4540 {
4541 errorcode = ERR42;
4542 goto PCRE_ERROR_RETURN;
4543 }
4544 break;
4545 }
4546
4547 /* Unknown character after (?P */
4548
4549 errorcode = ERR41;
4550 goto PCRE_ERROR_RETURN;
4551
4552 /* Lookbehinds are in Perl from version 5.005 */
4553
4554 case '<':
4555 ptr += 3;
4556 if (*ptr == '=' || *ptr == '!')
4557 {
4558 branch_newextra = 1 + LINK_SIZE;
4559 length += 1 + LINK_SIZE; /* For the first branch */
4560 break;
4561 }
4562 errorcode = ERR24;
4563 goto PCRE_ERROR_RETURN;
4564
4565 /* Conditionals are in Perl from version 5.005. The bracket must either
4566 be followed by a number (for bracket reference) or by an assertion
4567 group, or (a PCRE extension) by 'R' for a recursion test. */
4568
4569 case '(':
4570 if (ptr[3] == 'R' && ptr[4] == ')')
4571 {
4572 ptr += 4;
4573 length += 3;
4574 }
4575 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4576 {
4577 ptr += 4;
4578 length += 3;
4579 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4580 if (*ptr != ')')
4581 {
4582 errorcode = ERR26;
4583 goto PCRE_ERROR_RETURN;
4584 }
4585 }
4586 else /* An assertion must follow */
4587 {
4588 ptr++; /* Can treat like ':' as far as spacing is concerned */
4589 if (ptr[2] != '?' ||
4590 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4591 {
4592 ptr += 2; /* To get right offset in message */
4593 errorcode = ERR28;
4594 goto PCRE_ERROR_RETURN;
4595 }
4596 }
4597 break;
4598
4599 /* Else loop checking valid options until ) is met. Anything else is an
4600 error. If we are without any brackets, i.e. at top level, the settings
4601 act as if specified in the options, so massage the options immediately.
4602 This is for backward compatibility with Perl 5.004. */
4603
4604 default:
4605 set = unset = 0;
4606 optset = &set;
4607 ptr += 2;
4608
4609 for (;; ptr++)
4610 {
4611 c = *ptr;
4612 switch (c)
4613 {
4614 case 'i':
4615 *optset |= PCRE_CASELESS;
4616 continue;
4617
4618 case 'm':
4619 *optset |= PCRE_MULTILINE;
4620 continue;
4621
4622 case 's':
4623 *optset |= PCRE_DOTALL;
4624 continue;
4625
4626 case 'x':
4627 *optset |= PCRE_EXTENDED;
4628 continue;
4629
4630 case 'X':
4631 *optset |= PCRE_EXTRA;
4632 continue;
4633
4634 case 'U':
4635 *optset |= PCRE_UNGREEDY;
4636 continue;
4637
4638 case '-':
4639 optset = &unset;
4640 continue;
4641
4642 /* A termination by ')' indicates an options-setting-only item; if
4643 this is at the very start of the pattern (indicated by item_count
4644 being zero), we use it to set the global options. This is helpful
4645 when analyzing the pattern for first characters, etc. Otherwise
4646 nothing is done here and it is handled during the compiling
4647 process.
4648
4649 We allow for more than one options setting at the start. If such
4650 settings do not change the existing options, nothing is compiled.
4651 However, we must leave space just in case something is compiled.
4652 This can happen for pathological sequences such as (?i)(?-i)
4653 because the global options will end up with -i set. The space is
4654 small and not significant. (Before I did this there was a reported
4655 bug with (?i)(?-i) in a machine-generated pattern.)
4656
4657 [Historical note: Up to Perl 5.8, options settings at top level
4658 were always global settings, wherever they appeared in the pattern.
4659 That is, they were equivalent to an external setting. From 5.8
4660 onwards, they apply only to what follows (which is what you might
4661 expect).] */
4662
4663 case ')':
4664 if (item_count == 0)
4665 {
4666 options = (options | set) & (~unset);
4667 set = unset = 0; /* To save length */
4668 item_count--; /* To allow for several */
4669 length += 2;
4670 }
4671
4672 /* Fall through */
4673
4674 /* A termination by ':' indicates the start of a nested group with
4675 the given options set. This is again handled at compile time, but
4676 we must allow for compiled space if any of the ims options are
4677 set. We also have to allow for resetting space at the end of
4678 the group, which is why 4 is added to the length and not just 2.
4679 If there are several changes of options within the same group, this
4680 will lead to an over-estimate on the length, but this shouldn't
4681 matter very much. We also have to allow for resetting options at
4682 the start of any alternations, which we do by setting
4683 branch_newextra to 2. Finally, we record whether the case-dependent
4684 flag ever changes within the regex. This is used by the "required
4685 character" code. */
4686
4687 case ':':
4688 if (((set|unset) & PCRE_IMS) != 0)
4689 {
4690 length += 4;
4691 branch_newextra = 2;
4692 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4693 }
4694 goto END_OPTIONS;
4695
4696 /* Unrecognized option character */
4697
4698 default:
4699 errorcode = ERR12;
4700 goto PCRE_ERROR_RETURN;
4701 }
4702 }
4703
4704 /* If we hit a closing bracket, that's it - this is a freestanding
4705 option-setting. We need to ensure that branch_extra is updated if
4706 necessary. The only values branch_newextra can have here are 0 or 2.
4707 If the value is 2, then branch_extra must either be 2 or 5, depending
4708 on whether this is a lookbehind group or not. */
4709
4710 END_OPTIONS:
4711 if (c == ')')
4712 {
4713 if (branch_newextra == 2 &&
4714 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4715 branch_extra += branch_newextra;
4716 continue;
4717 }
4718
4719 /* If options were terminated by ':' control comes here. This is a
4720 non-capturing group with an options change. There is nothing more that
4721 needs to be done because "capturing" is already set FALSE by default;
4722 we can just fall through. */
4723
4724 }
4725 }
4726
4727 /* Ordinary parentheses, not followed by '?', are capturing unless
4728 PCRE_NO_AUTO_CAPTURE is set. */
4729
4730 else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
4731
4732 /* Capturing brackets must be counted so we can process escapes in a
4733 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
4734 an additional 3 bytes of memory per capturing bracket. */
4735
4736 if (capturing)
4737 {
4738 bracount++;
4739 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4740 }
4741
4742 /* Save length for computing whole length at end if there's a repeat that
4743 requires duplication of the group. Also save the current value of
4744 branch_extra, and start the new group with the new value. If non-zero, this
4745 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4746
4747 if (brastackptr >= sizeof(brastack)/sizeof(int))
4748 {
4749 errorcode = ERR19;
4750 goto PCRE_ERROR_RETURN;
4751 }
4752
4753 bralenstack[brastackptr] = branch_extra;
4754 branch_extra = branch_newextra;
4755
4756 brastack[brastackptr++] = length;
4757 length += bracket_length;
4758 continue;
4759
4760 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4761 have to replicate this bracket up to that many times. If brastackptr is
4762 0 this is an unmatched bracket which will generate an error, but take care
4763 not to try to access brastack[-1] when computing the length and restoring
4764 the branch_extra value. */
4765
4766 case ')':
4767 length += 1 + LINK_SIZE;
4768 if (brastackptr > 0)
4769 {
4770 duplength = length - brastack[--brastackptr];
4771 branch_extra = bralenstack[brastackptr];
4772 }
4773 else duplength = 0;
4774
4775 /* The following code is also used when a recursion such as (?3) is
4776 followed by a quantifier, because in that case, it has to be wrapped inside
4777 brackets so that the quantifier works. The value of duplength must be
4778 set before arrival. */
4779
4780 HANDLE_QUANTIFIED_BRACKETS:
4781
4782 /* Leave ptr at the final char; for read_repeat_counts this happens
4783 automatically; for the others we need an increment. */
4784
4785 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4786 {
4787 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4788 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4789 }
4790 else if (c == '*') { min = 0; max = -1; ptr++; }
4791 else if (c == '+') { min = 1; max = -1; ptr++; }
4792 else if (c == '?') { min = 0; max = 1; ptr++; }
4793 else { min = 1; max = 1; }
4794
4795 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4796 group, and if the maximum is greater than zero, we have to replicate
4797 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4798 bracket set. */
4799
4800 if (min == 0)
4801 {
4802 length++;
4803 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4804 }
4805
4806 /* When the minimum is greater than zero, we have to replicate up to
4807 minval-1 times, with no additions required in the copies. Then, if there
4808 is a limited maximum we have to replicate up to maxval-1 times allowing
4809 for a BRAZERO item before each optional copy and nesting brackets for all
4810 but one of the optional copies. */
4811
4812 else
4813 {
4814 length += (min - 1) * duplength;
4815 if (max > min) /* Need this test as max=-1 means no limit */
4816 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4817 - (2 + 2*LINK_SIZE);
4818 }
4819
4820 /* Allow space for once brackets for "possessive quantifier" */
4821
4822 if (ptr[1] == '+')
4823 {
4824 ptr++;
4825 length += 2 + 2*LINK_SIZE;
4826 }
4827 continue;
4828
4829 /* Non-special character. It won't be space or # in extended mode, so it is
4830 always a genuine character. If we are in a \Q...\E sequence, check for the
4831 end; if not, we have a literal. */
4832
4833 default:
4834 NORMAL_CHAR:
4835
4836 if (inescq && c == '\\' && ptr[1] == 'E')
4837 {
4838 inescq = FALSE;
4839 ptr++;
4840 continue;
4841 }
4842
4843 length += 2; /* For a one-byte character */
4844 lastitemlength = 1; /* Default length of last item for repeats */
4845
4846 /* In UTF-8 mode, check for additional bytes. */
4847
4848 #ifdef SUPPORT_UTF8
4849 if (utf8 && (c & 0xc0) == 0xc0)
4850 {
4851 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
4852 { /* because the end is marked */
4853 lastitemlength++; /* by a zero byte. */
4854 length++;
4855 ptr++;
4856 }
4857 }
4858 #endif
4859
4860 continue;
4861 }
4862 }
4863
4864 length += 2 + LINK_SIZE; /* For final KET and END */
4865
4866 if ((options & PCRE_AUTO_CALLOUT) != 0)
4867 length += 2 + 2*LINK_SIZE; /* For final callout */
4868
4869 if (length > MAX_PATTERN_SIZE)
4870 {
4871 errorcode = ERR20;
4872 goto PCRE_EARLY_ERROR_RETURN;
4873 }
4874
4875 /* Compute the size of data block needed and get it, either from malloc or
4876 externally provided function. */
4877
4878 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4879 re = (real_pcre *)(pcre_malloc)(size);
4880
4881 if (re == NULL)
4882 {
4883 errorcode = ERR21;
4884 goto PCRE_EARLY_ERROR_RETURN;
4885 }
4886
4887 /* Put in the magic number, and save the sizes, options, and character table
4888 pointer. NULL is used for the default character tables. The nullpad field is at
4889 the end; it's there to help in the case when a regex compiled on a system with
4890 4-byte pointers is run on another with 8-byte pointers. */
4891
4892 re->magic_number = MAGIC_NUMBER;
4893 re->size = size;
4894 re->options = options;
4895 re->dummy1 = 0;
4896 re->name_table_offset = sizeof(real_pcre);
4897 re->name_entry_size = max_name_size + 3;
4898 re->name_count = name_count;
4899 re->ref_count = 0;
4900 re->tables = (tables == _pcre_default_tables)? NULL : tables;
4901 re->nullpad = NULL;
4902
4903 /* The starting points of the name/number translation table and of the code are
4904 passed around in the compile data block. */
4905
4906 compile_block.names_found = 0;
4907 compile_block.name_entry_size = max_name_size + 3;
4908 compile_block.name_table = (uschar *)re + re->name_table_offset;
4909 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4910 compile_block.start_code = codestart;
4911 compile_block.start_pattern = (const uschar *)pattern;
4912 compile_block.req_varyopt = 0;
4913 compile_block.nopartial = FALSE;
4914
4915 /* Set up a starting, non-extracting bracket, then compile the expression. On
4916 error, errorcode will be set non-zero, so we don't need to look at the result
4917 of the function here. */
4918
4919 ptr = (const uschar *)pattern;
4920 code = (uschar *)codestart;
4921 *code = OP_BRA;
4922 bracount = 0;
4923 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4924 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4925 re->top_bracket = bracount;
4926 re->top_backref = compile_block.top_backref;
4927
4928 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4929
4930 /* If not reached end of pattern on success, there's an excess bracket. */
4931
4932 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4933
4934 /* Fill in the terminating state and check for disastrous overflow, but
4935 if debugging, leave the test till after things are printed out. */
4936
4937 *code++ = OP_END;
4938
4939 #ifndef DEBUG
4940 if (code - codestart > length) errorcode = ERR23;
4941 #endif
4942
4943 /* Give an error if there's back reference to a non-existent capturing
4944 subpattern. */
4945
4946 if (re->top_backref > re->top_bracket) errorcode = ERR15;
4947
4948 /* Failed to compile, or error while post-processing */
4949
4950 if (errorcode != 0)
4951 {
4952 (pcre_free)(re);
4953 PCRE_ERROR_RETURN:
4954 *erroroffset = ptr - (const uschar *)pattern;
4955 PCRE_EARLY_ERROR_RETURN:
4956 *errorptr = error_texts[errorcode];
4957 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
4958 return NULL;
4959 }
4960
4961 /* If the anchored option was not passed, set the flag if we can determine that
4962 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4963 as starting with .* when DOTALL is set).
4964
4965 Otherwise, if we know what the first character has to be, save it, because that
4966 speeds up unanchored matches no end. If not, see if we can set the
4967 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4968 start with ^, and also when all branches start with .* for non-DOTALL matches.
4969 */
4970
4971 if ((options & PCRE_ANCHORED) == 0)
4972 {
4973 int temp_options = options;
4974 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4975 re->options |= PCRE_ANCHORED;
4976 else
4977 {
4978 if (firstbyte < 0)
4979 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4980 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4981 {
4982 int ch = firstbyte & 255;
4983 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4984 compile_block.fcc[ch] == ch)? ch : firstbyte;
4985 re->options |= PCRE_FIRSTSET;
4986 }
4987 else if (is_startline(codestart, 0, compile_block.backref_map))
4988 re->options |= PCRE_STARTLINE;
4989 }
4990 }
4991
4992 /* For an anchored pattern, we use the "required byte" only if it follows a
4993 variable length item in the regex. Remove the caseless flag for non-caseable
4994 bytes. */
4995
4996 if (reqbyte >= 0 &&
4997 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4998 {
4999 int ch = reqbyte & 255;
5000 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5001 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5002 re->options |= PCRE_REQCHSET;
5003 }
5004
5005 /* Print out the compiled data for debugging */
5006
5007 #ifdef DEBUG
5008
5009 printf("Length = %d top_bracket = %d top_backref = %d\n",
5010 length, re->top_bracket, re->top_backref);
5011
5012 if (re->options != 0)
5013 {
5014 printf("%s%s%s%s%s%s%s%s%s%s\n",
5015 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5016 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5017 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5018 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5019 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5020 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5021 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5022 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5023 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5024 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5025 }
5026
5027 if ((re->options & PCRE_FIRSTSET) != 0)
5028 {
5029 int ch = re->first_byte & 255;
5030 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5031 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5032 else printf("First char = \\x%02x%s\n", ch, caseless);
5033 }
5034
5035 if ((re->options & PCRE_REQCHSET) != 0)
5036 {
5037 int ch = re->req_byte & 255;
5038 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5039 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5040 else printf("Req char = \\x%02x%s\n", ch, caseless);
5041 }
5042
5043 _pcre_printint(re, stdout);
5044
5045 /* This check is done here in the debugging case so that the code that
5046 was compiled can be seen. */
5047
5048 if (code - codestart > length)
5049 {
5050 (pcre_free)(re);
5051 *errorptr = error_texts[ERR23];
5052 *erroroffset = ptr - (uschar *)pattern;
5053 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5054 return NULL;
5055 }
5056 #endif
5057
5058 return (pcre *)re;
5059 }
5060
5061 /* End of pcre_compile.c */