Install PCRE 6.2.
[exim.git] / src / src / pcre / pcre_compile.c
1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
11 Copyright (c) 1997-2005 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
45
46
47 #include "pcre_internal.h"
48
49
50 /*************************************************
51 * Code parameters and static tables *
52 *************************************************/
53
/* Upper bound on the number of items held on the nested bracket stacks while
a pattern is being compiled. All kinds of parentheses count towards the
nesting; un-nested, non-capturing parentheses are not limited by it. The value
dimensions one int vector and one unsigned char vector at compile time, and
can be increased if necessary. */

#define BRASTACK_SIZE 200
61
62
63 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
64 are simple data values; negative values are for special things like \d and so
65 on. Zero means further processing is needed (for things like \x), or the escape
66 is invalid. */
67
68 #if !EBCDIC /* This is the "normal" table for ASCII systems */
69 static const short int escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
78 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
79 0, 0, -ESC_z /* x - z */
80 };
81
82 #else /* This is the "abnormal" table for EBCDIC systems */
83 static const short int escapes[] = {
84 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
85 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
86 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
87 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
88 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
89 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
90 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
91 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
92 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
93 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
94 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
95 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
96 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
97 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
98 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
99 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
100 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
101 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
102 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
103 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
104 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
105 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
106 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
107 };
108 #endif
109
110
/* Names of the POSIX character classes. The lengths of these names are held
in a parallel table whose list is terminated by a zero length entry. The first
three names must remain alpha, lower, upper (in that order), because the
handling of case independence assumes it. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
119
120 static const uschar posix_name_lengths[] = {
121 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
122
123 /* Table of class bit maps for each POSIX class; up to three may be combined
124 to form the class. The table for [:blank:] is dynamically modified to remove
125 the vertical space characters. */
126
127 static const int posix_class_maps[] = {
128 cbit_lower, cbit_upper, -1, /* alpha */
129 cbit_lower, -1, -1, /* lower */
130 cbit_upper, -1, -1, /* upper */
131 cbit_digit, cbit_lower, cbit_upper, /* alnum */
132 cbit_print, cbit_cntrl, -1, /* ascii */
133 cbit_space, -1, -1, /* blank - a GNU extension */
134 cbit_cntrl, -1, -1, /* cntrl */
135 cbit_digit, -1, -1, /* digit */
136 cbit_graph, -1, -1, /* graph */
137 cbit_print, -1, -1, /* print */
138 cbit_punct, -1, -1, /* punct */
139 cbit_space, -1, -1, /* space */
140 cbit_word, -1, -1, /* word - a Perl extension */
141 cbit_xdigit,-1, -1 /* xdigit */
142 };
143
144
/* Texts of the compile-time error messages, indexed by error number. They are
plain "char *" because they are handed to the outside world. */

static const char *error_texts[] = {
  /*  0 */ "no error",
  /*  1 */ "\\ at end of pattern",
  /*  2 */ "\\c at end of pattern",
  /*  3 */ "unrecognized character follows \\",
  /*  4 */ "numbers out of order in {} quantifier",
  /*  5 */ "number too big in {} quantifier",
  /*  6 */ "missing terminating ] for character class",
  /*  7 */ "invalid escape sequence in character class",
  /*  8 */ "range out of order in character class",
  /*  9 */ "nothing to repeat",
  /* 10 */ "operand of unlimited repeat could match the empty string",
  /* 11 */ "internal error: unexpected repeat",
  /* 12 */ "unrecognized character after (?",
  /* 13 */ "POSIX named classes are supported only within a class",
  /* 14 */ "missing )",
  /* 15 */ "reference to non-existent subpattern",
  /* 16 */ "erroffset passed as NULL",
  /* 17 */ "unknown option bit(s) set",
  /* 18 */ "missing ) after comment",
  /* 19 */ "parentheses nested too deeply",
  /* 20 */ "regular expression too large",
  /* 21 */ "failed to get memory",
  /* 22 */ "unmatched parentheses",
  /* 23 */ "internal error: code overflow",
  /* 24 */ "unrecognized character after (?<",
  /* 25 */ "lookbehind assertion is not fixed length",
  /* 26 */ "malformed number after (?(",
  /* 27 */ "conditional group contains more than two branches",
  /* 28 */ "assertion expected after (?(",
  /* 29 */ "(?R or (?digits must be followed by )",
  /* 30 */ "unknown POSIX class name",
  /* 31 */ "POSIX collating elements are not supported",
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support",
  /* 33 */ "spare error",
  /* 34 */ "character value in \\x{...} sequence is too large",
  /* 35 */ "invalid condition (?(0)",
  /* 36 */ "\\C not allowed in lookbehind assertion",
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  /* 38 */ "number after (?C is > 255",
  /* 39 */ "closing ) for (?C expected",
  /* 40 */ "recursive call could loop indefinitely",
  /* 41 */ "unrecognized character after (?P",
  /* 42 */ "syntax error after (?P",
  /* 43 */ "two named groups have the same name",
  /* 44 */ "invalid UTF-8 string",
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled",
  /* 46 */ "malformed \\P or \\p sequence",
  /* 47 */ "unknown property name after \\P or \\p"
};
207
208
/* Private table that identifies decimal and hexadecimal digits while a
pattern is being compiled. The tables in chartables depend on the locale and
may mark arbitrary characters as digits, whereas the compiling code expects
only 0-9, a-z and A-Z to be treated as digits; hence this separate table. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple timed cases), and some applications want PCRE to compile
efficiently as well as match efficiently.

The bit values are the same as those used in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

so that ctype_digit and ctype_xdigit can be used to test the entries. */

#if !EBCDIC    /* ASCII systems: the usual case */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x20  space to '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x28  ( to /      */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,    /* 0x30  0 to 7      */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x38  8 to ?      */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,    /* 0x40  @ to G      */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x48  H to O      */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x50  P to W      */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x58  X to _      */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,    /* 0x60  ` to g      */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x68  h to o      */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x70  p to w      */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x78  x to 127    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x88 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x98 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xA0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xA8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xB0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xB8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xC0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xC8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xD0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xD8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xE0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xE8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xF0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00     /* 0xF8 */
  };

#else          /* EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x38 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x48 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x68 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x78 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,    /* 0x80  128 to g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x88 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x98 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xA0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xA8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xB0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xB8 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,    /* 0xC0  { to G   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xC8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xD0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xD8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xE0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xE8 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,    /* 0xF0  0 to 7   */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00     /* 0xF8  8 to 255 */
  };

/* Partial duplicate of the character table, needed only on EBCDIC systems. */

static const unsigned char ebcdic_chartab[] =
  {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,    /* 0x00 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,    /* 0x08 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,    /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x18 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,    /* 0x20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x38 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x40 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,    /* 0x48 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x50 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,    /* 0x58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x60 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,    /* 0x68 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x78 */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,    /* 0x80  128 to g */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x88  h to 143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,    /* 0x90  144 to p */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0x98  q to 159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,    /* 0xA0  160 to x */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xA8  y to 175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xB0 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,    /* 0xB8 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,    /* 0xC0  { to G   */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xC8  H to 207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,    /* 0xD0  } to P   */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xD8  Q to 223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,    /* 0xE0  \ to X   */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,    /* 0xE8  Y to 239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,    /* 0xF0  0 to 7   */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00     /* 0xF8  8 to 255 */
  };
#endif
331
332
333 /* Definition to allow mutual recursion */
334
335 static BOOL
336 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
337 int *, int *, branch_chain *, compile_data *);
338
339
340
341 /*************************************************
342 * Handle escapes *
343 *************************************************/
344
345 /* This function is called when a \ has been encountered. It either returns a
346 positive value for a simple escape such as \n, or a negative value which
347 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
348 a positive value greater than 255 may be returned. On entry, ptr is pointing at
349 the \. On exit, it is on the final character of the escape sequence.
350
351 Arguments:
352 ptrptr points to the pattern position pointer
353 errorcodeptr points to the errorcode variable
354 bracount number of previous extracting brackets
355 options the options bits
356 isclass TRUE if inside a character class
357
358 Returns: zero or positive => a data character
359 negative => a special escape sequence
360 on error, errorptr is set
361 */
362
363 static int
364 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
365 int options, BOOL isclass)
366 {
367 const uschar *ptr = *ptrptr;
368 int c, i;
369
370 /* If backslash is at the end of the pattern, it's an error. */
371
372 c = *(++ptr);
373 if (c == 0) *errorcodeptr = ERR1;
374
375 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
376 a table. A non-zero result is something that can be returned immediately.
377 Otherwise further processing may be required. */
378
379 #if !EBCDIC /* ASCII coding */
380 else if (c < '0' || c > 'z') {} /* Not alphameric */
381 else if ((i = escapes[c - '0']) != 0) c = i;
382
383 #else /* EBCDIC coding */
384 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
385 else if ((i = escapes[c - 0x48]) != 0) c = i;
386 #endif
387
388 /* Escapes that need further processing, or are illegal. */
389
390 else
391 {
392 const uschar *oldptr;
393 switch (c)
394 {
395 /* A number of Perl escapes are not handled by PCRE. We give an explicit
396 error. */
397
398 case 'l':
399 case 'L':
400 case 'N':
401 case 'u':
402 case 'U':
403 *errorcodeptr = ERR37;
404 break;
405
406 /* The handling of escape sequences consisting of a string of digits
407 starting with one that is not zero is not straightforward. By experiment,
408 the way Perl works seems to be as follows:
409
410 Outside a character class, the digits are read as a decimal number. If the
411 number is less than 10, or if there are that many previous extracting
412 left brackets, then it is a back reference. Otherwise, up to three octal
413 digits are read to form an escaped byte. Thus \123 is likely to be octal
414 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
415 value is greater than 377, the least significant 8 bits are taken. Inside a
416 character class, \ followed by a digit is always an octal number. */
417
418 case '1': case '2': case '3': case '4': case '5':
419 case '6': case '7': case '8': case '9':
420
421 if (!isclass)
422 {
423 oldptr = ptr;
424 c -= '0';
425 while ((digitab[ptr[1]] & ctype_digit) != 0)
426 c = c * 10 + *(++ptr) - '0';
427 if (c < 10 || c <= bracount)
428 {
429 c = -(ESC_REF + c);
430 break;
431 }
432 ptr = oldptr; /* Put the pointer back and fall through */
433 }
434
435 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
436 generates a binary zero byte and treats the digit as a following literal.
437 Thus we have to pull back the pointer by one. */
438
439 if ((c = *ptr) >= '8')
440 {
441 ptr--;
442 c = 0;
443 break;
444 }
445
446 /* \0 always starts an octal number, but we may drop through to here with a
447 larger first octal digit. */
448
449 case '0':
450 c -= '0';
451 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
452 c = c * 8 + *(++ptr) - '0';
453 c &= 255; /* Take least significant 8 bits */
454 break;
455
456 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
457 which can be greater than 0xff, but only if the ddd are hex digits. */
458
459 case 'x':
460 #ifdef SUPPORT_UTF8
461 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
462 {
463 const uschar *pt = ptr + 2;
464 register int count = 0;
465 c = 0;
466 while ((digitab[*pt] & ctype_xdigit) != 0)
467 {
468 int cc = *pt++;
469 count++;
470 #if !EBCDIC /* ASCII coding */
471 if (cc >= 'a') cc -= 32; /* Convert to upper case */
472 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
473 #else /* EBCDIC coding */
474 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
475 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
476 #endif
477 }
478 if (*pt == '}')
479 {
480 if (c < 0 || count > 8) *errorcodeptr = ERR34;
481 ptr = pt;
482 break;
483 }
484 /* If the sequence of hex digits does not end with '}', then we don't
485 recognize this construct; fall through to the normal \x handling. */
486 }
487 #endif
488
489 /* Read just a single hex char */
490
491 c = 0;
492 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
493 {
494 int cc; /* Some compilers don't like ++ */
495 cc = *(++ptr); /* in initializers */
496 #if !EBCDIC /* ASCII coding */
497 if (cc >= 'a') cc -= 32; /* Convert to upper case */
498 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
499 #else /* EBCDIC coding */
500 if (cc <= 'z') cc += 64; /* Convert to upper case */
501 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
502 #endif
503 }
504 break;
505
506 /* Other special escapes not starting with a digit are straightforward */
507
508 case 'c':
509 c = *(++ptr);
510 if (c == 0)
511 {
512 *errorcodeptr = ERR2;
513 return 0;
514 }
515
516 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
517 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
518 (However, an EBCDIC equivalent has now been added.) */
519
520 #if !EBCDIC /* ASCII coding */
521 if (c >= 'a' && c <= 'z') c -= 32;
522 c ^= 0x40;
523 #else /* EBCDIC coding */
524 if (c >= 'a' && c <= 'z') c += 64;
525 c ^= 0xC0;
526 #endif
527 break;
528
529 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
530 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
531 for Perl compatibility, it is a literal. This code looks a bit odd, but
532 there used to be some cases other than the default, and there may be again
533 in future, so I haven't "optimized" it. */
534
535 default:
536 if ((options & PCRE_EXTRA) != 0) switch(c)
537 {
538 default:
539 *errorcodeptr = ERR3;
540 break;
541 }
542 break;
543 }
544 }
545
546 *ptrptr = ptr;
547 return c;
548 }
549
550
551
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always set, including on every error return
  errorcodeptr   points to the error code variable

Returns:         the value field of the matching _pcre_utt entry, or -1 for an
                   invalid type, in which case the error code is set to ERR46
                   (malformed sequence) or ERR47 (unknown property name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
{
  int c, i, bot, top;
  const uschar *ptr = *ptrptr;
  char name[4];

  /* Give the negation flag a defined value before any error exit can be
  taken, so that the caller never sees it unset. */

  *negptr = FALSE;

  c = *(++ptr);
  if (c == 0) goto ERROR_RETURN;

  /* \P or \p can be followed by a one- or two-character name in {},
  optionally preceded by ^ for negation. */

  if (c == '{')
  {
    if (ptr[1] == '^')
    {
      *negptr = TRUE;
      ptr++;
    }

    /* Up to three characters are read: a name of at most two characters
    followed by the closing brace. */

    for (i = 0; i <= 2; i++)
    {
      c = *(++ptr);
      if (c == 0) goto ERROR_RETURN;
      if (c == '}') break;
      name[i] = c;
    }

    if (c != '}')     /* Try to distinguish error cases */
    {
      while (*(++ptr) != 0 && *ptr != '}');
      if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
    }
    name[i] = 0;
  }

  /* Otherwise there is just one following character */

  else
  {
    name[0] = c;
    name[1] = 0;
  }

  *ptrptr = ptr;

  /* Search for a recognized property name using binary chop */

  bot = 0;
  top = _pcre_utt_size;

  while (bot < top)
  {
    i = (bot + top)/2;
    c = strcmp(name, _pcre_utt[i].name);
    if (c == 0) return _pcre_utt[i].value;
    if (c > 0) bot = i + 1; else top = i;
  }

UNKNOWN_RETURN:
  *errorcodeptr = ERR47;
  *ptrptr = ptr;
  return -1;

ERROR_RETURN:
  *errorcodeptr = ERR46;
  *ptrptr = ptr;
  return -1;
}
#endif
641
642
643
644
645 /*************************************************
646 * Check for counted repeat *
647 *************************************************/
648
649 /* This function is called when a '{' is encountered in a place where it might
650 start a quantifier. It looks ahead to see if it really is a quantifier or not.
651 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
652 where the ddds are digits.
653
654 Arguments:
655 p pointer to the first char after '{'
656
657 Returns: TRUE or FALSE
658 */
659
660 static BOOL
661 is_counted_repeat(const uschar *p)
662 {
663 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
664 while ((digitab[*p] & ctype_digit) != 0) p++;
665 if (*p == '}') return TRUE;
666
667 if (*p++ != ',') return FALSE;
668 if (*p == '}') return TRUE;
669
670 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
671 while ((digitab[*p] & ctype_digit) != 0) p++;
672
673 return (*p == '}');
674 }
675
676
677
678 /*************************************************
679 * Read repeat counts *
680 *************************************************/
681
682 /* Read an item of the form {n,m} and return the values. This is called only
683 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
684 so the syntax is guaranteed to be correct, but we need to check the values.
685
686 Arguments:
687 p pointer to first char after '{'
688 minp pointer to int for min
689 maxp pointer to int for max
690 returned as -1 if no max
691 errorcodeptr points to error code variable
692
693 Returns: pointer to '}' on success;
694 current ptr on error, with errorcodeptr set non-zero
695 */
696
697 static const uschar *
698 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
699 {
700 int min = 0;
701 int max = -1;
702
703 /* Read the minimum value and do a paranoid check: a negative value indicates
704 an integer overflow. */
705
706 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
707 if (min < 0 || min > 65535)
708 {
709 *errorcodeptr = ERR5;
710 return p;
711 }
712
713 /* Read the maximum value if there is one, and again do a paranoid on its size.
714 Also, max must not be less than min. */
715
716 if (*p == '}') max = min; else
717 {
718 if (*(++p) != '}')
719 {
720 max = 0;
721 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
722 if (max < 0 || max > 65535)
723 {
724 *errorcodeptr = ERR5;
725 return p;
726 }
727 if (max < min)
728 {
729 *errorcodeptr = ERR4;
730 return p;
731 }
732 }
733 }
734
735 /* Fill in the required variables, and pass back the pointer to the terminating
736 '}'. */
737
738 *minp = min;
739 *maxp = max;
740 return p;
741 }
742
743
744
745 /*************************************************
746 * Find first significant op code *
747 *************************************************/
748
749 /* This is called by several functions that scan a compiled expression looking
750 for a fixed first character, or an anchoring op code etc. It skips over things
751 that do not influence this. For some calls, a change of option is important.
752 For some calls, it makes sense to skip negative forward and all backward
753 assertions, and also the \b assertion; for others it does not.
754
755 Arguments:
756 code pointer to the start of the group
757 options pointer to external options
758 optbit the option bit whose changing is significant, or
759 zero if none are
760 skipassert TRUE if certain assertions are to be skipped
761
762 Returns: pointer to the first significant opcode
763 */
764
765 static const uschar*
766 first_significant_code(const uschar *code, int *options, int optbit,
767 BOOL skipassert)
768 {
769 for (;;)
770 {
771 switch ((int)*code)
772 {
773 case OP_OPT:
774 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
775 *options = (int)code[1];
776 code += 2;
777 break;
778
779 case OP_ASSERT_NOT:
780 case OP_ASSERTBACK:
781 case OP_ASSERTBACK_NOT:
782 if (!skipassert) return code;
783 do code += GET(code, 1); while (*code == OP_ALT);
784 code += _pcre_OP_lengths[*code];
785 break;
786
787 case OP_WORD_BOUNDARY:
788 case OP_NOT_WORD_BOUNDARY:
789 if (!skipassert) return code;
790 /* Fall through */
791
792 case OP_CALLOUT:
793 case OP_CREF:
794 case OP_BRANUMBER:
795 code += _pcre_OP_lengths[*code];
796 break;
797
798 default:
799 return code;
800 }
801 }
802 /* Control never reaches here */
803 }
804
805
806
807
808 /*************************************************
809 * Find the fixed length of a pattern *
810 *************************************************/
811
812 /* Scan a pattern and compute the fixed length of subject that will match it,
813 if the length is fixed. This is needed for dealing with backward assertions.
814 In UTF8 mode, the result is in characters rather than bytes.
815
816 Arguments:
817 code points to the start of the pattern (the bracket)
818 options the compiling options
819
820 Returns: the fixed length, or -1 if there is no fixed length,
821 or -2 if \C was encountered
822 */
823
824 static int
825 find_fixedlength(uschar *code, int options)
826 {
827 int length = -1;
828
829 register int branchlength = 0;
830 register uschar *cc = code + 1 + LINK_SIZE;
831
832 /* Scan along the opcodes for this branch. If we get to the end of the
833 branch, check the length against that of the other branches. */
834
835 for (;;)
836 {
837 int d;
838 register int op = *cc;
839 if (op >= OP_BRA) op = OP_BRA;
840
841 switch (op)
842 {
843 case OP_BRA:
844 case OP_ONCE:
845 case OP_COND:
846 d = find_fixedlength(cc, options);
847 if (d < 0) return d;
848 branchlength += d;
849 do cc += GET(cc, 1); while (*cc == OP_ALT);
850 cc += 1 + LINK_SIZE;
851 break;
852
853 /* Reached end of a branch; if it's a ket it is the end of a nested
854 call. If it's ALT it is an alternation in a nested call. If it is
855 END it's the end of the outer call. All can be handled by the same code. */
856
857 case OP_ALT:
858 case OP_KET:
859 case OP_KETRMAX:
860 case OP_KETRMIN:
861 case OP_END:
862 if (length < 0) length = branchlength;
863 else if (length != branchlength) return -1;
864 if (*cc != OP_ALT) return length;
865 cc += 1 + LINK_SIZE;
866 branchlength = 0;
867 break;
868
869 /* Skip over assertive subpatterns */
870
871 case OP_ASSERT:
872 case OP_ASSERT_NOT:
873 case OP_ASSERTBACK:
874 case OP_ASSERTBACK_NOT:
875 do cc += GET(cc, 1); while (*cc == OP_ALT);
876 /* Fall through */
877
878 /* Skip over things that don't match chars */
879
880 case OP_REVERSE:
881 case OP_BRANUMBER:
882 case OP_CREF:
883 case OP_OPT:
884 case OP_CALLOUT:
885 case OP_SOD:
886 case OP_SOM:
887 case OP_EOD:
888 case OP_EODN:
889 case OP_CIRC:
890 case OP_DOLL:
891 case OP_NOT_WORD_BOUNDARY:
892 case OP_WORD_BOUNDARY:
893 cc += _pcre_OP_lengths[*cc];
894 break;
895
896 /* Handle literal characters */
897
898 case OP_CHAR:
899 case OP_CHARNC:
900 branchlength++;
901 cc += 2;
902 #ifdef SUPPORT_UTF8
903 if ((options & PCRE_UTF8) != 0)
904 {
905 while ((*cc & 0xc0) == 0x80) cc++;
906 }
907 #endif
908 break;
909
910 /* Handle exact repetitions. The count is already in characters, but we
911 need to skip over a multibyte character in UTF8 mode. */
912
913 case OP_EXACT:
914 branchlength += GET2(cc,1);
915 cc += 4;
916 #ifdef SUPPORT_UTF8
917 if ((options & PCRE_UTF8) != 0)
918 {
919 while((*cc & 0x80) == 0x80) cc++;
920 }
921 #endif
922 break;
923
924 case OP_TYPEEXACT:
925 branchlength += GET2(cc,1);
926 cc += 4;
927 break;
928
929 /* Handle single-char matchers */
930
931 case OP_PROP:
932 case OP_NOTPROP:
933 cc++;
934 /* Fall through */
935
936 case OP_NOT_DIGIT:
937 case OP_DIGIT:
938 case OP_NOT_WHITESPACE:
939 case OP_WHITESPACE:
940 case OP_NOT_WORDCHAR:
941 case OP_WORDCHAR:
942 case OP_ANY:
943 branchlength++;
944 cc++;
945 break;
946
947 /* The single-byte matcher isn't allowed */
948
949 case OP_ANYBYTE:
950 return -2;
951
952 /* Check a class for variable quantification */
953
954 #ifdef SUPPORT_UTF8
955 case OP_XCLASS:
956 cc += GET(cc, 1) - 33;
957 /* Fall through */
958 #endif
959
960 case OP_CLASS:
961 case OP_NCLASS:
962 cc += 33;
963
964 switch (*cc)
965 {
966 case OP_CRSTAR:
967 case OP_CRMINSTAR:
968 case OP_CRQUERY:
969 case OP_CRMINQUERY:
970 return -1;
971
972 case OP_CRRANGE:
973 case OP_CRMINRANGE:
974 if (GET2(cc,1) != GET2(cc,3)) return -1;
975 branchlength += GET2(cc,1);
976 cc += 5;
977 break;
978
979 default:
980 branchlength++;
981 }
982 break;
983
984 /* Anything else is variable length */
985
986 default:
987 return -1;
988 }
989 }
990 /* Control never gets here */
991 }
992
993
994
995
996 /*************************************************
997 * Scan compiled regex for numbered bracket *
998 *************************************************/
999
1000 /* This little function scans through a compiled pattern until it finds a
1001 capturing bracket with the given number.
1002
1003 Arguments:
1004 code points to start of expression
1005 utf8 TRUE in UTF-8 mode
1006 number the required bracket number
1007
1008 Returns: pointer to the opcode for the bracket, or NULL if not found
1009 */
1010
1011 static const uschar *
1012 find_bracket(const uschar *code, BOOL utf8, int number)
1013 {
1014 #ifndef SUPPORT_UTF8
1015 utf8 = utf8; /* Stop pedantic compilers complaining */
1016 #endif
1017
1018 for (;;)
1019 {
1020 register int c = *code;
1021 if (c == OP_END) return NULL;
1022 else if (c > OP_BRA)
1023 {
1024 int n = c - OP_BRA;
1025 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1026 if (n == number) return (uschar *)code;
1027 code += _pcre_OP_lengths[OP_BRA];
1028 }
1029 else
1030 {
1031 code += _pcre_OP_lengths[c];
1032
1033 #ifdef SUPPORT_UTF8
1034
1035 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1036 by a multi-byte character. The length in the table is a minimum, so we have
1037 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1038 can use relatively efficient code. */
1039
1040 if (utf8) switch(c)
1041 {
1042 case OP_CHAR:
1043 case OP_CHARNC:
1044 case OP_EXACT:
1045 case OP_UPTO:
1046 case OP_MINUPTO:
1047 case OP_STAR:
1048 case OP_MINSTAR:
1049 case OP_PLUS:
1050 case OP_MINPLUS:
1051 case OP_QUERY:
1052 case OP_MINQUERY:
1053 while ((*code & 0xc0) == 0x80) code++;
1054 break;
1055
1056 /* XCLASS is used for classes that cannot be represented just by a bit
1057 map. This includes negated single high-valued characters. The length in
1058 the table is zero; the actual length is stored in the compiled code. */
1059
1060 case OP_XCLASS:
1061 code += GET(code, 1) + 1;
1062 break;
1063 }
1064 #endif
1065 }
1066 }
1067 }
1068
1069
1070
1071 /*************************************************
1072 * Scan compiled regex for recursion reference *
1073 *************************************************/
1074
1075 /* This little function scans through a compiled pattern until it finds an
1076 instance of OP_RECURSE.
1077
1078 Arguments:
1079 code points to start of expression
1080 utf8 TRUE in UTF-8 mode
1081
1082 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1083 */
1084
1085 static const uschar *
1086 find_recurse(const uschar *code, BOOL utf8)
1087 {
1088 #ifndef SUPPORT_UTF8
1089 utf8 = utf8; /* Stop pedantic compilers complaining */
1090 #endif
1091
1092 for (;;)
1093 {
1094 register int c = *code;
1095 if (c == OP_END) return NULL;
1096 else if (c == OP_RECURSE) return code;
1097 else if (c > OP_BRA)
1098 {
1099 code += _pcre_OP_lengths[OP_BRA];
1100 }
1101 else
1102 {
1103 code += _pcre_OP_lengths[c];
1104
1105 #ifdef SUPPORT_UTF8
1106
1107 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1108 by a multi-byte character. The length in the table is a minimum, so we have
1109 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1110 can use relatively efficient code. */
1111
1112 if (utf8) switch(c)
1113 {
1114 case OP_CHAR:
1115 case OP_CHARNC:
1116 case OP_EXACT:
1117 case OP_UPTO:
1118 case OP_MINUPTO:
1119 case OP_STAR:
1120 case OP_MINSTAR:
1121 case OP_PLUS:
1122 case OP_MINPLUS:
1123 case OP_QUERY:
1124 case OP_MINQUERY:
1125 while ((*code & 0xc0) == 0x80) code++;
1126 break;
1127
1128 /* XCLASS is used for classes that cannot be represented just by a bit
1129 map. This includes negated single high-valued characters. The length in
1130 the table is zero; the actual length is stored in the compiled code. */
1131
1132 case OP_XCLASS:
1133 code += GET(code, 1) + 1;
1134 break;
1135 }
1136 #endif
1137 }
1138 }
1139 }
1140
1141
1142
1143 /*************************************************
1144 * Scan compiled branch for non-emptiness *
1145 *************************************************/
1146
1147 /* This function scans through a branch of a compiled pattern to see whether it
1148 can match the empty string or not. It is called only from could_be_empty()
1149 below. Note that first_significant_code() skips over assertions. If we hit an
1150 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1151 whose current branch will already have been scanned.
1152
1153 Arguments:
1154 code points to start of search
1155 endcode points to where to stop
1156 utf8 TRUE if in UTF8 mode
1157
1158 Returns: TRUE if what is matched could be empty
1159 */
1160
1161 static BOOL
1162 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1163 {
1164 register int c;
1165 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1166 code < endcode;
1167 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1168 {
1169 const uschar *ccode;
1170
1171 c = *code;
1172
1173 if (c >= OP_BRA)
1174 {
1175 BOOL empty_branch;
1176 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1177
1178 /* Scan a closed bracket */
1179
1180 empty_branch = FALSE;
1181 do
1182 {
1183 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1184 empty_branch = TRUE;
1185 code += GET(code, 1);
1186 }
1187 while (*code == OP_ALT);
1188 if (!empty_branch) return FALSE; /* All branches are non-empty */
1189 code += 1 + LINK_SIZE;
1190 c = *code;
1191 }
1192
1193 else switch (c)
1194 {
1195 /* Check for quantifiers after a class */
1196
1197 #ifdef SUPPORT_UTF8
1198 case OP_XCLASS:
1199 ccode = code + GET(code, 1);
1200 goto CHECK_CLASS_REPEAT;
1201 #endif
1202
1203 case OP_CLASS:
1204 case OP_NCLASS:
1205 ccode = code + 33;
1206
1207 #ifdef SUPPORT_UTF8
1208 CHECK_CLASS_REPEAT:
1209 #endif
1210
1211 switch (*ccode)
1212 {
1213 case OP_CRSTAR: /* These could be empty; continue */
1214 case OP_CRMINSTAR:
1215 case OP_CRQUERY:
1216 case OP_CRMINQUERY:
1217 break;
1218
1219 default: /* Non-repeat => class must match */
1220 case OP_CRPLUS: /* These repeats aren't empty */
1221 case OP_CRMINPLUS:
1222 return FALSE;
1223
1224 case OP_CRRANGE:
1225 case OP_CRMINRANGE:
1226 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1227 break;
1228 }
1229 break;
1230
1231 /* Opcodes that must match a character */
1232
1233 case OP_PROP:
1234 case OP_NOTPROP:
1235 case OP_EXTUNI:
1236 case OP_NOT_DIGIT:
1237 case OP_DIGIT:
1238 case OP_NOT_WHITESPACE:
1239 case OP_WHITESPACE:
1240 case OP_NOT_WORDCHAR:
1241 case OP_WORDCHAR:
1242 case OP_ANY:
1243 case OP_ANYBYTE:
1244 case OP_CHAR:
1245 case OP_CHARNC:
1246 case OP_NOT:
1247 case OP_PLUS:
1248 case OP_MINPLUS:
1249 case OP_EXACT:
1250 case OP_NOTPLUS:
1251 case OP_NOTMINPLUS:
1252 case OP_NOTEXACT:
1253 case OP_TYPEPLUS:
1254 case OP_TYPEMINPLUS:
1255 case OP_TYPEEXACT:
1256 return FALSE;
1257
1258 /* End of branch */
1259
1260 case OP_KET:
1261 case OP_KETRMAX:
1262 case OP_KETRMIN:
1263 case OP_ALT:
1264 return TRUE;
1265
1266 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1267 followed by a multibyte character */
1268
1269 #ifdef SUPPORT_UTF8
1270 case OP_STAR:
1271 case OP_MINSTAR:
1272 case OP_QUERY:
1273 case OP_MINQUERY:
1274 case OP_UPTO:
1275 case OP_MINUPTO:
1276 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1277 break;
1278 #endif
1279 }
1280 }
1281
1282 return TRUE;
1283 }
1284
1285
1286
1287 /*************************************************
1288 * Scan compiled regex for non-emptiness *
1289 *************************************************/
1290
1291 /* This function is called to check for left recursive calls. We want to check
1292 the current branch of the current pattern to see if it could match the empty
1293 string. If it could, we must look outwards for branches at other levels,
1294 stopping when we pass beyond the bracket which is the subject of the recursion.
1295
1296 Arguments:
1297 code points to start of the recursion
1298 endcode points to where to stop (current RECURSE item)
1299 bcptr points to the chain of current (unclosed) branch starts
1300 utf8 TRUE if in UTF-8 mode
1301
1302 Returns: TRUE if what is matched could be empty
1303 */
1304
1305 static BOOL
1306 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1307 BOOL utf8)
1308 {
1309 while (bcptr != NULL && bcptr->current >= code)
1310 {
1311 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1312 bcptr = bcptr->outer;
1313 }
1314 return TRUE;
1315 }
1316
1317
1318
1319 /*************************************************
1320 * Check for POSIX class syntax *
1321 *************************************************/
1322
1323 /* This function is called when the sequence "[:" or "[." or "[=" is
1324 encountered in a character class. It checks whether this is followed by an
1325 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1326 ".]" or "=]".
1327
1328 Argument:
1329 ptr pointer to the initial [
1330 endptr where to return the end pointer
1331 cd pointer to compile data
1332
1333 Returns: TRUE or FALSE
1334 */
1335
1336 static BOOL
1337 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1338 {
1339 int terminator; /* Don't combine these lines; the Solaris cc */
1340 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1341 if (*(++ptr) == '^') ptr++;
1342 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1343 if (*ptr == terminator && ptr[1] == ']')
1344 {
1345 *endptr = ptr;
1346 return TRUE;
1347 }
1348 return FALSE;
1349 }
1350
1351
1352
1353
1354 /*************************************************
1355 * Check POSIX class name *
1356 *************************************************/
1357
1358 /* This function is called to check the name given in a POSIX-style class entry
1359 such as [:alnum:].
1360
1361 Arguments:
1362 ptr points to the first letter
1363 len the length of the name
1364
1365 Returns: a value representing the name, or -1 if unknown
1366 */
1367
1368 static int
1369 check_posix_name(const uschar *ptr, int len)
1370 {
1371 register int yield = 0;
1372 while (posix_name_lengths[yield] != 0)
1373 {
1374 if (len == posix_name_lengths[yield] &&
1375 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1376 yield++;
1377 }
1378 return -1;
1379 }
1380
1381
1382 /*************************************************
1383 * Adjust OP_RECURSE items in repeated group *
1384 *************************************************/
1385
1386 /* OP_RECURSE items contain an offset from the start of the regex to the group
1387 that is referenced. This means that groups can be replicated for fixed
1388 repetition simply by copying (because the recursion is allowed to refer to
1389 earlier groups that are outside the current group). However, when a group is
1390 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1391 it, after it has been compiled. This means that any OP_RECURSE items within it
1392 that refer to the group itself or any contained groups have to have their
1393 offsets adjusted. That is the job of this function. Before it is called, the
1394 partially compiled regex must be temporarily terminated with OP_END.
1395
1396 Arguments:
1397 group points to the start of the group
1398 adjust the amount by which the group is to be moved
1399 utf8 TRUE in UTF-8 mode
1400 cd contains pointers to tables etc.
1401
1402 Returns: nothing
1403 */
1404
1405 static void
1406 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1407 {
1408 uschar *ptr = group;
1409 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1410 {
1411 int offset = GET(ptr, 1);
1412 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1413 ptr += 1 + LINK_SIZE;
1414 }
1415 }
1416
1417
1418
1419 /*************************************************
1420 * Insert an automatic callout point *
1421 *************************************************/
1422
1423 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1424 callout points before each pattern item.
1425
1426 Arguments:
1427 code current code pointer
1428 ptr current pattern pointer
1429 cd pointers to tables etc
1430
1431 Returns: new code pointer
1432 */
1433
1434 static uschar *
1435 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1436 {
1437 *code++ = OP_CALLOUT;
1438 *code++ = 255;
1439 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1440 PUT(code, LINK_SIZE, 0); /* Default length */
1441 return code + 2*LINK_SIZE;
1442 }
1443
1444
1445
1446 /*************************************************
1447 * Complete a callout item *
1448 *************************************************/
1449
1450 /* A callout item contains the length of the next item in the pattern, which
1451 we can't fill in till after we have reached the relevant point. This is used
1452 for both automatic and manual callouts.
1453
1454 Arguments:
1455 previous_callout points to previous callout item
1456 ptr current pattern pointer
1457 cd pointers to tables etc
1458
1459 Returns: nothing
1460 */
1461
1462 static void
1463 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1464 {
1465 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1466 PUT(previous_callout, 2 + LINK_SIZE, length);
1467 }
1468
1469
1470
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

A run starts at the first character for which _pcre_ucp_findchar() returns
ucp_L and gives a non-zero other case. It continues for as long as each
following character is also ucp_L and has an other case exactly one greater
than that of the character before it.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int c = *cptr;
int chartype, othercase, next;

/* Look for the first character that has an other case. */

while (c <= d)
  {
  if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
    break;
  c++;
  }

if (c > d) return FALSE;

*ocptr = othercase;
next = othercase + 1;

/* Extend the run while the other-case values stay consecutive. */

c++;
while (c <= d &&
       _pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L &&
       othercase == next)
  {
  next++;
  c++;
  }

*odptr = next - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1520
1521
1522 /*************************************************
1523 * Compile one branch *
1524 *************************************************/
1525
1526 /* Scan the pattern, compiling it into the code vector. If the options are
1527 changed during the branch, the pointer is used to change the external options
1528 bits.
1529
1530 Arguments:
1531 optionsptr pointer to the option bits
1532 brackets points to number of extracting brackets used
1533 codeptr points to the pointer to the current code point
1534 ptrptr points to the current pattern pointer
1535 errorcodeptr points to error code variable
1536 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1537 reqbyteptr set to the last literal character required, else < 0
1538 bcptr points to current branch chain
1539 cd contains pointers to tables etc.
1540
1541 Returns: TRUE on success
1542 FALSE, with *errorcodeptr set non-zero on error
1543 */
1544
1545 static BOOL
1546 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1547 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1548 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1549 {
1550 int repeat_type, op_type;
1551 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1552 int bravalue = 0;
1553 int greedy_default, greedy_non_default;
1554 int firstbyte, reqbyte;
1555 int zeroreqbyte, zerofirstbyte;
1556 int req_caseopt, reqvary, tempreqvary;
1557 int condcount = 0;
1558 int options = *optionsptr;
1559 int after_manual_callout = 0;
1560 register int c;
1561 register uschar *code = *codeptr;
1562 uschar *tempcode;
1563 BOOL inescq = FALSE;
1564 BOOL groupsetfirstbyte = FALSE;
1565 const uschar *ptr = *ptrptr;
1566 const uschar *tempptr;
1567 uschar *previous = NULL;
1568 uschar *previous_callout = NULL;
1569 uschar classbits[32];
1570
1571 #ifdef SUPPORT_UTF8
1572 BOOL class_utf8;
1573 BOOL utf8 = (options & PCRE_UTF8) != 0;
1574 uschar *class_utf8data;
1575 uschar utf8_char[6];
1576 #else
1577 BOOL utf8 = FALSE;
1578 #endif
1579
1580 /* Set up the default and non-default settings for greediness */
1581
1582 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1583 greedy_non_default = greedy_default ^ 1;
1584
1585 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1586 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1587 matches a non-fixed char first char; reqbyte just remains unset if we never
1588 find one.
1589
1590 When we hit a repeat whose minimum is zero, we may have to adjust these values
1591 to take the zero repeat into account. This is implemented by setting them to
1592 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1593 item types that can be repeated set these backoff variables appropriately. */
1594
1595 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1596
1597 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1598 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1599 value > 255. It is added into the firstbyte or reqbyte variables to record the
1600 case status of the value. This is used only for ASCII characters. */
1601
1602 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1603
1604 /* Switch on next character until the end of the branch */
1605
1606 for (;; ptr++)
1607 {
1608 BOOL negate_class;
1609 BOOL possessive_quantifier;
1610 BOOL is_quantifier;
1611 int class_charcount;
1612 int class_lastchar;
1613 int newoptions;
1614 int recno;
1615 int skipbytes;
1616 int subreqbyte;
1617 int subfirstbyte;
1618 int mclength;
1619 uschar mcbuffer[8];
1620
1621 /* Next byte in the pattern */
1622
1623 c = *ptr;
1624
1625 /* If in \Q...\E, check for the end; if not, we have a literal */
1626
1627 if (inescq && c != 0)
1628 {
1629 if (c == '\\' && ptr[1] == 'E')
1630 {
1631 inescq = FALSE;
1632 ptr++;
1633 continue;
1634 }
1635 else
1636 {
1637 if (previous_callout != NULL)
1638 {
1639 complete_callout(previous_callout, ptr, cd);
1640 previous_callout = NULL;
1641 }
1642 if ((options & PCRE_AUTO_CALLOUT) != 0)
1643 {
1644 previous_callout = code;
1645 code = auto_callout(code, ptr, cd);
1646 }
1647 goto NORMAL_CHAR;
1648 }
1649 }
1650
1651 /* Fill in length of a previous callout, except when the next thing is
1652 a quantifier. */
1653
1654 is_quantifier = c == '*' || c == '+' || c == '?' ||
1655 (c == '{' && is_counted_repeat(ptr+1));
1656
1657 if (!is_quantifier && previous_callout != NULL &&
1658 after_manual_callout-- <= 0)
1659 {
1660 complete_callout(previous_callout, ptr, cd);
1661 previous_callout = NULL;
1662 }
1663
1664 /* In extended mode, skip white space and comments */
1665
1666 if ((options & PCRE_EXTENDED) != 0)
1667 {
1668 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1669 if (c == '#')
1670 {
1671 /* The space before the ; is to avoid a warning on a silly compiler
1672 on the Macintosh. */
1673 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1674 if (c != 0) continue; /* Else fall through to handle end of string */
1675 }
1676 }
1677
1678 /* No auto callout for quantifiers. */
1679
1680 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1681 {
1682 previous_callout = code;
1683 code = auto_callout(code, ptr, cd);
1684 }
1685
1686 switch(c)
1687 {
1688 /* The branch terminates at end of string, |, or ). */
1689
1690 case 0:
1691 case '|':
1692 case ')':
1693 *firstbyteptr = firstbyte;
1694 *reqbyteptr = reqbyte;
1695 *codeptr = code;
1696 *ptrptr = ptr;
1697 return TRUE;
1698
1699 /* Handle single-character metacharacters. In multiline mode, ^ disables
1700 the setting of any following char as a first character. */
1701
1702 case '^':
1703 if ((options & PCRE_MULTILINE) != 0)
1704 {
1705 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1706 }
1707 previous = NULL;
1708 *code++ = OP_CIRC;
1709 break;
1710
1711 case '$':
1712 previous = NULL;
1713 *code++ = OP_DOLL;
1714 break;
1715
1716 /* There can never be a first char if '.' is first, whatever happens about
1717 repeats. The value of reqbyte doesn't change either. */
1718
1719 case '.':
1720 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1721 zerofirstbyte = firstbyte;
1722 zeroreqbyte = reqbyte;
1723 previous = code;
1724 *code++ = OP_ANY;
1725 break;
1726
1727 /* Character classes. If the included characters are all < 255 in value, we
1728 build a 32-byte bitmap of the permitted characters, except in the special
1729 case where there is only one such character. For negated classes, we build
1730 the map as usual, then invert it at the end. However, we use a different
1731 opcode so that data characters > 255 can be handled correctly.
1732
1733 If the class contains characters outside the 0-255 range, a different
1734 opcode is compiled. It may optionally have a bit map for characters < 256,
1735 but those above are explicitly listed afterwards. A flag byte tells
1736 whether the bitmap is present, and whether this is a negated class or not.
1737 */
1738
1739 case '[':
1740 previous = code;
1741
1742 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1743 they are encountered at the top level, so we'll do that too. */
1744
1745 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1746 check_posix_syntax(ptr, &tempptr, cd))
1747 {
1748 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1749 goto FAILED;
1750 }
1751
1752 /* If the first character is '^', set the negation flag and skip it. */
1753
1754 if ((c = *(++ptr)) == '^')
1755 {
1756 negate_class = TRUE;
1757 c = *(++ptr);
1758 }
1759 else
1760 {
1761 negate_class = FALSE;
1762 }
1763
1764 /* Keep a count of chars with values < 256 so that we can optimize the case
1765 of just a single character (as long as it's < 256). For higher valued UTF-8
1766 characters, we don't yet do any optimization. */
1767
1768 class_charcount = 0;
1769 class_lastchar = -1;
1770
1771 #ifdef SUPPORT_UTF8
1772 class_utf8 = FALSE; /* No chars >= 256 */
1773 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1774 #endif
1775
1776 /* Initialize the 32-char bit map to all zeros. We have to build the
1777 map in a temporary bit of store, in case the class contains only 1
1778 character (< 256), because in that case the compiled code doesn't use the
1779 bit map. */
1780
1781 memset(classbits, 0, 32 * sizeof(uschar));
1782
1783 /* Process characters until ] is reached. By writing this as a "do" it
1784 means that an initial ] is taken as a data character. The first pass
1785 through the regex checked the overall syntax, so we don't need to be very
1786 strict here. At the start of the loop, c contains the first byte of the
1787 character. */
1788
1789 do
1790 {
1791 #ifdef SUPPORT_UTF8
1792 if (utf8 && c > 127)
1793 { /* Braces are required because the */
1794 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1795 }
1796 #endif
1797
1798 /* Inside \Q...\E everything is literal except \E */
1799
1800 if (inescq)
1801 {
1802 if (c == '\\' && ptr[1] == 'E')
1803 {
1804 inescq = FALSE;
1805 ptr++;
1806 continue;
1807 }
1808 else goto LONE_SINGLE_CHARACTER;
1809 }
1810
1811 /* Handle POSIX class names. Perl allows a negation extension of the
1812 form [:^name:]. A square bracket that doesn't match the syntax is
1813 treated as a literal. We also recognize the POSIX constructions
1814 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1815 5.6 and 5.8 do. */
1816
1817 if (c == '[' &&
1818 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1819 check_posix_syntax(ptr, &tempptr, cd))
1820 {
1821 BOOL local_negate = FALSE;
1822 int posix_class, i;
1823 register const uschar *cbits = cd->cbits;
1824
1825 if (ptr[1] != ':')
1826 {
1827 *errorcodeptr = ERR31;
1828 goto FAILED;
1829 }
1830
1831 ptr += 2;
1832 if (*ptr == '^')
1833 {
1834 local_negate = TRUE;
1835 ptr++;
1836 }
1837
1838 posix_class = check_posix_name(ptr, tempptr - ptr);
1839 if (posix_class < 0)
1840 {
1841 *errorcodeptr = ERR30;
1842 goto FAILED;
1843 }
1844
1845 /* If matching is caseless, upper and lower are converted to
1846 alpha. This relies on the fact that the class table starts with
1847 alpha, lower, upper as the first 3 entries. */
1848
1849 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1850 posix_class = 0;
1851
1852 /* Or into the map we are building up to 3 of the static class
1853 tables, or their negations. The [:blank:] class sets up the same
1854 chars as the [:space:] class (all white space). We remove the vertical
1855 white space chars afterwards. */
1856
1857 posix_class *= 3;
1858 for (i = 0; i < 3; i++)
1859 {
1860 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1861 int taboffset = posix_class_maps[posix_class + i];
1862 if (taboffset < 0) break;
1863 if (local_negate)
1864 {
1865 if (i == 0)
1866 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1867 else
1868 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1869 if (blankclass) classbits[1] |= 0x3c;
1870 }
1871 else
1872 {
1873 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1874 if (blankclass) classbits[1] &= ~0x3c;
1875 }
1876 }
1877
1878 ptr = tempptr + 1;
1879 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1880 continue; /* End of POSIX syntax handling */
1881 }
1882
1883 /* Backslash may introduce a single character, or it may introduce one
1884 of the specials, which just set a flag. Escaped items are checked for
1885 validity in the pre-compiling pass. The sequence \b is a special case.
1886 Inside a class (and only there) it is treated as backspace. Elsewhere
1887 it marks a word boundary. Other escapes have preset maps ready to
1888 or into the one we are building. We assume they have more than one
1889 character in them, so set class_charcount bigger than one. */
1890
1891 if (c == '\\')
1892 {
1893 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1894
1895 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1896 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1897 else if (-c == ESC_Q) /* Handle start of quoted string */
1898 {
1899 if (ptr[1] == '\\' && ptr[2] == 'E')
1900 {
1901 ptr += 2; /* avoid empty string */
1902 }
1903 else inescq = TRUE;
1904 continue;
1905 }
1906
1907 if (c < 0)
1908 {
1909 register const uschar *cbits = cd->cbits;
1910 class_charcount += 2; /* Greater than 1 is what matters */
1911 switch (-c)
1912 {
1913 case ESC_d:
1914 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1915 continue;
1916
1917 case ESC_D:
1918 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1919 continue;
1920
1921 case ESC_w:
1922 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1923 continue;
1924
1925 case ESC_W:
1926 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1927 continue;
1928
1929 case ESC_s:
1930 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1931 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1932 continue;
1933
1934 case ESC_S:
1935 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1936 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1937 continue;
1938
1939 #ifdef SUPPORT_UCP
1940 case ESC_p:
1941 case ESC_P:
1942 {
1943 BOOL negated;
1944 int property = get_ucp(&ptr, &negated, errorcodeptr);
1945 if (property < 0) goto FAILED;
1946 class_utf8 = TRUE;
1947 *class_utf8data++ = ((-c == ESC_p) != negated)?
1948 XCL_PROP : XCL_NOTPROP;
1949 *class_utf8data++ = property;
1950 class_charcount -= 2; /* Not a < 256 character */
1951 }
1952 continue;
1953 #endif
1954
1955 /* Unrecognized escapes are faulted if PCRE is running in its
1956 strict mode. By default, for compatibility with Perl, they are
1957 treated as literals. */
1958
1959 default:
1960 if ((options & PCRE_EXTRA) != 0)
1961 {
1962 *errorcodeptr = ERR7;
1963 goto FAILED;
1964 }
1965 c = *ptr; /* The final character */
1966 class_charcount -= 2; /* Undo the default count from above */
1967 }
1968 }
1969
1970 /* Fall through if we have a single character (c >= 0). This may be
1971 > 256 in UTF-8 mode. */
1972
1973 } /* End of backslash handling */
1974
1975 /* A single character may be followed by '-' to form a range. However,
1976 Perl does not permit ']' to be the end of the range. A '-' character
1977 here is treated as a literal. */
1978
1979 if (ptr[1] == '-' && ptr[2] != ']')
1980 {
1981 int d;
1982 ptr += 2;
1983
1984 #ifdef SUPPORT_UTF8
1985 if (utf8)
1986 { /* Braces are required because the */
1987 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1988 }
1989 else
1990 #endif
1991 d = *ptr; /* Not UTF-8 mode */
1992
1993 /* The second part of a range can be a single-character escape, but
1994 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1995 in such circumstances. */
1996
1997 if (d == '\\')
1998 {
1999 const uschar *oldptr = ptr;
2000 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2001
2002 /* \b is backspace; \X is literal X; any other special means the '-'
2003 was literal */
2004
2005 if (d < 0)
2006 {
2007 if (d == -ESC_b) d = '\b';
2008 else if (d == -ESC_X) d = 'X'; else
2009 {
2010 ptr = oldptr - 2;
2011 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2012 }
2013 }
2014 }
2015
2016 /* The check that the two values are in the correct order happens in
2017 the pre-pass. Optimize one-character ranges */
2018
2019 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2020
2021 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2022 matching, we have to use an XCLASS with extra data items. Caseless
2023 matching for characters > 127 is available only if UCP support is
2024 available. */
2025
2026 #ifdef SUPPORT_UTF8
2027 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2028 {
2029 class_utf8 = TRUE;
2030
2031 /* With UCP support, we can find the other case equivalents of
2032 the relevant characters. There may be several ranges. Optimize how
2033 they fit with the basic range. */
2034
2035 #ifdef SUPPORT_UCP
2036 if ((options & PCRE_CASELESS) != 0)
2037 {
2038 int occ, ocd;
2039 int cc = c;
2040 int origd = d;
2041 while (get_othercase_range(&cc, origd, &occ, &ocd))
2042 {
2043 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2044
2045 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2046 { /* if there is overlap, */
2047 c = occ; /* noting that if occ < c */
2048 continue; /* we can't have ocd > d */
2049 } /* because a subrange is */
2050 if (ocd > d && occ <= d + 1) /* always shorter than */
2051 { /* the basic range. */
2052 d = ocd;
2053 continue;
2054 }
2055
2056 if (occ == ocd)
2057 {
2058 *class_utf8data++ = XCL_SINGLE;
2059 }
2060 else
2061 {
2062 *class_utf8data++ = XCL_RANGE;
2063 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2064 }
2065 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2066 }
2067 }
2068 #endif /* SUPPORT_UCP */
2069
2070 /* Now record the original range, possibly modified for UCP caseless
2071 overlapping ranges. */
2072
2073 *class_utf8data++ = XCL_RANGE;
2074 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2075 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2076
2077 /* With UCP support, we are done. Without UCP support, there is no
2078 caseless matching for UTF-8 characters > 127; we can use the bit map
2079 for the smaller ones. */
2080
2081 #ifdef SUPPORT_UCP
2082 continue; /* With next character in the class */
2083 #else
2084 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2085
2086 /* Adjust upper limit and fall through to set up the map */
2087
2088 d = 127;
2089
2090 #endif /* SUPPORT_UCP */
2091 }
2092 #endif /* SUPPORT_UTF8 */
2093
2094 /* We use the bit map for all cases when not in UTF-8 mode; else
2095 ranges that lie entirely within 0-127 when there is UCP support; else
2096 for partial ranges without UCP support. */
2097
2098 for (; c <= d; c++)
2099 {
2100 classbits[c/8] |= (1 << (c&7));
2101 if ((options & PCRE_CASELESS) != 0)
2102 {
2103 int uc = cd->fcc[c]; /* flip case */
2104 classbits[uc/8] |= (1 << (uc&7));
2105 }
2106 class_charcount++; /* in case a one-char range */
2107 class_lastchar = c;
2108 }
2109
2110 continue; /* Go get the next char in the class */
2111 }
2112
2113 /* Handle a lone single character - we can get here for a normal
2114 non-escape char, or after \ that introduces a single character or for an
2115 apparent range that isn't. */
2116
2117 LONE_SINGLE_CHARACTER:
2118
2119 /* Handle a character that cannot go in the bit map */
2120
2121 #ifdef SUPPORT_UTF8
2122 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2123 {
2124 class_utf8 = TRUE;
2125 *class_utf8data++ = XCL_SINGLE;
2126 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2127
2128 #ifdef SUPPORT_UCP
2129 if ((options & PCRE_CASELESS) != 0)
2130 {
2131 int chartype;
2132 int othercase;
2133 if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2134 othercase > 0)
2135 {
2136 *class_utf8data++ = XCL_SINGLE;
2137 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2138 }
2139 }
2140 #endif /* SUPPORT_UCP */
2141
2142 }
2143 else
2144 #endif /* SUPPORT_UTF8 */
2145
2146 /* Handle a single-byte character */
2147 {
2148 classbits[c/8] |= (1 << (c&7));
2149 if ((options & PCRE_CASELESS) != 0)
2150 {
2151 c = cd->fcc[c]; /* flip case */
2152 classbits[c/8] |= (1 << (c&7));
2153 }
2154 class_charcount++;
2155 class_lastchar = c;
2156 }
2157 }
2158
2159 /* Loop until ']' reached; the check for end of string happens inside the
2160 loop. This "while" is the end of the "do" above. */
2161
2162 while ((c = *(++ptr)) != ']' || inescq);
2163
2164 /* If class_charcount is 1, we saw precisely one character whose value is
2165 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2166 can optimize the negative case only if there were no characters >= 128
2167 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2168 single-bytes only. This is an historical hangover. Maybe one day we can
2169 tidy these opcodes to handle multi-byte characters.
2170
2171 The optimization throws away the bit map. We turn the item into a
2172 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2173 that OP_NOT does not support multibyte characters. In the positive case, it
2174 can cause firstbyte to be set. Otherwise, there can be no first char if
2175 this item is first, whatever repeat count may follow. In the case of
2176 reqbyte, save the previous value for reinstating. */
2177
2178 #ifdef SUPPORT_UTF8
2179 if (class_charcount == 1 &&
2180 (!utf8 ||
2181 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2182
2183 #else
2184 if (class_charcount == 1)
2185 #endif
2186 {
2187 zeroreqbyte = reqbyte;
2188
2189 /* The OP_NOT opcode works on one-byte characters only. */
2190
2191 if (negate_class)
2192 {
2193 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2194 zerofirstbyte = firstbyte;
2195 *code++ = OP_NOT;
2196 *code++ = class_lastchar;
2197 break;
2198 }
2199
2200 /* For a single, positive character, get the value into mcbuffer, and
2201 then we can handle this with the normal one-character code. */
2202
2203 #ifdef SUPPORT_UTF8
2204 if (utf8 && class_lastchar > 127)
2205 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2206 else
2207 #endif
2208 {
2209 mcbuffer[0] = class_lastchar;
2210 mclength = 1;
2211 }
2212 goto ONE_CHAR;
2213 } /* End of 1-char optimization */
2214
2215 /* The general case - not the one-char optimization. If this is the first
2216 thing in the branch, there can be no first char setting, whatever the
2217 repeat count. Any reqbyte setting must remain unchanged after any kind of
2218 repeat. */
2219
2220 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2221 zerofirstbyte = firstbyte;
2222 zeroreqbyte = reqbyte;
2223
2224 /* If there are characters with values > 255, we have to compile an
2225 extended class, with its own opcode. If there are no characters < 256,
2226 we can omit the bitmap. */
2227
2228 #ifdef SUPPORT_UTF8
2229 if (class_utf8)
2230 {
2231 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2232 *code++ = OP_XCLASS;
2233 code += LINK_SIZE;
2234 *code = negate_class? XCL_NOT : 0;
2235
2236 /* If the map is required, install it, and move on to the end of
2237 the extra data */
2238
2239 if (class_charcount > 0)
2240 {
2241 *code++ |= XCL_MAP;
2242 memcpy(code, classbits, 32);
2243 code = class_utf8data;
2244 }
2245
2246 /* If the map is not required, slide down the extra data. */
2247
2248 else
2249 {
2250 int len = class_utf8data - (code + 33);
2251 memmove(code + 1, code + 33, len);
2252 code += len + 1;
2253 }
2254
2255 /* Now fill in the complete length of the item */
2256
2257 PUT(previous, 1, code - previous);
2258 break; /* End of class handling */
2259 }
2260 #endif
2261
2262 /* If there are no characters > 255, negate the 32-byte map if necessary,
2263 and copy it into the code vector. If this is the first thing in the branch,
2264 there can be no first char setting, whatever the repeat count. Any reqbyte
2265 setting must remain unchanged after any kind of repeat. */
2266
2267 if (negate_class)
2268 {
2269 *code++ = OP_NCLASS;
2270 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2271 }
2272 else
2273 {
2274 *code++ = OP_CLASS;
2275 memcpy(code, classbits, 32);
2276 }
2277 code += 32;
2278 break;
2279
2280 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2281 has been tested above. */
2282
2283 case '{':
2284 if (!is_quantifier) goto NORMAL_CHAR;
2285 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2286 if (*errorcodeptr != 0) goto FAILED;
2287 goto REPEAT;
2288
2289 case '*':
2290 repeat_min = 0;
2291 repeat_max = -1;
2292 goto REPEAT;
2293
2294 case '+':
2295 repeat_min = 1;
2296 repeat_max = -1;
2297 goto REPEAT;
2298
2299 case '?':
2300 repeat_min = 0;
2301 repeat_max = 1;
2302
2303 REPEAT:
2304 if (previous == NULL)
2305 {
2306 *errorcodeptr = ERR9;
2307 goto FAILED;
2308 }
2309
2310 if (repeat_min == 0)
2311 {
2312 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2313 reqbyte = zeroreqbyte; /* Ditto */
2314 }
2315
2316 /* Remember whether this is a variable length repeat */
2317
2318 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2319
2320 op_type = 0; /* Default single-char op codes */
2321 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2322
2323 /* Save start of previous item, in case we have to move it up to make space
2324 for an inserted OP_ONCE for the additional '+' extension. */
2325
2326 tempcode = previous;
2327
2328 /* If the next character is '+', we have a possessive quantifier. This
2329 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2330 If the next character is '?' this is a minimizing repeat, by default,
2331 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2332 repeat type to the non-default. */
2333
2334 if (ptr[1] == '+')
2335 {
2336 repeat_type = 0; /* Force greedy */
2337 possessive_quantifier = TRUE;
2338 ptr++;
2339 }
2340 else if (ptr[1] == '?')
2341 {
2342 repeat_type = greedy_non_default;
2343 ptr++;
2344 }
2345 else repeat_type = greedy_default;
2346
2347 /* If previous was a recursion, we need to wrap it inside brackets so that
2348 it can be replicated if necessary. */
2349
2350 if (*previous == OP_RECURSE)
2351 {
2352 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2353 code += 1 + LINK_SIZE;
2354 *previous = OP_BRA;
2355 PUT(previous, 1, code - previous);
2356 *code = OP_KET;
2357 PUT(code, 1, code - previous);
2358 code += 1 + LINK_SIZE;
2359 }
2360
2361 /* If previous was a character match, abolish the item and generate a
2362 repeat item instead. If a char item has a minimum of more than one, ensure
2363 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2364 the first thing in a branch because the x will have gone into firstbyte
2365 instead. */
2366
2367 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2368 {
2369 /* Deal with UTF-8 characters that take up more than one byte. It's
2370 easier to write this out separately than try to macrify it. Use c to
2371 hold the length of the character in bytes, plus 0x80 to flag that it's a
2372 length rather than a small character. */
2373
2374 #ifdef SUPPORT_UTF8
2375 if (utf8 && (code[-1] & 0x80) != 0)
2376 {
2377 uschar *lastchar = code - 1;
2378 while((*lastchar & 0xc0) == 0x80) lastchar--;
2379 c = code - lastchar; /* Length of UTF-8 character */
2380 memcpy(utf8_char, lastchar, c); /* Save the char */
2381 c |= 0x80; /* Flag c as a length */
2382 }
2383 else
2384 #endif
2385
2386 /* Handle the case of a single byte - either with no UTF8 support, or
2387 with UTF-8 disabled, or for a UTF-8 character < 128. */
2388
2389 {
2390 c = code[-1];
2391 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2392 }
2393
2394 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2395 }
2396
2397 /* If previous was a single negated character ([^a] or similar), we use
2398 one of the special opcodes, replacing it. The code is shared with single-
2399 character repeats by setting op_type to add a suitable offset into
2400 repeat_type. OP_NOT is currently used only for single-byte chars. */
2401
2402 else if (*previous == OP_NOT)
2403 {
2404 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2405 c = previous[1];
2406 goto OUTPUT_SINGLE_REPEAT;
2407 }
2408
2409 /* If previous was a character type match (\d or similar), abolish it and
2410 create a suitable repeat item. The code is shared with single-character
2411 repeats by setting op_type to add a suitable offset into repeat_type. Note
2412 that the Unicode property types will be present only when SUPPORT_UCP is
2413 defined, but we don't wrap the little bits of code here because it just
2414 makes it horribly messy. */
2415
2416 else if (*previous < OP_EODN)
2417 {
2418 uschar *oldcode;
2419 int prop_type;
2420 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2421 c = *previous;
2422
2423 OUTPUT_SINGLE_REPEAT:
2424 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2425 previous[1] : -1;
2426
2427 oldcode = code;
2428 code = previous; /* Usually overwrite previous item */
2429
2430 /* If the maximum is zero then the minimum must also be zero; Perl allows
2431 this case, so we do too - by simply omitting the item altogether. */
2432
2433 if (repeat_max == 0) goto END_REPEAT;
2434
2435 /* All real repeats make it impossible to handle partial matching (maybe
2436 one day we will be able to remove this restriction). */
2437
2438 if (repeat_max != 1) cd->nopartial = TRUE;
2439
2440 /* Combine the op_type with the repeat_type */
2441
2442 repeat_type += op_type;
2443
2444 /* A minimum of zero is handled either as the special case * or ?, or as
2445 an UPTO, with the maximum given. */
2446
2447 if (repeat_min == 0)
2448 {
2449 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2450 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2451 else
2452 {
2453 *code++ = OP_UPTO + repeat_type;
2454 PUT2INC(code, 0, repeat_max);
2455 }
2456 }
2457
2458 /* A repeat minimum of 1 is optimized into some special cases. If the
2459 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2460 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2461 one less than the maximum. */
2462
2463 else if (repeat_min == 1)
2464 {
2465 if (repeat_max == -1)
2466 *code++ = OP_PLUS + repeat_type;
2467 else
2468 {
2469 code = oldcode; /* leave previous item in place */
2470 if (repeat_max == 1) goto END_REPEAT;
2471 *code++ = OP_UPTO + repeat_type;
2472 PUT2INC(code, 0, repeat_max - 1);
2473 }
2474 }
2475
2476 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2477 handled as an EXACT followed by an UPTO. */
2478
2479 else
2480 {
2481 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2482 PUT2INC(code, 0, repeat_min);
2483
2484 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2485 we have to insert the character for the previous code. For a repeated
2486 Unicode property match, there is an extra byte that defines the
2487 required property. In UTF-8 mode, long characters have their length in
2488 c, with the 0x80 bit as a flag. */
2489
2490 if (repeat_max < 0)
2491 {
2492 #ifdef SUPPORT_UTF8
2493 if (utf8 && c >= 128)
2494 {
2495 memcpy(code, utf8_char, c & 7);
2496 code += c & 7;
2497 }
2498 else
2499 #endif
2500 {
2501 *code++ = c;
2502 if (prop_type >= 0) *code++ = prop_type;
2503 }
2504 *code++ = OP_STAR + repeat_type;
2505 }
2506
2507 /* Else insert an UPTO if the max is greater than the min, again
2508 preceded by the character, for the previously inserted code. */
2509
2510 else if (repeat_max != repeat_min)
2511 {
2512 #ifdef SUPPORT_UTF8
2513 if (utf8 && c >= 128)
2514 {
2515 memcpy(code, utf8_char, c & 7);
2516 code += c & 7;
2517 }
2518 else
2519 #endif
2520 *code++ = c;
2521 if (prop_type >= 0) *code++ = prop_type;
2522 repeat_max -= repeat_min;
2523 *code++ = OP_UPTO + repeat_type;
2524 PUT2INC(code, 0, repeat_max);
2525 }
2526 }
2527
2528 /* The character or character type itself comes last in all cases. */
2529
2530 #ifdef SUPPORT_UTF8
2531 if (utf8 && c >= 128)
2532 {
2533 memcpy(code, utf8_char, c & 7);
2534 code += c & 7;
2535 }
2536 else
2537 #endif
2538 *code++ = c;
2539
2540 /* For a repeated Unicode property match, there is an extra byte that
2541 defines the required property. */
2542
2543 #ifdef SUPPORT_UCP
2544 if (prop_type >= 0) *code++ = prop_type;
2545 #endif
2546 }
2547
2548 /* If previous was a character class or a back reference, we put the repeat
2549 stuff after it, but just skip the item if the repeat was {0,0}. */
2550
2551 else if (*previous == OP_CLASS ||
2552 *previous == OP_NCLASS ||
2553 #ifdef SUPPORT_UTF8
2554 *previous == OP_XCLASS ||
2555 #endif
2556 *previous == OP_REF)
2557 {
2558 if (repeat_max == 0)
2559 {
2560 code = previous;
2561 goto END_REPEAT;
2562 }
2563
2564 /* All real repeats make it impossible to handle partial matching (maybe
2565 one day we will be able to remove this restriction). */
2566
2567 if (repeat_max != 1) cd->nopartial = TRUE;
2568
2569 if (repeat_min == 0 && repeat_max == -1)
2570 *code++ = OP_CRSTAR + repeat_type;
2571 else if (repeat_min == 1 && repeat_max == -1)
2572 *code++ = OP_CRPLUS + repeat_type;
2573 else if (repeat_min == 0 && repeat_max == 1)
2574 *code++ = OP_CRQUERY + repeat_type;
2575 else
2576 {
2577 *code++ = OP_CRRANGE + repeat_type;
2578 PUT2INC(code, 0, repeat_min);
2579 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2580 PUT2INC(code, 0, repeat_max);
2581 }
2582 }
2583
2584 /* If previous was a bracket group, we may have to replicate it in certain
2585 cases. */
2586
2587 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2588 *previous == OP_COND)
2589 {
2590 register int i;
2591 int ketoffset = 0;
2592 int len = code - previous;
2593 uschar *bralink = NULL;
2594
2595 /* If the maximum repeat count is unlimited, find the end of the bracket
2596 by scanning through from the start, and compute the offset back to it
2597 from the current code pointer. There may be an OP_OPT setting following
2598 the final KET, so we can't find the end just by going back from the code
2599 pointer. */
2600
2601 if (repeat_max == -1)
2602 {
2603 register uschar *ket = previous;
2604 do ket += GET(ket, 1); while (*ket != OP_KET);
2605 ketoffset = code - ket;
2606 }
2607
2608 /* The case of a zero minimum is special because of the need to stick
2609 OP_BRAZERO in front of it, and because the group appears once in the
2610 data, whereas in other cases it appears the minimum number of times. For
2611 this reason, it is simplest to treat this case separately, as otherwise
2612 the code gets far too messy. There are several special subcases when the
2613 minimum is zero. */
2614
2615 if (repeat_min == 0)
2616 {
2617 /* If the maximum is also zero, we just omit the group from the output
2618 altogether. */
2619
2620 if (repeat_max == 0)
2621 {
2622 code = previous;
2623 goto END_REPEAT;
2624 }
2625
2626 /* If the maximum is 1 or unlimited, we just have to stick in the
2627 BRAZERO and do no more at this point. However, we do need to adjust
2628 any OP_RECURSE calls inside the group that refer to the group itself or
2629 any internal group, because the offset is from the start of the whole
2630 regex. Temporarily terminate the pattern while doing this. */
2631
2632 if (repeat_max <= 1)
2633 {
2634 *code = OP_END;
2635 adjust_recurse(previous, 1, utf8, cd);
2636 memmove(previous+1, previous, len);
2637 code++;
2638 *previous++ = OP_BRAZERO + repeat_type;
2639 }
2640
2641 /* If the maximum is greater than 1 and limited, we have to replicate
2642 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2643 The first one has to be handled carefully because it's the original
2644 copy, which has to be moved up. The remainder can be handled by code
2645 that is common with the non-zero minimum case below. We have to
2646 adjust the value of repeat_max, since one less copy is required. Once
2647 again, we may have to adjust any OP_RECURSE calls inside the group. */
2648
2649 else
2650 {
2651 int offset;
2652 *code = OP_END;
2653 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2654 memmove(previous + 2 + LINK_SIZE, previous, len);
2655 code += 2 + LINK_SIZE;
2656 *previous++ = OP_BRAZERO + repeat_type;
2657 *previous++ = OP_BRA;
2658
2659 /* We chain together the bracket offset fields that have to be
2660 filled in later when the ends of the brackets are reached. */
2661
2662 offset = (bralink == NULL)? 0 : previous - bralink;
2663 bralink = previous;
2664 PUTINC(previous, 0, offset);
2665 }
2666
2667 repeat_max--;
2668 }
2669
2670 /* If the minimum is greater than zero, replicate the group as many
2671 times as necessary, and adjust the maximum to the number of subsequent
2672 copies that we need. If we set a first char from the group, and didn't
2673 set a required char, copy the latter from the former. */
2674
2675 else
2676 {
2677 if (repeat_min > 1)
2678 {
2679 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2680 for (i = 1; i < repeat_min; i++)
2681 {
2682 memcpy(code, previous, len);
2683 code += len;
2684 }
2685 }
2686 if (repeat_max > 0) repeat_max -= repeat_min;
2687 }
2688
2689 /* This code is common to both the zero and non-zero minimum cases. If
2690 the maximum is limited, it replicates the group in a nested fashion,
2691 remembering the bracket starts on a stack. In the case of a zero minimum,
2692 the first one was set up above. In all cases the repeat_max now specifies
2693 the number of additional copies needed. */
2694
2695 if (repeat_max >= 0)
2696 {
2697 for (i = repeat_max - 1; i >= 0; i--)
2698 {
2699 *code++ = OP_BRAZERO + repeat_type;
2700
2701 /* All but the final copy start a new nesting, maintaining the
2702 chain of brackets outstanding. */
2703
2704 if (i != 0)
2705 {
2706 int offset;
2707 *code++ = OP_BRA;
2708 offset = (bralink == NULL)? 0 : code - bralink;
2709 bralink = code;
2710 PUTINC(code, 0, offset);
2711 }
2712
2713 memcpy(code, previous, len);
2714 code += len;
2715 }
2716
2717 /* Now chain through the pending brackets, and fill in their length
2718 fields (which are holding the chain links pro tem). */
2719
2720 while (bralink != NULL)
2721 {
2722 int oldlinkoffset;
2723 int offset = code - bralink + 1;
2724 uschar *bra = code - offset;
2725 oldlinkoffset = GET(bra, 1);
2726 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2727 *code++ = OP_KET;
2728 PUTINC(code, 0, offset);
2729 PUT(bra, 1, offset);
2730 }
2731 }
2732
2733 /* If the maximum is unlimited, set a repeater in the final copy. We
2734 can't just offset backwards from the current code point, because we
2735 don't know if there's been an options resetting after the ket. The
2736 correct offset was computed above. */
2737
2738 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2739 }
2740
2741 /* Else there's some kind of shambles */
2742
2743 else
2744 {
2745 *errorcodeptr = ERR11;
2746 goto FAILED;
2747 }
2748
2749 /* If the character following a repeat is '+', we wrap the entire repeated
2750 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2751 Sun's Java package. The repeated item starts at tempcode, not at previous,
2752 which might be the first part of a string whose (former) last char we
2753 repeated. However, we don't support '+' after a greediness '?'. */
2754
2755 if (possessive_quantifier)
2756 {
2757 int len = code - tempcode;
2758 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2759 code += 1 + LINK_SIZE;
2760 len += 1 + LINK_SIZE;
2761 tempcode[0] = OP_ONCE;
2762 *code++ = OP_KET;
2763 PUTINC(code, 0, len);
2764 PUT(tempcode, 1, len);
2765 }
2766
2767 /* In all cases we no longer have a previous item. We also set the
2768 "follows varying string" flag for subsequently encountered reqbytes if
2769 it isn't already set and we have just passed a varying length item. */
2770
2771 END_REPEAT:
2772 previous = NULL;
2773 cd->req_varyopt |= reqvary;
2774 break;
2775
2776
2777 /* Start of nested bracket sub-expression, or comment or lookahead or
2778 lookbehind or option setting or condition. First deal with special things
2779 that can come after a bracket; all are introduced by ?, and the appearance
2780 of any of them means that this is not a referencing group. They were
2781 checked for validity in the first pass over the string, so we don't have to
2782 check for syntax errors here. */
2783
2784 case '(':
2785 newoptions = options;
2786 skipbytes = 0;
2787
2788 if (*(++ptr) == '?')
2789 {
2790 int set, unset;
2791 int *optset;
2792
2793 switch (*(++ptr))
2794 {
2795 case '#': /* Comment; skip to ket */
2796 ptr++;
2797 while (*ptr != ')') ptr++;
2798 continue;
2799
2800 case ':': /* Non-extracting bracket */
2801 bravalue = OP_BRA;
2802 ptr++;
2803 break;
2804
2805 case '(':
2806 bravalue = OP_COND; /* Conditional group */
2807
2808 /* Condition to test for recursion */
2809
2810 if (ptr[1] == 'R')
2811 {
2812 code[1+LINK_SIZE] = OP_CREF;
2813 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2814 skipbytes = 3;
2815 ptr += 3;
2816 }
2817
2818 /* Condition to test for a numbered subpattern match. We know that
2819 if a digit follows ( then there will just be digits until ) because
2820 the syntax was checked in the first pass. */
2821
2822 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2823 {
2824 int condref; /* Don't amalgamate; some compilers */
2825 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2826 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2827 if (condref == 0)
2828 {
2829 *errorcodeptr = ERR35;
2830 goto FAILED;
2831 }
2832 ptr++;
2833 code[1+LINK_SIZE] = OP_CREF;
2834 PUT2(code, 2+LINK_SIZE, condref);
2835 skipbytes = 3;
2836 }
2837 /* For conditions that are assertions, we just fall through, having
2838 set bravalue above. */
2839 break;
2840
2841 case '=': /* Positive lookahead */
2842 bravalue = OP_ASSERT;
2843 ptr++;
2844 break;
2845
2846 case '!': /* Negative lookahead */
2847 bravalue = OP_ASSERT_NOT;
2848 ptr++;
2849 break;
2850
2851 case '<': /* Lookbehinds */
2852 switch (*(++ptr))
2853 {
2854 case '=': /* Positive lookbehind */
2855 bravalue = OP_ASSERTBACK;
2856 ptr++;
2857 break;
2858
2859 case '!': /* Negative lookbehind */
2860 bravalue = OP_ASSERTBACK_NOT;
2861 ptr++;
2862 break;
2863 }
2864 break;
2865
2866 case '>': /* One-time brackets */
2867 bravalue = OP_ONCE;
2868 ptr++;
2869 break;
2870
2871 case 'C': /* Callout - may be followed by digits; */
2872 previous_callout = code; /* Save for later completion */
2873 after_manual_callout = 1; /* Skip one item before completing */
2874 *code++ = OP_CALLOUT; /* Already checked that the terminating */
2875 { /* closing parenthesis is present. */
2876 int n = 0;
2877 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2878 n = n * 10 + *ptr - '0';
2879 if (n > 255)
2880 {
2881 *errorcodeptr = ERR38;
2882 goto FAILED;
2883 }
2884 *code++ = n;
2885 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2886 PUT(code, LINK_SIZE, 0); /* Default length */
2887 code += 2 * LINK_SIZE;
2888 }
2889 previous = NULL;
2890 continue;
2891
2892 case 'P': /* Named subpattern handling */
2893 if (*(++ptr) == '<') /* Definition */
2894 {
2895 int i, namelen;
2896 uschar *slot = cd->name_table;
2897 const uschar *name; /* Don't amalgamate; some compilers */
2898 name = ++ptr; /* grumble at autoincrement in declaration */
2899
2900 while (*ptr++ != '>');
2901 namelen = ptr - name - 1;
2902
2903 for (i = 0; i < cd->names_found; i++)
2904 {
2905 int crc = memcmp(name, slot+2, namelen);
2906 if (crc == 0)
2907 {
2908 if (slot[2+namelen] == 0)
2909 {
2910 *errorcodeptr = ERR43;
2911 goto FAILED;
2912 }
2913 crc = -1; /* Current name is substring */
2914 }
2915 if (crc < 0)
2916 {
2917 memmove(slot + cd->name_entry_size, slot,
2918 (cd->names_found - i) * cd->name_entry_size);
2919 break;
2920 }
2921 slot += cd->name_entry_size;
2922 }
2923
2924 PUT2(slot, 0, *brackets + 1);
2925 memcpy(slot + 2, name, namelen);
2926 slot[2+namelen] = 0;
2927 cd->names_found++;
2928 goto NUMBERED_GROUP;
2929 }
2930
2931 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2932 {
2933 int i, namelen;
2934 int type = *ptr++;
2935 const uschar *name = ptr;
2936 uschar *slot = cd->name_table;
2937
2938 while (*ptr != ')') ptr++;
2939 namelen = ptr - name;
2940
2941 for (i = 0; i < cd->names_found; i++)
2942 {
2943 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2944 slot += cd->name_entry_size;
2945 }
2946 if (i >= cd->names_found)
2947 {
2948 *errorcodeptr = ERR15;
2949 goto FAILED;
2950 }
2951
2952 recno = GET2(slot, 0);
2953
2954 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2955
2956 /* Back reference */
2957
2958 previous = code;
2959 *code++ = OP_REF;
2960 PUT2INC(code, 0, recno);
2961 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2962 if (recno > cd->top_backref) cd->top_backref = recno;
2963 continue;
2964 }
2965
2966 /* Should never happen */
2967 break;
2968
2969 case 'R': /* Pattern recursion */
2970 ptr++; /* Same as (?0) */
2971 /* Fall through */
2972
2973 /* Recursion or "subroutine" call */
2974
2975 case '0': case '1': case '2': case '3': case '4':
2976 case '5': case '6': case '7': case '8': case '9':
2977 {
2978 const uschar *called;
2979 recno = 0;
2980 while((digitab[*ptr] & ctype_digit) != 0)
2981 recno = recno * 10 + *ptr++ - '0';
2982
2983 /* Come here from code above that handles a named recursion */
2984
2985 HANDLE_RECURSION:
2986
2987 previous = code;
2988
2989 /* Find the bracket that is being referenced. Temporarily end the
2990 regex in case it doesn't exist. */
2991
2992 *code = OP_END;
2993 called = (recno == 0)?
2994 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2995
2996 if (called == NULL)
2997 {
2998 *errorcodeptr = ERR15;
2999 goto FAILED;
3000 }
3001
3002 /* If the subpattern is still open, this is a recursive call. We
3003 check to see if this is a left recursion that could loop for ever,
3004 and diagnose that case. */
3005
3006 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3007 {
3008 *errorcodeptr = ERR40;
3009 goto FAILED;
3010 }
3011
3012 /* Insert the recursion/subroutine item */
3013
3014 *code = OP_RECURSE;
3015 PUT(code, 1, called - cd->start_code);
3016 code += 1 + LINK_SIZE;
3017 }
3018 continue;
3019
3020 /* Character after (? not specially recognized */
3021
3022 default: /* Option setting */
3023 set = unset = 0;
3024 optset = &set;
3025
3026 while (*ptr != ')' && *ptr != ':')
3027 {
3028 switch (*ptr++)
3029 {
3030 case '-': optset = &unset; break;
3031
3032 case 'i': *optset |= PCRE_CASELESS; break;
3033 case 'm': *optset |= PCRE_MULTILINE; break;
3034 case 's': *optset |= PCRE_DOTALL; break;
3035 case 'x': *optset |= PCRE_EXTENDED; break;
3036 case 'U': *optset |= PCRE_UNGREEDY; break;
3037 case 'X': *optset |= PCRE_EXTRA; break;
3038 }
3039 }
3040
3041 /* Set up the changed option bits, but don't change anything yet. */
3042
3043 newoptions = (options | set) & (~unset);
3044
3045 /* If the options ended with ')' this is not the start of a nested
3046 group with option changes, so the options change at this level. Compile
3047 code to change the ims options if this setting actually changes any of
3048 them. We also pass the new setting back so that it can be put at the
3049 start of any following branches, and when this group ends (if we are in
3050 a group), a resetting item can be compiled.
3051
3052 Note that if this item is right at the start of the pattern, the
3053 options will have been abstracted and made global, so there will be no
3054 change to compile. */
3055
3056 if (*ptr == ')')
3057 {
3058 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3059 {
3060 *code++ = OP_OPT;
3061 *code++ = newoptions & PCRE_IMS;
3062 }
3063
3064 /* Change options at this level, and pass them back for use
3065 in subsequent branches. Reset the greedy defaults and the case
3066 value for firstbyte and reqbyte. */
3067
3068 *optionsptr = options = newoptions;
3069 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3070 greedy_non_default = greedy_default ^ 1;
3071 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3072
3073 previous = NULL; /* This item can't be repeated */
3074 continue; /* It is complete */
3075 }
3076
3077 /* If the options ended with ':' we are heading into a nested group
3078 with possible change of options. Such groups are non-capturing and are
3079 not assertions of any kind. All we need to do is skip over the ':';
3080 the newoptions value is handled below. */
3081
3082 bravalue = OP_BRA;
3083 ptr++;
3084 }
3085 }
3086
3087 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3088 non-capturing and behave like (?:...) brackets */
3089
3090 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3091 {
3092 bravalue = OP_BRA;
3093 }
3094
3095 /* Else we have a referencing group; adjust the opcode. If the bracket
3096 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3097 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3098
3099 else
3100 {
3101 NUMBERED_GROUP:
3102 if (++(*brackets) > EXTRACT_BASIC_MAX)
3103 {
3104 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3105 code[1+LINK_SIZE] = OP_BRANUMBER;
3106 PUT2(code, 2+LINK_SIZE, *brackets);
3107 skipbytes = 3;
3108 }
3109 else bravalue = OP_BRA + *brackets;
3110 }
3111
3112 /* Process the nested bracketed regular expression. Assertions may not be
3113 repeated, but other kinds can be. We copy code into a non-register variable
3114 in order to be able to pass its address because some compilers complain
3115 otherwise. Pass in a new setting for the ims options if they have changed. */
3116
3117 previous = (bravalue >= OP_ONCE)? code : NULL;
3118 *code = bravalue;
3119 tempcode = code;
3120 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3121
3122 if (!compile_regex(
3123 newoptions, /* The complete new option state */
3124 options & PCRE_IMS, /* The previous ims option state */
3125 brackets, /* Extracting bracket count */
3126 &tempcode, /* Where to put code (updated) */
3127 &ptr, /* Input pointer (updated) */
3128 errorcodeptr, /* Where to put an error message */
3129 (bravalue == OP_ASSERTBACK ||
3130 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3131 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3132 &subfirstbyte, /* For possible first char */
3133 &subreqbyte, /* For possible last char */
3134 bcptr, /* Current branch chain */
3135 cd)) /* Tables block */
3136 goto FAILED;
3137
3138 /* At the end of compiling, code is still pointing to the start of the
3139 group, while tempcode has been updated to point past the end of the group
3140 and any option resetting that may follow it. The pattern pointer (ptr)
3141 is expected to be on the closing bracket; this is checked below. */
3142
3143 /* If this is a conditional bracket, check that there are no more than
3144 two branches in the group. */
3145
3146 else if (bravalue == OP_COND)
3147 {
3148 uschar *tc = code;
3149 condcount = 0;
3150
3151 do {
3152 condcount++;
3153 tc += GET(tc,1);
3154 }
3155 while (*tc != OP_KET);
3156
3157 if (condcount > 2)
3158 {
3159 *errorcodeptr = ERR27;
3160 goto FAILED;
3161 }
3162
3163 /* If there is just one branch, we must not make use of its firstbyte or
3164 reqbyte, because this is equivalent to an empty second branch. */
3165
3166 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3167 }
3168
3169 /* Handle updating of the required and first characters. Update for normal
3170 brackets of all kinds, and conditions with two branches (see code above).
3171 If the bracket is followed by a quantifier with zero repeat, we have to
3172 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3173 main loop so that they can be accessed for the back off. */
3174
3175 zeroreqbyte = reqbyte;
3176 zerofirstbyte = firstbyte;
3177 groupsetfirstbyte = FALSE;
3178
3179 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3180 {
3181 /* If we have not yet set a firstbyte in this branch, take it from the
3182 subpattern, remembering that it was set here so that a repeat of more
3183 than one can replicate it as reqbyte if necessary. If the subpattern has
3184 no firstbyte, set "none" for the whole branch. In both cases, a zero
3185 repeat forces firstbyte to "none". */
3186
3187 if (firstbyte == REQ_UNSET)
3188 {
3189 if (subfirstbyte >= 0)
3190 {
3191 firstbyte = subfirstbyte;
3192 groupsetfirstbyte = TRUE;
3193 }
3194 else firstbyte = REQ_NONE;
3195 zerofirstbyte = REQ_NONE;
3196 }
3197
3198 /* If firstbyte was previously set, convert the subpattern's firstbyte
3199 into reqbyte if there wasn't one, using the vary flag that was in
3200 existence beforehand. */
3201
3202 else if (subfirstbyte >= 0 && subreqbyte < 0)
3203 subreqbyte = subfirstbyte | tempreqvary;
3204
3205 /* If the subpattern set a required byte (or set a first byte that isn't
3206 really the first byte - see above), set it. */
3207
3208 if (subreqbyte >= 0) reqbyte = subreqbyte;
3209 }
3210
3211 /* For a forward assertion, we take the reqbyte, if set. This can be
3212 helpful if the pattern that follows the assertion doesn't set a different
3213 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3214 for an assertion, however, because it leads to an incorrect effect for patterns
3215 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3216 of a firstbyte. This is overcome by a scan at the end if there's no
3217 firstbyte, looking for an asserted first char. */
3218
3219 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3220
3221 /* Now update the main code pointer to the end of the group. */
3222
3223 code = tempcode;
3224
3225 /* Error if hit end of pattern */
3226
3227 if (*ptr != ')')
3228 {
3229 *errorcodeptr = ERR14;
3230 goto FAILED;
3231 }
3232 break;
3233
3234 /* Check \ for being a real metacharacter; if not, fall through and handle
3235 it as a data character at the start of a string. Escape items are checked
3236 for validity in the pre-compiling pass. */
3237
3238 case '\\':
3239 tempptr = ptr;
3240 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3241
3242 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3243 are arranged to be the negation of the corresponding OP_values. For the
3244 back references, the values are ESC_REF plus the reference number. Only
3245 back references and those types that consume a character may be repeated.
3246 We can test for values between ESC_b and ESC_Z for the latter; this may
3247 have to change if any new ones are ever created. */
3248