0b439e1f7c9fc3f9d9ba1311c7561754a8ad50a0
[exim.git] / src / src / pcre / pcre_compile.c
1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.1 2005/06/15 08:57:10 ph10 Exp $ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
11 Copyright (c) 1997-2005 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
45
46
47 #include "pcre_internal.h"
48
49
50 /*************************************************
51 * Code parameters and static tables *
52 *************************************************/
53
54 /* Maximum number of items on the nested bracket stacks at compile time. This
55 applies to the nesting of all kinds of parentheses. It does not limit
56 un-nested, non-capturing parentheses. This number can be made bigger if
57 necessary - it is used to dimension one int and one unsigned char vector at
58 compile time. */
59
60 #define BRASTACK_SIZE 200
61
62
63 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
64 are simple data values; negative values are for special things like \d and so
65 on. Zero means further processing is needed (for things like \x), or the escape
66 is invalid. */
67
68 #if !EBCDIC /* This is the "normal" table for ASCII systems */
69 static const short int escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
78 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
79 0, 0, -ESC_z /* x - z */
80 };
81
82 #else /* This is the "abnormal" table for EBCDIC systems */
83 static const short int escapes[] = {
84 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
85 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
86 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
87 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
88 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
89 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
90 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
91 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
92 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
93 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
94 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
95 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
96 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
97 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
98 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
99 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
100 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
101 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
102 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
103 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
104 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
105 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
106 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
107 };
108 #endif
109
110
/* Tables of names of POSIX character classes and their lengths. The list of
lengths is terminated by a zero length entry. The first three names must be
alpha, lower and upper (in the order used below), as this is assumed for
handling case independence. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
119
120 static const uschar posix_name_lengths[] = {
121 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
122
123 /* Table of class bit maps for each POSIX class; up to three may be combined
124 to form the class. The table for [:blank:] is dynamically modified to remove
125 the vertical space characters. */
126
127 static const int posix_class_maps[] = {
128 cbit_lower, cbit_upper, -1, /* alpha */
129 cbit_lower, -1, -1, /* lower */
130 cbit_upper, -1, -1, /* upper */
131 cbit_digit, cbit_lower, cbit_upper, /* alnum */
132 cbit_print, cbit_cntrl, -1, /* ascii */
133 cbit_space, -1, -1, /* blank - a GNU extension */
134 cbit_cntrl, -1, -1, /* cntrl */
135 cbit_digit, -1, -1, /* digit */
136 cbit_graph, -1, -1, /* graph */
137 cbit_print, -1, -1, /* print */
138 cbit_punct, -1, -1, /* punct */
139 cbit_space, -1, -1, /* space */
140 cbit_word, -1, -1, /* word - a Perl extension */
141 cbit_xdigit,-1, -1 /* xdigit */
142 };
143
144
/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. The table is indexed by error number: as far
as can be seen from the ERRnn codes used in this file, entry n is the text for
ERRn (for example, ERR1 is raised for a backslash at the end of the pattern).
The strings and the pointer array itself are read-only, so both are declared
const. */

static const char *const error_texts[] = {
  /* 0 */
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error after (?P",
  "two named groups have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p"
};
207
208
/* Private table that identifies decimal and hexadecimal digits while a
pattern is being compiled. The tables in chartables depend on the locale and
may mark arbitrary characters as digits, but the compiling code expects to
handle only 0-9, a-z, and A-Z as digits. Hence this fixed table: it costs 256
bytes, but a table lookup was found to be faster than character value tests,
and some applications want PCRE to compile efficiently as well as match
efficiently.

The bit values are the same as those used in chartables, so that ctype_digit
and ctype_xdigit can be used with this table:

  0x04   decimal digit
  0x08   hexadecimal digit

Each row below holds eight entries; the comment gives the code point of the
first entry in the row. */

#if !EBCDIC    /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  /* 0x00 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x08 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x18 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* space - ' */
  /* 0x28 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* ( - / */
  /* 0x30 */  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0 - 7 */
  /* 0x38 */  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /* 8 - ? */
  /* 0x40 */  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* @ - G */
  /* 0x48 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* H - O */
  /* 0x50 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* P - W */
  /* 0x58 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* X - _ */
  /* 0x60 */  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* ` - g */
  /* 0x68 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* h - o */
  /* 0x70 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* p - w */
  /* 0x78 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* x - 127 */
  /* 0x80 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x88 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x98 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else          /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  /* 0x00 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x08 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x18 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x28 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x38 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x48 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x58 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x68 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x78 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* a - f are hex digits */
  /* 0x88 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x98 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* A - F are hex digits */
  /* 0xC8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE8 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0 - 7 */
  /* 0xF8 */  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /* 8 - 9 */
  };

/* Partial duplicate of the character table for EBCDIC. In this file it is
used by check_escape(), which tests an entry against the mask 0x0E to decide
whether a character is alphameric. */

static const unsigned char ebcdic_chartab[] =
  {
  /* 0x00 */  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x08 */  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 0x10 */  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x18 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x28 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x38 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x48 */  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 0x50 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x58 */  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 0x60 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x68 */  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 0x70 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x78 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* 0x88 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0x98 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0xA8 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* 0xC8 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0xD8 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0xE8 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,
  /* 0xF8 */  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };
#endif
331
332
333 /* Definition to allow mutual recursion */
334
335 static BOOL
336 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
337 int *, int *, branch_chain *, compile_data *);
338
339
340
341 /*************************************************
342 * Handle escapes *
343 *************************************************/
344
345 /* This function is called when a \ has been encountered. It either returns a
346 positive value for a simple escape such as \n, or a negative value which
347 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
348 a positive value greater than 255 may be returned. On entry, ptr is pointing at
349 the \. On exit, it is on the final character of the escape sequence.
350
351 Arguments:
352 ptrptr points to the pattern position pointer
353 errorcodeptr points to the errorcode variable
354 bracount number of previous extracting brackets
355 options the options bits
356 isclass TRUE if inside a character class
357
358 Returns: zero or positive => a data character
359 negative => a special escape sequence
360 on error, errorptr is set
361 */
362
363 static int
364 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
365 int options, BOOL isclass)
366 {
367 const uschar *ptr = *ptrptr;
368 int c, i;
369
370 /* If backslash is at the end of the pattern, it's an error. */
371
372 c = *(++ptr);
373 if (c == 0) *errorcodeptr = ERR1;
374
375 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
376 a table. A non-zero result is something that can be returned immediately.
377 Otherwise further processing may be required. */
378
379 #if !EBCDIC /* ASCII coding */
380 else if (c < '0' || c > 'z') {} /* Not alphameric */
381 else if ((i = escapes[c - '0']) != 0) c = i;
382
383 #else /* EBCDIC coding */
384 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
385 else if ((i = escapes[c - 0x48]) != 0) c = i;
386 #endif
387
388 /* Escapes that need further processing, or are illegal. */
389
390 else
391 {
392 const uschar *oldptr;
393 switch (c)
394 {
395 /* A number of Perl escapes are not handled by PCRE. We give an explicit
396 error. */
397
398 case 'l':
399 case 'L':
400 case 'N':
401 case 'u':
402 case 'U':
403 *errorcodeptr = ERR37;
404 break;
405
406 /* The handling of escape sequences consisting of a string of digits
407 starting with one that is not zero is not straightforward. By experiment,
408 the way Perl works seems to be as follows:
409
410 Outside a character class, the digits are read as a decimal number. If the
411 number is less than 10, or if there are that many previous extracting
412 left brackets, then it is a back reference. Otherwise, up to three octal
413 digits are read to form an escaped byte. Thus \123 is likely to be octal
414 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
415 value is greater than 377, the least significant 8 bits are taken. Inside a
416 character class, \ followed by a digit is always an octal number. */
417
418 case '1': case '2': case '3': case '4': case '5':
419 case '6': case '7': case '8': case '9':
420
421 if (!isclass)
422 {
423 oldptr = ptr;
424 c -= '0';
425 while ((digitab[ptr[1]] & ctype_digit) != 0)
426 c = c * 10 + *(++ptr) - '0';
427 if (c < 10 || c <= bracount)
428 {
429 c = -(ESC_REF + c);
430 break;
431 }
432 ptr = oldptr; /* Put the pointer back and fall through */
433 }
434
435 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
436 generates a binary zero byte and treats the digit as a following literal.
437 Thus we have to pull back the pointer by one. */
438
439 if ((c = *ptr) >= '8')
440 {
441 ptr--;
442 c = 0;
443 break;
444 }
445
446 /* \0 always starts an octal number, but we may drop through to here with a
447 larger first octal digit. */
448
449 case '0':
450 c -= '0';
451 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
452 c = c * 8 + *(++ptr) - '0';
453 c &= 255; /* Take least significant 8 bits */
454 break;
455
456 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
457 which can be greater than 0xff, but only if the ddd are hex digits. */
458
459 case 'x':
460 #ifdef SUPPORT_UTF8
461 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
462 {
463 const uschar *pt = ptr + 2;
464 register int count = 0;
465 c = 0;
466 while ((digitab[*pt] & ctype_xdigit) != 0)
467 {
468 int cc = *pt++;
469 count++;
470 #if !EBCDIC /* ASCII coding */
471 if (cc >= 'a') cc -= 32; /* Convert to upper case */
472 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
473 #else /* EBCDIC coding */
474 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
475 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
476 #endif
477 }
478 if (*pt == '}')
479 {
480 if (c < 0 || count > 8) *errorcodeptr = ERR34;
481 ptr = pt;
482 break;
483 }
484 /* If the sequence of hex digits does not end with '}', then we don't
485 recognize this construct; fall through to the normal \x handling. */
486 }
487 #endif
488
489 /* Read just a single hex char */
490
491 c = 0;
492 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
493 {
494 int cc; /* Some compilers don't like ++ */
495 cc = *(++ptr); /* in initializers */
496 #if !EBCDIC /* ASCII coding */
497 if (cc >= 'a') cc -= 32; /* Convert to upper case */
498 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
499 #else /* EBCDIC coding */
500 if (cc <= 'z') cc += 64; /* Convert to upper case */
501 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
502 #endif
503 }
504 break;
505
506 /* Other special escapes not starting with a digit are straightforward */
507
508 case 'c':
509 c = *(++ptr);
510 if (c == 0)
511 {
512 *errorcodeptr = ERR2;
513 return 0;
514 }
515
516 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
517 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
518 (However, an EBCDIC equivalent has now been added.) */
519
520 #if !EBCDIC /* ASCII coding */
521 if (c >= 'a' && c <= 'z') c -= 32;
522 c ^= 0x40;
523 #else /* EBCDIC coding */
524 if (c >= 'a' && c <= 'z') c += 64;
525 c ^= 0xC0;
526 #endif
527 break;
528
529 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
530 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
531 for Perl compatibility, it is a literal. This code looks a bit odd, but
532 there used to be some cases other than the default, and there may be again
533 in future, so I haven't "optimized" it. */
534
535 default:
536 if ((options & PCRE_EXTRA) != 0) switch(c)
537 {
538 default:
539 *errorcodeptr = ERR3;
540 break;
541 }
542 break;
543 }
544 }
545
546 *ptrptr = ptr;
547 return c;
548 }
549
550
551
552 #ifdef SUPPORT_UCP
553 /*************************************************
554 * Handle \P and \p *
555 *************************************************/
556
557 /* This function is called after \P or \p has been encountered, provided that
558 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
559 pointing at the P or p. On exit, it is pointing at the final character of the
560 escape sequence.
561
562 Argument:
563 ptrptr points to the pattern position pointer
564 negptr points to a boolean that is set TRUE for negation else FALSE
565 errorcodeptr points to the error code variable
566
567 Returns: value from ucp_type_table, or -1 for an invalid type
568 */
569
570 static int
571 get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
572 {
573 int c, i, bot, top;
574 const uschar *ptr = *ptrptr;
575 char name[4];
576
577 c = *(++ptr);
578 if (c == 0) goto ERROR_RETURN;
579
580 *negptr = FALSE;
581
582 /* \P or \p can be followed by a one- or two-character name in {}, optionally
583 preceded by ^ for negation. */
584
585 if (c == '{')
586 {
587 if (ptr[1] == '^')
588 {
589 *negptr = TRUE;
590 ptr++;
591 }
592 for (i = 0; i <= 2; i++)
593 {
594 c = *(++ptr);
595 if (c == 0) goto ERROR_RETURN;
596 if (c == '}') break;
597 name[i] = c;
598 }
599 if (c !='}') /* Try to distinguish error cases */
600 {
601 while (*(++ptr) != 0 && *ptr != '}');
602 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
603 }
604 name[i] = 0;
605 }
606
607 /* Otherwise there is just one following character */
608
609 else
610 {
611 name[0] = c;
612 name[1] = 0;
613 }
614
615 *ptrptr = ptr;
616
617 /* Search for a recognized property name using binary chop */
618
619 bot = 0;
620 top = _pcre_utt_size;
621
622 while (bot < top)
623 {
624 i = (bot + top)/2;
625 c = strcmp(name, _pcre_utt[i].name);
626 if (c == 0) return _pcre_utt[i].value;
627 if (c > 0) bot = i + 1; else top = i;
628 }
629
630 UNKNOWN_RETURN:
631 *errorcodeptr = ERR47;
632 *ptrptr = ptr;
633 return -1;
634
635 ERROR_RETURN:
636 *errorcodeptr = ERR46;
637 *ptrptr = ptr;
638 return -1;
639 }
640 #endif
641
642
643
644
645 /*************************************************
646 * Check for counted repeat *
647 *************************************************/
648
649 /* This function is called when a '{' is encountered in a place where it might
650 start a quantifier. It looks ahead to see if it really is a quantifier or not.
651 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
652 where the ddds are digits.
653
654 Arguments:
655 p pointer to the first char after '{'
656
657 Returns: TRUE or FALSE
658 */
659
660 static BOOL
661 is_counted_repeat(const uschar *p)
662 {
663 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
664 while ((digitab[*p] & ctype_digit) != 0) p++;
665 if (*p == '}') return TRUE;
666
667 if (*p++ != ',') return FALSE;
668 if (*p == '}') return TRUE;
669
670 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
671 while ((digitab[*p] & ctype_digit) != 0) p++;
672
673 return (*p == '}');
674 }
675
676
677
678 /*************************************************
679 * Read repeat counts *
680 *************************************************/
681
682 /* Read an item of the form {n,m} and return the values. This is called only
683 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
684 so the syntax is guaranteed to be correct, but we need to check the values.
685
686 Arguments:
687 p pointer to first char after '{'
688 minp pointer to int for min
689 maxp pointer to int for max
690 returned as -1 if no max
691 errorcodeptr points to error code variable
692
693 Returns: pointer to '}' on success;
694 current ptr on error, with errorcodeptr set non-zero
695 */
696
697 static const uschar *
698 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
699 {
700 int min = 0;
701 int max = -1;
702
703 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
704
705 if (*p == '}') max = min; else
706 {
707 if (*(++p) != '}')
708 {
709 max = 0;
710 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
711 if (max < min)
712 {
713 *errorcodeptr = ERR4;
714 return p;
715 }
716 }
717 }
718
719 /* Do paranoid checks, then fill in the required variables, and pass back the
720 pointer to the terminating '}'. */
721
722 if (min > 65535 || max > 65535)
723 *errorcodeptr = ERR5;
724 else
725 {
726 *minp = min;
727 *maxp = max;
728 }
729 return p;
730 }
731
732
733
734 /*************************************************
735 * Find first significant op code *
736 *************************************************/
737
738 /* This is called by several functions that scan a compiled expression looking
739 for a fixed first character, or an anchoring op code etc. It skips over things
740 that do not influence this. For some calls, a change of option is important.
741 For some calls, it makes sense to skip negative forward and all backward
742 assertions, and also the \b assertion; for others it does not.
743
744 Arguments:
745 code pointer to the start of the group
746 options pointer to external options
747 optbit the option bit whose changing is significant, or
748 zero if none are
749 skipassert TRUE if certain assertions are to be skipped
750
751 Returns: pointer to the first significant opcode
752 */
753
754 static const uschar*
755 first_significant_code(const uschar *code, int *options, int optbit,
756 BOOL skipassert)
757 {
758 for (;;)
759 {
760 switch ((int)*code)
761 {
762 case OP_OPT:
763 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
764 *options = (int)code[1];
765 code += 2;
766 break;
767
768 case OP_ASSERT_NOT:
769 case OP_ASSERTBACK:
770 case OP_ASSERTBACK_NOT:
771 if (!skipassert) return code;
772 do code += GET(code, 1); while (*code == OP_ALT);
773 code += _pcre_OP_lengths[*code];
774 break;
775
776 case OP_WORD_BOUNDARY:
777 case OP_NOT_WORD_BOUNDARY:
778 if (!skipassert) return code;
779 /* Fall through */
780
781 case OP_CALLOUT:
782 case OP_CREF:
783 case OP_BRANUMBER:
784 code += _pcre_OP_lengths[*code];
785 break;
786
787 default:
788 return code;
789 }
790 }
791 /* Control never reaches here */
792 }
793
794
795
796
797 /*************************************************
798 * Find the fixed length of a pattern *
799 *************************************************/
800
801 /* Scan a pattern and compute the fixed length of subject that will match it,
802 if the length is fixed. This is needed for dealing with backward assertions.
803 In UTF8 mode, the result is in characters rather than bytes.
804
805 Arguments:
806 code points to the start of the pattern (the bracket)
807 options the compiling options
808
809 Returns: the fixed length, or -1 if there is no fixed length,
810 or -2 if \C was encountered
811 */
812
813 static int
814 find_fixedlength(uschar *code, int options)
815 {
816 int length = -1;
817
818 register int branchlength = 0;
819 register uschar *cc = code + 1 + LINK_SIZE;
820
821 /* Scan along the opcodes for this branch. If we get to the end of the
822 branch, check the length against that of the other branches. */
823
824 for (;;)
825 {
826 int d;
827 register int op = *cc;
828 if (op >= OP_BRA) op = OP_BRA;
829
830 switch (op)
831 {
832 case OP_BRA:
833 case OP_ONCE:
834 case OP_COND:
835 d = find_fixedlength(cc, options);
836 if (d < 0) return d;
837 branchlength += d;
838 do cc += GET(cc, 1); while (*cc == OP_ALT);
839 cc += 1 + LINK_SIZE;
840 break;
841
842 /* Reached end of a branch; if it's a ket it is the end of a nested
843 call. If it's ALT it is an alternation in a nested call. If it is
844 END it's the end of the outer call. All can be handled by the same code. */
845
846 case OP_ALT:
847 case OP_KET:
848 case OP_KETRMAX:
849 case OP_KETRMIN:
850 case OP_END:
851 if (length < 0) length = branchlength;
852 else if (length != branchlength) return -1;
853 if (*cc != OP_ALT) return length;
854 cc += 1 + LINK_SIZE;
855 branchlength = 0;
856 break;
857
858 /* Skip over assertive subpatterns */
859
860 case OP_ASSERT:
861 case OP_ASSERT_NOT:
862 case OP_ASSERTBACK:
863 case OP_ASSERTBACK_NOT:
864 do cc += GET(cc, 1); while (*cc == OP_ALT);
865 /* Fall through */
866
867 /* Skip over things that don't match chars */
868
869 case OP_REVERSE:
870 case OP_BRANUMBER:
871 case OP_CREF:
872 case OP_OPT:
873 case OP_CALLOUT:
874 case OP_SOD:
875 case OP_SOM:
876 case OP_EOD:
877 case OP_EODN:
878 case OP_CIRC:
879 case OP_DOLL:
880 case OP_NOT_WORD_BOUNDARY:
881 case OP_WORD_BOUNDARY:
882 cc += _pcre_OP_lengths[*cc];
883 break;
884
885 /* Handle literal characters */
886
887 case OP_CHAR:
888 case OP_CHARNC:
889 branchlength++;
890 cc += 2;
891 #ifdef SUPPORT_UTF8
892 if ((options & PCRE_UTF8) != 0)
893 {
894 while ((*cc & 0xc0) == 0x80) cc++;
895 }
896 #endif
897 break;
898
899 /* Handle exact repetitions. The count is already in characters, but we
900 need to skip over a multibyte character in UTF8 mode. */
901
902 case OP_EXACT:
903 branchlength += GET2(cc,1);
904 cc += 4;
905 #ifdef SUPPORT_UTF8
906 if ((options & PCRE_UTF8) != 0)
907 {
908 while((*cc & 0x80) == 0x80) cc++;
909 }
910 #endif
911 break;
912
913 case OP_TYPEEXACT:
914 branchlength += GET2(cc,1);
915 cc += 4;
916 break;
917
918 /* Handle single-char matchers */
919
920 case OP_PROP:
921 case OP_NOTPROP:
922 cc++;
923 /* Fall through */
924
925 case OP_NOT_DIGIT:
926 case OP_DIGIT:
927 case OP_NOT_WHITESPACE:
928 case OP_WHITESPACE:
929 case OP_NOT_WORDCHAR:
930 case OP_WORDCHAR:
931 case OP_ANY:
932 branchlength++;
933 cc++;
934 break;
935
936 /* The single-byte matcher isn't allowed */
937
938 case OP_ANYBYTE:
939 return -2;
940
941 /* Check a class for variable quantification */
942
943 #ifdef SUPPORT_UTF8
944 case OP_XCLASS:
945 cc += GET(cc, 1) - 33;
946 /* Fall through */
947 #endif
948
949 case OP_CLASS:
950 case OP_NCLASS:
951 cc += 33;
952
953 switch (*cc)
954 {
955 case OP_CRSTAR:
956 case OP_CRMINSTAR:
957 case OP_CRQUERY:
958 case OP_CRMINQUERY:
959 return -1;
960
961 case OP_CRRANGE:
962 case OP_CRMINRANGE:
963 if (GET2(cc,1) != GET2(cc,3)) return -1;
964 branchlength += GET2(cc,1);
965 cc += 5;
966 break;
967
968 default:
969 branchlength++;
970 }
971 break;
972
973 /* Anything else is variable length */
974
975 default:
976 return -1;
977 }
978 }
979 /* Control never gets here */
980 }
981
982
983
984
985 /*************************************************
986 * Scan compiled regex for numbered bracket *
987 *************************************************/
988
989 /* This little function scans through a compiled pattern until it finds a
990 capturing bracket with the given number.
991
992 Arguments:
993 code points to start of expression
994 utf8 TRUE in UTF-8 mode
995 number the required bracket number
996
997 Returns: pointer to the opcode for the bracket, or NULL if not found
998 */
999
1000 static const uschar *
1001 find_bracket(const uschar *code, BOOL utf8, int number)
1002 {
1003 #ifndef SUPPORT_UTF8
1004 utf8 = utf8; /* Stop pedantic compilers complaining */
1005 #endif
1006
1007 for (;;)
1008 {
1009 register int c = *code;
1010 if (c == OP_END) return NULL;
1011 else if (c > OP_BRA)
1012 {
1013 int n = c - OP_BRA;
1014 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1015 if (n == number) return (uschar *)code;
1016 code += _pcre_OP_lengths[OP_BRA];
1017 }
1018 else
1019 {
1020 code += _pcre_OP_lengths[c];
1021
1022 #ifdef SUPPORT_UTF8
1023
1024 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1025 by a multi-byte character. The length in the table is a minimum, so we have
1026 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1027 can use relatively efficient code. */
1028
1029 if (utf8) switch(c)
1030 {
1031 case OP_CHAR:
1032 case OP_CHARNC:
1033 case OP_EXACT:
1034 case OP_UPTO:
1035 case OP_MINUPTO:
1036 case OP_STAR:
1037 case OP_MINSTAR:
1038 case OP_PLUS:
1039 case OP_MINPLUS:
1040 case OP_QUERY:
1041 case OP_MINQUERY:
1042 while ((*code & 0xc0) == 0x80) code++;
1043 break;
1044
1045 /* XCLASS is used for classes that cannot be represented just by a bit
1046 map. This includes negated single high-valued characters. The length in
1047 the table is zero; the actual length is stored in the compiled code. */
1048
1049 case OP_XCLASS:
1050 code += GET(code, 1) + 1;
1051 break;
1052 }
1053 #endif
1054 }
1055 }
1056 }
1057
1058
1059
1060 /*************************************************
1061 * Scan compiled regex for recursion reference *
1062 *************************************************/
1063
1064 /* This little function scans through a compiled pattern until it finds an
1065 instance of OP_RECURSE.
1066
1067 Arguments:
1068 code points to start of expression
1069 utf8 TRUE in UTF-8 mode
1070
1071 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1072 */
1073
1074 static const uschar *
1075 find_recurse(const uschar *code, BOOL utf8)
1076 {
1077 #ifndef SUPPORT_UTF8
1078 utf8 = utf8; /* Stop pedantic compilers complaining */
1079 #endif
1080
1081 for (;;)
1082 {
1083 register int c = *code;
1084 if (c == OP_END) return NULL;
1085 else if (c == OP_RECURSE) return code;
1086 else if (c > OP_BRA)
1087 {
1088 code += _pcre_OP_lengths[OP_BRA];
1089 }
1090 else
1091 {
1092 code += _pcre_OP_lengths[c];
1093
1094 #ifdef SUPPORT_UTF8
1095
1096 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1097 by a multi-byte character. The length in the table is a minimum, so we have
1098 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1099 can use relatively efficient code. */
1100
1101 if (utf8) switch(c)
1102 {
1103 case OP_CHAR:
1104 case OP_CHARNC:
1105 case OP_EXACT:
1106 case OP_UPTO:
1107 case OP_MINUPTO:
1108 case OP_STAR:
1109 case OP_MINSTAR:
1110 case OP_PLUS:
1111 case OP_MINPLUS:
1112 case OP_QUERY:
1113 case OP_MINQUERY:
1114 while ((*code & 0xc0) == 0x80) code++;
1115 break;
1116
1117 /* XCLASS is used for classes that cannot be represented just by a bit
1118 map. This includes negated single high-valued characters. The length in
1119 the table is zero; the actual length is stored in the compiled code. */
1120
1121 case OP_XCLASS:
1122 code += GET(code, 1) + 1;
1123 break;
1124 }
1125 #endif
1126 }
1127 }
1128 }
1129
1130
1131
1132 /*************************************************
1133 * Scan compiled branch for non-emptiness *
1134 *************************************************/
1135
1136 /* This function scans through a branch of a compiled pattern to see whether it
1137 can match the empty string or not. It is called only from could_be_empty()
1138 below. Note that first_significant_code() skips over assertions. If we hit an
1139 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1140 whose current branch will already have been scanned.
1141
1142 Arguments:
1143 code points to start of search
1144 endcode points to where to stop
1145 utf8 TRUE if in UTF8 mode
1146
1147 Returns: TRUE if what is matched could be empty
1148 */
1149
1150 static BOOL
1151 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1152 {
1153 register int c;
1154 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1155 code < endcode;
1156 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1157 {
1158 const uschar *ccode;
1159
1160 c = *code;
1161
1162 if (c >= OP_BRA)
1163 {
1164 BOOL empty_branch;
1165 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1166
1167 /* Scan a closed bracket */
1168
1169 empty_branch = FALSE;
1170 do
1171 {
1172 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1173 empty_branch = TRUE;
1174 code += GET(code, 1);
1175 }
1176 while (*code == OP_ALT);
1177 if (!empty_branch) return FALSE; /* All branches are non-empty */
1178 code += 1 + LINK_SIZE;
1179 c = *code;
1180 }
1181
1182 else switch (c)
1183 {
1184 /* Check for quantifiers after a class */
1185
1186 #ifdef SUPPORT_UTF8
1187 case OP_XCLASS:
1188 ccode = code + GET(code, 1);
1189 goto CHECK_CLASS_REPEAT;
1190 #endif
1191
1192 case OP_CLASS:
1193 case OP_NCLASS:
1194 ccode = code + 33;
1195
1196 #ifdef SUPPORT_UTF8
1197 CHECK_CLASS_REPEAT:
1198 #endif
1199
1200 switch (*ccode)
1201 {
1202 case OP_CRSTAR: /* These could be empty; continue */
1203 case OP_CRMINSTAR:
1204 case OP_CRQUERY:
1205 case OP_CRMINQUERY:
1206 break;
1207
1208 default: /* Non-repeat => class must match */
1209 case OP_CRPLUS: /* These repeats aren't empty */
1210 case OP_CRMINPLUS:
1211 return FALSE;
1212
1213 case OP_CRRANGE:
1214 case OP_CRMINRANGE:
1215 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1216 break;
1217 }
1218 break;
1219
1220 /* Opcodes that must match a character */
1221
1222 case OP_PROP:
1223 case OP_NOTPROP:
1224 case OP_EXTUNI:
1225 case OP_NOT_DIGIT:
1226 case OP_DIGIT:
1227 case OP_NOT_WHITESPACE:
1228 case OP_WHITESPACE:
1229 case OP_NOT_WORDCHAR:
1230 case OP_WORDCHAR:
1231 case OP_ANY:
1232 case OP_ANYBYTE:
1233 case OP_CHAR:
1234 case OP_CHARNC:
1235 case OP_NOT:
1236 case OP_PLUS:
1237 case OP_MINPLUS:
1238 case OP_EXACT:
1239 case OP_NOTPLUS:
1240 case OP_NOTMINPLUS:
1241 case OP_NOTEXACT:
1242 case OP_TYPEPLUS:
1243 case OP_TYPEMINPLUS:
1244 case OP_TYPEEXACT:
1245 return FALSE;
1246
1247 /* End of branch */
1248
1249 case OP_KET:
1250 case OP_KETRMAX:
1251 case OP_KETRMIN:
1252 case OP_ALT:
1253 return TRUE;
1254
1255 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1256 followed by a multibyte character */
1257
1258 #ifdef SUPPORT_UTF8
1259 case OP_STAR:
1260 case OP_MINSTAR:
1261 case OP_QUERY:
1262 case OP_MINQUERY:
1263 case OP_UPTO:
1264 case OP_MINUPTO:
1265 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1266 break;
1267 #endif
1268 }
1269 }
1270
1271 return TRUE;
1272 }
1273
1274
1275
1276 /*************************************************
1277 * Scan compiled regex for non-emptiness *
1278 *************************************************/
1279
1280 /* This function is called to check for left recursive calls. We want to check
1281 the current branch of the current pattern to see if it could match the empty
1282 string. If it could, we must look outwards for branches at other levels,
1283 stopping when we pass beyond the bracket which is the subject of the recursion.
1284
1285 Arguments:
1286 code points to start of the recursion
1287 endcode points to where to stop (current RECURSE item)
1288 bcptr points to the chain of current (unclosed) branch starts
1289 utf8 TRUE if in UTF-8 mode
1290
1291 Returns: TRUE if what is matched could be empty
1292 */
1293
1294 static BOOL
1295 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1296 BOOL utf8)
1297 {
1298 while (bcptr != NULL && bcptr->current >= code)
1299 {
1300 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1301 bcptr = bcptr->outer;
1302 }
1303 return TRUE;
1304 }
1305
1306
1307
1308 /*************************************************
1309 * Check for POSIX class syntax *
1310 *************************************************/
1311
1312 /* This function is called when the sequence "[:" or "[." or "[=" is
1313 encountered in a character class. It checks whether this is followed by an
1314 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1315 ".]" or "=]".
1316
1317 Argument:
1318 ptr pointer to the initial [
1319 endptr where to return the end pointer
1320 cd pointer to compile data
1321
1322 Returns: TRUE or FALSE
1323 */
1324
1325 static BOOL
1326 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1327 {
1328 int terminator; /* Don't combine these lines; the Solaris cc */
1329 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1330 if (*(++ptr) == '^') ptr++;
1331 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1332 if (*ptr == terminator && ptr[1] == ']')
1333 {
1334 *endptr = ptr;
1335 return TRUE;
1336 }
1337 return FALSE;
1338 }
1339
1340
1341
1342
1343 /*************************************************
1344 * Check POSIX class name *
1345 *************************************************/
1346
1347 /* This function is called to check the name given in a POSIX-style class entry
1348 such as [:alnum:].
1349
1350 Arguments:
1351 ptr points to the first letter
1352 len the length of the name
1353
1354 Returns: a value representing the name, or -1 if unknown
1355 */
1356
1357 static int
1358 check_posix_name(const uschar *ptr, int len)
1359 {
1360 register int yield = 0;
1361 while (posix_name_lengths[yield] != 0)
1362 {
1363 if (len == posix_name_lengths[yield] &&
1364 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1365 yield++;
1366 }
1367 return -1;
1368 }
1369
1370
1371 /*************************************************
1372 * Adjust OP_RECURSE items in repeated group *
1373 *************************************************/
1374
1375 /* OP_RECURSE items contain an offset from the start of the regex to the group
1376 that is referenced. This means that groups can be replicated for fixed
1377 repetition simply by copying (because the recursion is allowed to refer to
1378 earlier groups that are outside the current group). However, when a group is
1379 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1380 it, after it has been compiled. This means that any OP_RECURSE items within it
1381 that refer to the group itself or any contained groups have to have their
1382 offsets adjusted. That is the job of this function. Before it is called, the
1383 partially compiled regex must be temporarily terminated with OP_END.
1384
1385 Arguments:
1386 group points to the start of the group
1387 adjust the amount by which the group is to be moved
1388 utf8 TRUE in UTF-8 mode
1389 cd contains pointers to tables etc.
1390
1391 Returns: nothing
1392 */
1393
1394 static void
1395 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1396 {
1397 uschar *ptr = group;
1398 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1399 {
1400 int offset = GET(ptr, 1);
1401 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1402 ptr += 1 + LINK_SIZE;
1403 }
1404 }
1405
1406
1407
1408 /*************************************************
1409 * Insert an automatic callout point *
1410 *************************************************/
1411
1412 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1413 callout points before each pattern item.
1414
1415 Arguments:
1416 code current code pointer
1417 ptr current pattern pointer
1418 cd pointers to tables etc
1419
1420 Returns: new code pointer
1421 */
1422
1423 static uschar *
1424 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1425 {
1426 *code++ = OP_CALLOUT;
1427 *code++ = 255;
1428 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1429 PUT(code, LINK_SIZE, 0); /* Default length */
1430 return code + 2*LINK_SIZE;
1431 }
1432
1433
1434
1435 /*************************************************
1436 * Complete a callout item *
1437 *************************************************/
1438
1439 /* A callout item contains the length of the next item in the pattern, which
1440 we can't fill in till after we have reached the relevant point. This is used
1441 for both automatic and manual callouts.
1442
1443 Arguments:
1444 previous_callout points to previous callout item
1445 ptr current pattern pointer
1446 cd pointers to tables etc
1447
1448 Returns: nothing
1449 */
1450
1451 static void
1452 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1453 {
1454 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1455 PUT(previous_callout, 2 + LINK_SIZE, length);
1456 }
1457
1458
1459
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

The search first finds a character for which _pcre_ucp_findchar() yields ucp_L
and a non-zero other case; that other case starts the returned range. The
range is then extended for as long as the following characters are also ucp_L
and their other cases continue consecutively.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int ch = *cptr;
int chartype, othercase, expected;

/* Find the first character that has an other case. */

while (ch <= d)
  {
  if (_pcre_ucp_findchar(ch, &chartype, &othercase) == ucp_L &&
      othercase != 0)
    break;
  ch++;
  }

if (ch > d) return FALSE;

*ocptr = othercase;
expected = othercase + 1;

/* Extend while the other-case values stay consecutive. */

ch++;
while (ch <= d &&
       _pcre_ucp_findchar(ch, &chartype, &othercase) == ucp_L &&
       othercase == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1509
1510
1511 /*************************************************
1512 * Compile one branch *
1513 *************************************************/
1514
1515 /* Scan the pattern, compiling it into the code vector. If the options are
1516 changed during the branch, the pointer is used to change the external options
1517 bits.
1518
1519 Arguments:
1520 optionsptr pointer to the option bits
1521 brackets points to number of extracting brackets used
1522 codeptr points to the pointer to the current code point
1523 ptrptr points to the current pattern pointer
1524 errorcodeptr points to error code variable
1525 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1526 reqbyteptr set to the last literal character required, else < 0
1527 bcptr points to current branch chain
1528 cd contains pointers to tables etc.
1529
1530 Returns: TRUE on success
1531 FALSE, with *errorcodeptr set non-zero on error
1532 */
1533
1534 static BOOL
1535 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1536 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1537 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1538 {
1539 int repeat_type, op_type;
1540 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1541 int bravalue = 0;
1542 int greedy_default, greedy_non_default;
1543 int firstbyte, reqbyte;
1544 int zeroreqbyte, zerofirstbyte;
1545 int req_caseopt, reqvary, tempreqvary;
1546 int condcount = 0;
1547 int options = *optionsptr;
1548 int after_manual_callout = 0;
1549 register int c;
1550 register uschar *code = *codeptr;
1551 uschar *tempcode;
1552 BOOL inescq = FALSE;
1553 BOOL groupsetfirstbyte = FALSE;
1554 const uschar *ptr = *ptrptr;
1555 const uschar *tempptr;
1556 uschar *previous = NULL;
1557 uschar *previous_callout = NULL;
1558 uschar classbits[32];
1559
1560 #ifdef SUPPORT_UTF8
1561 BOOL class_utf8;
1562 BOOL utf8 = (options & PCRE_UTF8) != 0;
1563 uschar *class_utf8data;
1564 uschar utf8_char[6];
1565 #else
1566 BOOL utf8 = FALSE;
1567 #endif
1568
1569 /* Set up the default and non-default settings for greediness */
1570
1571 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1572 greedy_non_default = greedy_default ^ 1;
1573
1574 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1575 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1576 matches a non-fixed char first char; reqbyte just remains unset if we never
1577 find one.
1578
1579 When we hit a repeat whose minimum is zero, we may have to adjust these values
1580 to take the zero repeat into account. This is implemented by setting them to
1581 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1582 item types that can be repeated set these backoff variables appropriately. */
1583
1584 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1585
1586 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1587 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1588 value > 255. It is added into the firstbyte or reqbyte variables to record the
1589 case status of the value. This is used only for ASCII characters. */
1590
1591 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1592
1593 /* Switch on next character until the end of the branch */
1594
1595 for (;; ptr++)
1596 {
1597 BOOL negate_class;
1598 BOOL possessive_quantifier;
1599 BOOL is_quantifier;
1600 int class_charcount;
1601 int class_lastchar;
1602 int newoptions;
1603 int recno;
1604 int skipbytes;
1605 int subreqbyte;
1606 int subfirstbyte;
1607 int mclength;
1608 uschar mcbuffer[8];
1609
1610 /* Next byte in the pattern */
1611
1612 c = *ptr;
1613
1614 /* If in \Q...\E, check for the end; if not, we have a literal */
1615
1616 if (inescq && c != 0)
1617 {
1618 if (c == '\\' && ptr[1] == 'E')
1619 {
1620 inescq = FALSE;
1621 ptr++;
1622 continue;
1623 }
1624 else
1625 {
1626 if (previous_callout != NULL)
1627 {
1628 complete_callout(previous_callout, ptr, cd);
1629 previous_callout = NULL;
1630 }
1631 if ((options & PCRE_AUTO_CALLOUT) != 0)
1632 {
1633 previous_callout = code;
1634 code = auto_callout(code, ptr, cd);
1635 }
1636 goto NORMAL_CHAR;
1637 }
1638 }
1639
1640 /* Fill in length of a previous callout, except when the next thing is
1641 a quantifier. */
1642
1643 is_quantifier = c == '*' || c == '+' || c == '?' ||
1644 (c == '{' && is_counted_repeat(ptr+1));
1645
1646 if (!is_quantifier && previous_callout != NULL &&
1647 after_manual_callout-- <= 0)
1648 {
1649 complete_callout(previous_callout, ptr, cd);
1650 previous_callout = NULL;
1651 }
1652
1653 /* In extended mode, skip white space and comments */
1654
1655 if ((options & PCRE_EXTENDED) != 0)
1656 {
1657 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1658 if (c == '#')
1659 {
1660 /* The space before the ; is to avoid a warning on a silly compiler
1661 on the Macintosh. */
1662 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1663 if (c != 0) continue; /* Else fall through to handle end of string */
1664 }
1665 }
1666
1667 /* No auto callout for quantifiers. */
1668
1669 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1670 {
1671 previous_callout = code;
1672 code = auto_callout(code, ptr, cd);
1673 }
1674
1675 switch(c)
1676 {
1677 /* The branch terminates at end of string, |, or ). */
1678
1679 case 0:
1680 case '|':
1681 case ')':
1682 *firstbyteptr = firstbyte;
1683 *reqbyteptr = reqbyte;
1684 *codeptr = code;
1685 *ptrptr = ptr;
1686 return TRUE;
1687
1688 /* Handle single-character metacharacters. In multiline mode, ^ disables
1689 the setting of any following char as a first character. */
1690
1691 case '^':
1692 if ((options & PCRE_MULTILINE) != 0)
1693 {
1694 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1695 }
1696 previous = NULL;
1697 *code++ = OP_CIRC;
1698 break;
1699
1700 case '$':
1701 previous = NULL;
1702 *code++ = OP_DOLL;
1703 break;
1704
1705 /* There can never be a first char if '.' is first, whatever happens about
1706 repeats. The value of reqbyte doesn't change either. */
1707
1708 case '.':
1709 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1710 zerofirstbyte = firstbyte;
1711 zeroreqbyte = reqbyte;
1712 previous = code;
1713 *code++ = OP_ANY;
1714 break;
1715
1716 /* Character classes. If the included characters are all < 255 in value, we
1717 build a 32-byte bitmap of the permitted characters, except in the special
1718 case where there is only one such character. For negated classes, we build
1719 the map as usual, then invert it at the end. However, we use a different
1720 opcode so that data characters > 255 can be handled correctly.
1721
1722 If the class contains characters outside the 0-255 range, a different
1723 opcode is compiled. It may optionally have a bit map for characters < 256,
1724 but those above are explicitly listed afterwards. A flag byte tells
1725 whether the bitmap is present, and whether this is a negated class or not.
1726 */
1727
1728 case '[':
1729 previous = code;
1730
1731 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1732 they are encountered at the top level, so we'll do that too. */
1733
1734 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1735 check_posix_syntax(ptr, &tempptr, cd))
1736 {
1737 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1738 goto FAILED;
1739 }
1740
1741 /* If the first character is '^', set the negation flag and skip it. */
1742
1743 if ((c = *(++ptr)) == '^')
1744 {
1745 negate_class = TRUE;
1746 c = *(++ptr);
1747 }
1748 else
1749 {
1750 negate_class = FALSE;
1751 }
1752
1753 /* Keep a count of chars with values < 256 so that we can optimize the case
1754 of just a single character (as long as it's < 256). For higher valued UTF-8
1755 characters, we don't yet do any optimization. */
1756
1757 class_charcount = 0;
1758 class_lastchar = -1;
1759
1760 #ifdef SUPPORT_UTF8
1761 class_utf8 = FALSE; /* No chars >= 256 */
1762 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1763 #endif
1764
1765 /* Initialize the 32-char bit map to all zeros. We have to build the
1766 map in a temporary bit of store, in case the class contains only 1
1767 character (< 256), because in that case the compiled code doesn't use the
1768 bit map. */
1769
1770 memset(classbits, 0, 32 * sizeof(uschar));
1771
1772 /* Process characters until ] is reached. By writing this as a "do" it
1773 means that an initial ] is taken as a data character. The first pass
1774 through the regex checked the overall syntax, so we don't need to be very
1775 strict here. At the start of the loop, c contains the first byte of the
1776 character. */
1777
1778 do
1779 {
1780 #ifdef SUPPORT_UTF8
1781 if (utf8 && c > 127)
1782 { /* Braces are required because the */
1783 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1784 }
1785 #endif
1786
1787 /* Inside \Q...\E everything is literal except \E */
1788
1789 if (inescq)
1790 {
1791 if (c == '\\' && ptr[1] == 'E')
1792 {
1793 inescq = FALSE;
1794 ptr++;
1795 continue;
1796 }
1797 else goto LONE_SINGLE_CHARACTER;
1798 }
1799
1800 /* Handle POSIX class names. Perl allows a negation extension of the
1801 form [:^name:]. A square bracket that doesn't match the syntax is
1802 treated as a literal. We also recognize the POSIX constructions
1803 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1804 5.6 and 5.8 do. */
1805
1806 if (c == '[' &&
1807 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1808 check_posix_syntax(ptr, &tempptr, cd))
1809 {
1810 BOOL local_negate = FALSE;
1811 int posix_class, i;
1812 register const uschar *cbits = cd->cbits;
1813
1814 if (ptr[1] != ':')
1815 {
1816 *errorcodeptr = ERR31;
1817 goto FAILED;
1818 }
1819
1820 ptr += 2;
1821 if (*ptr == '^')
1822 {
1823 local_negate = TRUE;
1824 ptr++;
1825 }
1826
1827 posix_class = check_posix_name(ptr, tempptr - ptr);
1828 if (posix_class < 0)
1829 {
1830 *errorcodeptr = ERR30;
1831 goto FAILED;
1832 }
1833
1834 /* If matching is caseless, upper and lower are converted to
1835 alpha. This relies on the fact that the class table starts with
1836 alpha, lower, upper as the first 3 entries. */
1837
1838 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1839 posix_class = 0;
1840
1841 /* Or into the map we are building up to 3 of the static class
1842 tables, or their negations. The [:blank:] class sets up the same
1843 chars as the [:space:] class (all white space). We remove the vertical
1844 white space chars afterwards. */
1845
1846 posix_class *= 3;
1847 for (i = 0; i < 3; i++)
1848 {
1849 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1850 int taboffset = posix_class_maps[posix_class + i];
1851 if (taboffset < 0) break;
1852 if (local_negate)
1853 {
1854 if (i == 0)
1855 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1856 else
1857 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1858 if (blankclass) classbits[1] |= 0x3c;
1859 }
1860 else
1861 {
1862 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1863 if (blankclass) classbits[1] &= ~0x3c;
1864 }
1865 }
1866
1867 ptr = tempptr + 1;
1868 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1869 continue; /* End of POSIX syntax handling */
1870 }
1871
1872 /* Backslash may introduce a single character, or it may introduce one
1873 of the specials, which just set a flag. Escaped items are checked for
1874 validity in the pre-compiling pass. The sequence \b is a special case.
1875 Inside a class (and only there) it is treated as backspace. Elsewhere
1876 it marks a word boundary. Other escapes have preset maps ready to
1877 or into the one we are building. We assume they have more than one
1878 character in them, so set class_charcount bigger than one. */
1879
1880 if (c == '\\')
1881 {
1882 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1883
1884 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1885 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1886 else if (-c == ESC_Q) /* Handle start of quoted string */
1887 {
1888 if (ptr[1] == '\\' && ptr[2] == 'E')
1889 {
1890 ptr += 2; /* avoid empty string */
1891 }
1892 else inescq = TRUE;
1893 continue;
1894 }
1895
1896 if (c < 0)
1897 {
1898 register const uschar *cbits = cd->cbits;
1899 class_charcount += 2; /* Greater than 1 is what matters */
1900 switch (-c)
1901 {
1902 case ESC_d:
1903 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1904 continue;
1905
1906 case ESC_D:
1907 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1908 continue;
1909
1910 case ESC_w:
1911 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1912 continue;
1913
1914 case ESC_W:
1915 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1916 continue;
1917
1918 case ESC_s:
1919 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1920 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1921 continue;
1922
1923 case ESC_S:
1924 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1925 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1926 continue;
1927
1928 #ifdef SUPPORT_UCP
1929 case ESC_p:
1930 case ESC_P:
1931 {
1932 BOOL negated;
1933 int property = get_ucp(&ptr, &negated, errorcodeptr);
1934 if (property < 0) goto FAILED;
1935 class_utf8 = TRUE;
1936 *class_utf8data++ = ((-c == ESC_p) != negated)?
1937 XCL_PROP : XCL_NOTPROP;
1938 *class_utf8data++ = property;
1939 class_charcount -= 2; /* Not a < 256 character */
1940 }
1941 continue;
1942 #endif
1943
1944 /* Unrecognized escapes are faulted if PCRE is running in its
1945 strict mode. By default, for compatibility with Perl, they are
1946 treated as literals. */
1947
1948 default:
1949 if ((options & PCRE_EXTRA) != 0)
1950 {
1951 *errorcodeptr = ERR7;
1952 goto FAILED;
1953 }
1954 c = *ptr; /* The final character */
1955 class_charcount -= 2; /* Undo the default count from above */
1956 }
1957 }
1958
1959 /* Fall through if we have a single character (c >= 0). This may be
1960 > 256 in UTF-8 mode. */
1961
1962 } /* End of backslash handling */
1963
1964 /* A single character may be followed by '-' to form a range. However,
1965 Perl does not permit ']' to be the end of the range. A '-' character
1966 here is treated as a literal. */
1967
1968 if (ptr[1] == '-' && ptr[2] != ']')
1969 {
1970 int d;
1971 ptr += 2;
1972
1973 #ifdef SUPPORT_UTF8
1974 if (utf8)
1975 { /* Braces are required because the */
1976 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1977 }
1978 else
1979 #endif
1980 d = *ptr; /* Not UTF-8 mode */
1981
1982 /* The second part of a range can be a single-character escape, but
1983 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1984 in such circumstances. */
1985
1986 if (d == '\\')
1987 {
1988 const uschar *oldptr = ptr;
1989 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1990
1991 /* \b is backspace; \X is literal X; any other special means the '-'
1992 was literal */
1993
1994 if (d < 0)
1995 {
1996 if (d == -ESC_b) d = '\b';
1997 else if (d == -ESC_X) d = 'X'; else
1998 {
1999 ptr = oldptr - 2;
2000 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2001 }
2002 }
2003 }
2004
2005 /* The check that the two values are in the correct order happens in
2006 the pre-pass. Optimize one-character ranges */
2007
2008 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2009
2010 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2011 matching, we have to use an XCLASS with extra data items. Caseless
2012 matching for characters > 127 is available only if UCP support is
2013 available. */
2014
2015 #ifdef SUPPORT_UTF8
2016 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2017 {
2018 class_utf8 = TRUE;
2019
2020 /* With UCP support, we can find the other case equivalents of
2021 the relevant characters. There may be several ranges. Optimize how
2022 they fit with the basic range. */
2023
2024 #ifdef SUPPORT_UCP
2025 if ((options & PCRE_CASELESS) != 0)
2026 {
2027 int occ, ocd;
2028 int cc = c;
2029 int origd = d;
2030 while (get_othercase_range(&cc, origd, &occ, &ocd))
2031 {
2032 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2033
2034 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2035 { /* if there is overlap, */
2036 c = occ; /* noting that if occ < c */
2037 continue; /* we can't have ocd > d */
2038 } /* because a subrange is */
2039 if (ocd > d && occ <= d + 1) /* always shorter than */
2040 { /* the basic range. */
2041 d = ocd;
2042 continue;
2043 }
2044
2045 if (occ == ocd)
2046 {
2047 *class_utf8data++ = XCL_SINGLE;
2048 }
2049 else
2050 {
2051 *class_utf8data++ = XCL_RANGE;
2052 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2053 }
2054 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2055 }
2056 }
2057 #endif /* SUPPORT_UCP */
2058
2059 /* Now record the original range, possibly modified for UCP caseless
2060 overlapping ranges. */
2061
2062 *class_utf8data++ = XCL_RANGE;
2063 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2064 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2065
2066 /* With UCP support, we are done. Without UCP support, there is no
2067 caseless matching for UTF-8 characters > 127; we can use the bit map
2068 for the smaller ones. */
2069
2070 #ifdef SUPPORT_UCP
2071 continue; /* With next character in the class */
2072 #else
2073 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2074
2075 /* Adjust upper limit and fall through to set up the map */
2076
2077 d = 127;
2078
2079 #endif /* SUPPORT_UCP */
2080 }
2081 #endif /* SUPPORT_UTF8 */
2082
2083 /* We use the bit map for all cases when not in UTF-8 mode; else
2084 ranges that lie entirely within 0-127 when there is UCP support; else
2085 for partial ranges without UCP support. */
2086
2087 for (; c <= d; c++)
2088 {
2089 classbits[c/8] |= (1 << (c&7));
2090 if ((options & PCRE_CASELESS) != 0)
2091 {
2092 int uc = cd->fcc[c]; /* flip case */
2093 classbits[uc/8] |= (1 << (uc&7));
2094 }
2095 class_charcount++; /* in case a one-char range */
2096 class_lastchar = c;
2097 }
2098
2099 continue; /* Go get the next char in the class */
2100 }
2101
2102 /* Handle a lone single character - we can get here for a normal
2103 non-escape char, or after \ that introduces a single character or for an
2104 apparent range that isn't. */
2105
2106 LONE_SINGLE_CHARACTER:
2107
2108 /* Handle a character that cannot go in the bit map */
2109
2110 #ifdef SUPPORT_UTF8
2111 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2112 {
2113 class_utf8 = TRUE;
2114 *class_utf8data++ = XCL_SINGLE;
2115 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2116
2117 #ifdef SUPPORT_UCP
2118 if ((options & PCRE_CASELESS) != 0)
2119 {
2120 int chartype;
2121 int othercase;
2122 if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2123 othercase > 0)
2124 {
2125 *class_utf8data++ = XCL_SINGLE;
2126 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2127 }
2128 }
2129 #endif /* SUPPORT_UCP */
2130
2131 }
2132 else
2133 #endif /* SUPPORT_UTF8 */
2134
2135 /* Handle a single-byte character */
2136 {
2137 classbits[c/8] |= (1 << (c&7));
2138 if ((options & PCRE_CASELESS) != 0)
2139 {
2140 c = cd->fcc[c]; /* flip case */
2141 classbits[c/8] |= (1 << (c&7));
2142 }
2143 class_charcount++;
2144 class_lastchar = c;
2145 }
2146 }
2147
2148 /* Loop until ']' reached; the check for end of string happens inside the
2149 loop. This "while" is the end of the "do" above. */
2150
2151 while ((c = *(++ptr)) != ']' || inescq);
2152
2153 /* If class_charcount is 1, we saw precisely one character whose value is
2154 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2155 can optimize the negative case only if there were no characters >= 128
2156 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2157 single-bytes only. This is an historical hangover. Maybe one day we can
2158 tidy these opcodes to handle multi-byte characters.
2159
2160 The optimization throws away the bit map. We turn the item into a
2161 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2162 that OP_NOT does not support multibyte characters. In the positive case, it
2163 can cause firstbyte to be set. Otherwise, there can be no first char if
2164 this item is first, whatever repeat count may follow. In the case of
2165 reqbyte, save the previous value for reinstating. */
2166
2167 #ifdef SUPPORT_UTF8
2168 if (class_charcount == 1 &&
2169 (!utf8 ||
2170 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2171
2172 #else
2173 if (class_charcount == 1)
2174 #endif
2175 {
2176 zeroreqbyte = reqbyte;
2177
2178 /* The OP_NOT opcode works on one-byte characters only. */
2179
2180 if (negate_class)
2181 {
2182 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2183 zerofirstbyte = firstbyte;
2184 *code++ = OP_NOT;
2185 *code++ = class_lastchar;
2186 break;
2187 }
2188
2189 /* For a single, positive character, get the value into mcbuffer, and
2190 then we can handle this with the normal one-character code. */
2191
2192 #ifdef SUPPORT_UTF8
2193 if (utf8 && class_lastchar > 127)
2194 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2195 else
2196 #endif
2197 {
2198 mcbuffer[0] = class_lastchar;
2199 mclength = 1;
2200 }
2201 goto ONE_CHAR;
2202 } /* End of 1-char optimization */
2203
2204 /* The general case - not the one-char optimization. If this is the first
2205 thing in the branch, there can be no first char setting, whatever the
2206 repeat count. Any reqbyte setting must remain unchanged after any kind of
2207 repeat. */
2208
2209 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2210 zerofirstbyte = firstbyte;
2211 zeroreqbyte = reqbyte;
2212
2213 /* If there are characters with values > 255, we have to compile an
2214 extended class, with its own opcode. If there are no characters < 256,
2215 we can omit the bitmap. */
2216
2217 #ifdef SUPPORT_UTF8
2218 if (class_utf8)
2219 {
2220 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2221 *code++ = OP_XCLASS;
2222 code += LINK_SIZE;
2223 *code = negate_class? XCL_NOT : 0;
2224
2225 /* If the map is required, install it, and move on to the end of
2226 the extra data */
2227
2228 if (class_charcount > 0)
2229 {
2230 *code++ |= XCL_MAP;
2231 memcpy(code, classbits, 32);
2232 code = class_utf8data;
2233 }
2234
2235 /* If the map is not required, slide down the extra data. */
2236
2237 else
2238 {
2239 int len = class_utf8data - (code + 33);
2240 memmove(code + 1, code + 33, len);
2241 code += len + 1;
2242 }
2243
2244 /* Now fill in the complete length of the item */
2245
2246 PUT(previous, 1, code - previous);
2247 break; /* End of class handling */
2248 }
2249 #endif
2250
2251 /* If there are no characters > 255, negate the 32-byte map if necessary,
2252 and copy it into the code vector. If this is the first thing in the branch,
2253 there can be no first char setting, whatever the repeat count. Any reqbyte
2254 setting must remain unchanged after any kind of repeat. */
2255
2256 if (negate_class)
2257 {
2258 *code++ = OP_NCLASS;
2259 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2260 }
2261 else
2262 {
2263 *code++ = OP_CLASS;
2264 memcpy(code, classbits, 32);
2265 }
2266 code += 32;
2267 break;
2268
2269 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2270 has been tested above. */
2271
2272 case '{':
2273 if (!is_quantifier) goto NORMAL_CHAR;
2274 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2275 if (*errorcodeptr != 0) goto FAILED;
2276 goto REPEAT;
2277
2278 case '*':
2279 repeat_min = 0;
2280 repeat_max = -1;
2281 goto REPEAT;
2282
2283 case '+':
2284 repeat_min = 1;
2285 repeat_max = -1;
2286 goto REPEAT;
2287
2288 case '?':
2289 repeat_min = 0;
2290 repeat_max = 1;
2291
2292 REPEAT:
2293 if (previous == NULL)
2294 {
2295 *errorcodeptr = ERR9;
2296 goto FAILED;
2297 }
2298
2299 if (repeat_min == 0)
2300 {
2301 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2302 reqbyte = zeroreqbyte; /* Ditto */
2303 }
2304
2305 /* Remember whether this is a variable length repeat */
2306
2307 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2308
2309 op_type = 0; /* Default single-char op codes */
2310 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2311
2312 /* Save start of previous item, in case we have to move it up to make space
2313 for an inserted OP_ONCE for the additional '+' extension. */
2314
2315 tempcode = previous;
2316
2317 /* If the next character is '+', we have a possessive quantifier. This
2318 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2319 If the next character is '?' this is a minimizing repeat, by default,
2320 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2321 repeat type to the non-default. */
2322
2323 if (ptr[1] == '+')
2324 {
2325 repeat_type = 0; /* Force greedy */
2326 possessive_quantifier = TRUE;
2327 ptr++;
2328 }
2329 else if (ptr[1] == '?')
2330 {
2331 repeat_type = greedy_non_default;
2332 ptr++;
2333 }
2334 else repeat_type = greedy_default;
2335
2336 /* If previous was a recursion, we need to wrap it inside brackets so that
2337 it can be replicated if necessary. */
2338
2339 if (*previous == OP_RECURSE)
2340 {
2341 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2342 code += 1 + LINK_SIZE;
2343 *previous = OP_BRA;
2344 PUT(previous, 1, code - previous);
2345 *code = OP_KET;
2346 PUT(code, 1, code - previous);
2347 code += 1 + LINK_SIZE;
2348 }
2349
2350 /* If previous was a character match, abolish the item and generate a
2351 repeat item instead. If a char item has a minimum of more than one, ensure
2352 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2353 the first thing in a branch because the x will have gone into firstbyte
2354 instead. */
2355
2356 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2357 {
2358 /* Deal with UTF-8 characters that take up more than one byte. It's
2359 easier to write this out separately than try to macrify it. Use c to
2360 hold the length of the character in bytes, plus 0x80 to flag that it's a
2361 length rather than a small character. */
2362
2363 #ifdef SUPPORT_UTF8
2364 if (utf8 && (code[-1] & 0x80) != 0)
2365 {
2366 uschar *lastchar = code - 1;
2367 while((*lastchar & 0xc0) == 0x80) lastchar--;
2368 c = code - lastchar; /* Length of UTF-8 character */
2369 memcpy(utf8_char, lastchar, c); /* Save the char */
2370 c |= 0x80; /* Flag c as a length */
2371 }
2372 else
2373 #endif
2374
2375 /* Handle the case of a single byte - either with no UTF8 support, or
2376 with UTF-8 disabled, or for a UTF-8 character < 128. */
2377
2378 {
2379 c = code[-1];
2380 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2381 }
2382
2383 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2384 }
2385
2386 /* If previous was a single negated character ([^a] or similar), we use
2387 one of the special opcodes, replacing it. The code is shared with single-
2388 character repeats by setting op_type to add a suitable offset into
2389 repeat_type. OP_NOT is currently used only for single-byte chars. */
2390
2391 else if (*previous == OP_NOT)
2392 {
2393 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2394 c = previous[1];
2395 goto OUTPUT_SINGLE_REPEAT;
2396 }
2397
2398 /* If previous was a character type match (\d or similar), abolish it and
2399 create a suitable repeat item. The code is shared with single-character
2400 repeats by setting op_type to add a suitable offset into repeat_type. Note
2401 that the Unicode property types will be present only when SUPPORT_UCP is
2402 defined, but we don't wrap the little bits of code here because it just
2403 makes it horribly messy. */
2404
2405 else if (*previous < OP_EODN)
2406 {
2407 uschar *oldcode;
2408 int prop_type;
2409 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2410 c = *previous;
2411
2412 OUTPUT_SINGLE_REPEAT:
2413 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2414 previous[1] : -1;
2415
2416 oldcode = code;
2417 code = previous; /* Usually overwrite previous item */
2418
2419 /* If the maximum is zero then the minimum must also be zero; Perl allows
2420 this case, so we do too - by simply omitting the item altogether. */
2421
2422 if (repeat_max == 0) goto END_REPEAT;
2423
2424 /* All real repeats make it impossible to handle partial matching (maybe
2425 one day we will be able to remove this restriction). */
2426
2427 if (repeat_max != 1) cd->nopartial = TRUE;
2428
2429 /* Combine the op_type with the repeat_type */
2430
2431 repeat_type += op_type;
2432
2433 /* A minimum of zero is handled either as the special case * or ?, or as
2434 an UPTO, with the maximum given. */
2435
2436 if (repeat_min == 0)
2437 {
2438 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2439 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2440 else
2441 {
2442 *code++ = OP_UPTO + repeat_type;
2443 PUT2INC(code, 0, repeat_max);
2444 }
2445 }
2446
2447 /* A repeat minimum of 1 is optimized into some special cases. If the
2448 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2449 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2450 one less than the maximum. */
2451
2452 else if (repeat_min == 1)
2453 {
2454 if (repeat_max == -1)
2455 *code++ = OP_PLUS + repeat_type;
2456 else
2457 {
2458 code = oldcode; /* leave previous item in place */
2459 if (repeat_max == 1) goto END_REPEAT;
2460 *code++ = OP_UPTO + repeat_type;
2461 PUT2INC(code, 0, repeat_max - 1);
2462 }
2463 }
2464
2465 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2466 handled as an EXACT followed by an UPTO. */
2467
2468 else
2469 {
2470 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2471 PUT2INC(code, 0, repeat_min);
2472
2473 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2474 we have to insert the character for the previous code. For a repeated
2475 Unicode property match, there is an extra byte that defines the
2476 required property. In UTF-8 mode, long characters have their length in
2477 c, with the 0x80 bit as a flag. */
2478
2479 if (repeat_max < 0)
2480 {
2481 #ifdef SUPPORT_UTF8
2482 if (utf8 && c >= 128)
2483 {
2484 memcpy(code, utf8_char, c & 7);
2485 code += c & 7;
2486 }
2487 else
2488 #endif
2489 {
2490 *code++ = c;
2491 if (prop_type >= 0) *code++ = prop_type;
2492 }
2493 *code++ = OP_STAR + repeat_type;
2494 }
2495
2496 /* Else insert an UPTO if the max is greater than the min, again
2497 preceded by the character, for the previously inserted code. */
2498
2499 else if (repeat_max != repeat_min)
2500 {
2501 #ifdef SUPPORT_UTF8
2502 if (utf8 && c >= 128)
2503 {
2504 memcpy(code, utf8_char, c & 7);
2505 code += c & 7;
2506 }
2507 else
2508 #endif
2509 *code++ = c;
2510 if (prop_type >= 0) *code++ = prop_type;
2511 repeat_max -= repeat_min;
2512 *code++ = OP_UPTO + repeat_type;
2513 PUT2INC(code, 0, repeat_max);
2514 }
2515 }
2516
2517 /* The character or character type itself comes last in all cases. */
2518
2519 #ifdef SUPPORT_UTF8
2520 if (utf8 && c >= 128)
2521 {
2522 memcpy(code, utf8_char, c & 7);
2523 code += c & 7;
2524 }
2525 else
2526 #endif
2527 *code++ = c;
2528
2529 /* For a repeated Unicode property match, there is an extra byte that
2530 defines the required property. */
2531
2532 #ifdef SUPPORT_UCP
2533 if (prop_type >= 0) *code++ = prop_type;
2534 #endif
2535 }
2536
2537 /* If previous was a character class or a back reference, we put the repeat
2538 stuff after it, but just skip the item if the repeat was {0,0}. */
2539
2540 else if (*previous == OP_CLASS ||
2541 *previous == OP_NCLASS ||
2542 #ifdef SUPPORT_UTF8
2543 *previous == OP_XCLASS ||
2544 #endif
2545 *previous == OP_REF)
2546 {
2547 if (repeat_max == 0)
2548 {
2549 code = previous;
2550 goto END_REPEAT;
2551 }
2552
2553 /* All real repeats make it impossible to handle partial matching (maybe
2554 one day we will be able to remove this restriction). */
2555
2556 if (repeat_max != 1) cd->nopartial = TRUE;
2557
2558 if (repeat_min == 0 && repeat_max == -1)
2559 *code++ = OP_CRSTAR + repeat_type;
2560 else if (repeat_min == 1 && repeat_max == -1)
2561 *code++ = OP_CRPLUS + repeat_type;
2562 else if (repeat_min == 0 && repeat_max == 1)
2563 *code++ = OP_CRQUERY + repeat_type;
2564 else
2565 {
2566 *code++ = OP_CRRANGE + repeat_type;
2567 PUT2INC(code, 0, repeat_min);
2568 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2569 PUT2INC(code, 0, repeat_max);
2570 }
2571 }
2572
2573 /* If previous was a bracket group, we may have to replicate it in certain
2574 cases. */
2575
2576 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2577 *previous == OP_COND)
2578 {
2579 register int i;
2580 int ketoffset = 0;
2581 int len = code - previous;
2582 uschar *bralink = NULL;
2583
2584 /* If the maximum repeat count is unlimited, find the end of the bracket
2585 by scanning through from the start, and compute the offset back to it
2586 from the current code pointer. There may be an OP_OPT setting following
2587 the final KET, so we can't find the end just by going back from the code
2588 pointer. */
2589
2590 if (repeat_max == -1)
2591 {
2592 register uschar *ket = previous;
2593 do ket += GET(ket, 1); while (*ket != OP_KET);
2594 ketoffset = code - ket;
2595 }
2596
2597 /* The case of a zero minimum is special because of the need to stick
2598 OP_BRAZERO in front of it, and because the group appears once in the
2599 data, whereas in other cases it appears the minimum number of times. For
2600 this reason, it is simplest to treat this case separately, as otherwise
2601 the code gets far too messy. There are several special subcases when the
2602 minimum is zero. */
2603
2604 if (repeat_min == 0)
2605 {
2606 /* If the maximum is also zero, we just omit the group from the output
2607 altogether. */
2608
2609 if (repeat_max == 0)
2610 {
2611 code = previous;
2612 goto END_REPEAT;
2613 }
2614
2615 /* If the maximum is 1 or unlimited, we just have to stick in the
2616 BRAZERO and do no more at this point. However, we do need to adjust
2617 any OP_RECURSE calls inside the group that refer to the group itself or
2618 any internal group, because the offset is from the start of the whole
2619 regex. Temporarily terminate the pattern while doing this. */
2620
2621 if (repeat_max <= 1)
2622 {
2623 *code = OP_END;
2624 adjust_recurse(previous, 1, utf8, cd);
2625 memmove(previous+1, previous, len);
2626 code++;
2627 *previous++ = OP_BRAZERO + repeat_type;
2628 }
2629
2630 /* If the maximum is greater than 1 and limited, we have to replicate
2631 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2632 The first one has to be handled carefully because it's the original
2633 copy, which has to be moved up. The remainder can be handled by code
2634 that is common with the non-zero minimum case below. We have to
2635 adjust the value of repeat_max, since one less copy is required. Once
2636 again, we may have to adjust any OP_RECURSE calls inside the group. */
2637
2638 else
2639 {
2640 int offset;
2641 *code = OP_END;
2642 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2643 memmove(previous + 2 + LINK_SIZE, previous, len);
2644 code += 2 + LINK_SIZE;
2645 *previous++ = OP_BRAZERO + repeat_type;
2646 *previous++ = OP_BRA;
2647
2648 /* We chain together the bracket offset fields that have to be
2649 filled in later when the ends of the brackets are reached. */
2650
2651 offset = (bralink == NULL)? 0 : previous - bralink;
2652 bralink = previous;
2653 PUTINC(previous, 0, offset);
2654 }
2655
2656 repeat_max--;
2657 }
2658
2659 /* If the minimum is greater than zero, replicate the group as many
2660 times as necessary, and adjust the maximum to the number of subsequent
2661 copies that we need. If we set a first char from the group, and didn't
2662 set a required char, copy the latter from the former. */
2663
2664 else
2665 {
2666 if (repeat_min > 1)
2667 {
2668 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2669 for (i = 1; i < repeat_min; i++)
2670 {
2671 memcpy(code, previous, len);
2672 code += len;
2673 }
2674 }
2675 if (repeat_max > 0) repeat_max -= repeat_min;
2676 }
2677
2678 /* This code is common to both the zero and non-zero minimum cases. If
2679 the maximum is limited, it replicates the group in a nested fashion,
2680 remembering the bracket starts on a stack. In the case of a zero minimum,
2681 the first one was set up above. In all cases the repeat_max now specifies
2682 the number of additional copies needed. */
2683
2684 if (repeat_max >= 0)
2685 {
2686 for (i = repeat_max - 1; i >= 0; i--)
2687 {
2688 *code++ = OP_BRAZERO + repeat_type;
2689
2690 /* All but the final copy start a new nesting, maintaining the
2691 chain of brackets outstanding. */
2692
2693 if (i != 0)
2694 {
2695 int offset;
2696 *code++ = OP_BRA;
2697 offset = (bralink == NULL)? 0 : code - bralink;
2698 bralink = code;
2699 PUTINC(code, 0, offset);
2700 }
2701
2702 memcpy(code, previous, len);
2703 code += len;
2704 }
2705
2706 /* Now chain through the pending brackets, and fill in their length
2707 fields (which are holding the chain links pro tem). */
2708
2709 while (bralink != NULL)
2710 {
2711 int oldlinkoffset;
2712 int offset = code - bralink + 1;
2713 uschar *bra = code - offset;
2714 oldlinkoffset = GET(bra, 1);
2715 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2716 *code++ = OP_KET;
2717 PUTINC(code, 0, offset);
2718 PUT(bra, 1, offset);
2719 }
2720 }
2721
2722 /* If the maximum is unlimited, set a repeater in the final copy. We
2723 can't just offset backwards from the current code point, because we
2724 don't know if there's been an options resetting after the ket. The
2725 correct offset was computed above. */
2726
2727 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2728 }
2729
2730 /* Else there's some kind of shambles */
2731
2732 else
2733 {
2734 *errorcodeptr = ERR11;
2735 goto FAILED;
2736 }
2737
2738 /* If the character following a repeat is '+', we wrap the entire repeated
2739 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2740 Sun's Java package. The repeated item starts at tempcode, not at previous,
2741 which might be the first part of a string whose (former) last char we
2742 repeated. However, we don't support '+' after a greediness '?'. */
2743
2744 if (possessive_quantifier)
2745 {
2746 int len = code - tempcode;
2747 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2748 code += 1 + LINK_SIZE;
2749 len += 1 + LINK_SIZE;
2750 tempcode[0] = OP_ONCE;
2751 *code++ = OP_KET;
2752 PUTINC(code, 0, len);
2753 PUT(tempcode, 1, len);
2754 }
2755
2756 /* In all cases we no longer have a previous item. We also set the
2757 "follows varying string" flag for subsequently encountered reqbytes if
2758 it isn't already set and we have just passed a varying length item. */
2759
2760 END_REPEAT:
2761 previous = NULL;
2762 cd->req_varyopt |= reqvary;
2763 break;
2764
2765
2766 /* Start of nested bracket sub-expression, or comment or lookahead or
2767 lookbehind or option setting or condition. First deal with special things
2768 that can come after a bracket; all are introduced by ?, and the appearance
2769 of any of them means that this is not a referencing group. They were
2770 checked for validity in the first pass over the string, so we don't have to
2771 check for syntax errors here. */
2772
2773 case '(':
2774 newoptions = options;
2775 skipbytes = 0;
2776
2777 if (*(++ptr) == '?')
2778 {
2779 int set, unset;
2780 int *optset;
2781
2782 switch (*(++ptr))
2783 {
2784 case '#': /* Comment; skip to ket */
2785 ptr++;
2786 while (*ptr != ')') ptr++;
2787 continue;
2788
2789 case ':': /* Non-extracting bracket */
2790 bravalue = OP_BRA;
2791 ptr++;
2792 break;
2793
2794 case '(':
2795 bravalue = OP_COND; /* Conditional group */
2796
2797 /* Condition to test for recursion */
2798
2799 if (ptr[1] == 'R')
2800 {
2801 code[1+LINK_SIZE] = OP_CREF;
2802 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2803 skipbytes = 3;
2804 ptr += 3;
2805 }
2806
2807 /* Condition to test for a numbered subpattern match. We know that
2808 if a digit follows ( then there will just be digits until ) because
2809 the syntax was checked in the first pass. */
2810
2811 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2812 {
2813 int condref; /* Don't amalgamate; some compilers */
2814 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2815 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2816 if (condref == 0)
2817 {
2818 *errorcodeptr = ERR35;
2819 goto FAILED;
2820 }
2821 ptr++;
2822 code[1+LINK_SIZE] = OP_CREF;
2823 PUT2(code, 2+LINK_SIZE, condref);
2824 skipbytes = 3;
2825 }
2826 /* For conditions that are assertions, we just fall through, having
2827 set bravalue above. */
2828 break;
2829
2830 case '=': /* Positive lookahead */
2831 bravalue = OP_ASSERT;
2832 ptr++;
2833 break;
2834
2835 case '!': /* Negative lookahead */
2836 bravalue = OP_ASSERT_NOT;
2837 ptr++;
2838 break;
2839
2840 case '<': /* Lookbehinds */
2841 switch (*(++ptr))
2842 {
2843 case '=': /* Positive lookbehind */
2844 bravalue = OP_ASSERTBACK;
2845 ptr++;
2846 break;
2847
2848 case '!': /* Negative lookbehind */
2849 bravalue = OP_ASSERTBACK_NOT;
2850 ptr++;
2851 break;
2852 }
2853 break;
2854
2855 case '>': /* One-time brackets */
2856 bravalue = OP_ONCE;
2857 ptr++;
2858 break;
2859
2860 case 'C': /* Callout - may be followed by digits; */
2861 previous_callout = code; /* Save for later completion */
2862 after_manual_callout = 1; /* Skip one item before completing */
2863 *code++ = OP_CALLOUT; /* Already checked that the terminating */
2864 { /* closing parenthesis is present. */
2865 int n = 0;
2866 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2867 n = n * 10 + *ptr - '0';
2868 if (n > 255)
2869 {
2870 *errorcodeptr = ERR38;
2871 goto FAILED;
2872 }
2873 *code++ = n;
2874 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2875 PUT(code, LINK_SIZE, 0); /* Default length */
2876 code += 2 * LINK_SIZE;
2877 }
2878 previous = NULL;
2879 continue;
2880
2881 case 'P': /* Named subpattern handling */
2882 if (*(++ptr) == '<') /* Definition */
2883 {
2884 int i, namelen;
2885 uschar *slot = cd->name_table;
2886 const uschar *name; /* Don't amalgamate; some compilers */
2887 name = ++ptr; /* grumble at autoincrement in declaration */
2888
2889 while (*ptr++ != '>');
2890 namelen = ptr - name - 1;
2891
2892 for (i = 0; i < cd->names_found; i++)
2893 {
2894 int crc = memcmp(name, slot+2, namelen);
2895 if (crc == 0)
2896 {
2897 if (slot[2+namelen] == 0)
2898 {
2899 *errorcodeptr = ERR43;
2900 goto FAILED;
2901 }
2902 crc = -1; /* Current name is substring */
2903 }
2904 if (crc < 0)
2905 {
2906 memmove(slot + cd->name_entry_size, slot,
2907 (cd->names_found - i) * cd->name_entry_size);
2908 break;
2909 }
2910 slot += cd->name_entry_size;
2911 }
2912
2913 PUT2(slot, 0, *brackets + 1);
2914 memcpy(slot + 2, name, namelen);
2915 slot[2+namelen] = 0;
2916 cd->names_found++;
2917 goto NUMBERED_GROUP;
2918 }
2919
2920 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2921 {
2922 int i, namelen;
2923 int type = *ptr++;
2924 const uschar *name = ptr;
2925 uschar *slot = cd->name_table;
2926
2927 while (*ptr != ')') ptr++;
2928 namelen = ptr - name;
2929
2930 for (i = 0; i < cd->names_found; i++)
2931 {
2932 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2933 slot += cd->name_entry_size;
2934 }
2935 if (i >= cd->names_found)
2936 {
2937 *errorcodeptr = ERR15;
2938 goto FAILED;
2939 }
2940
2941 recno = GET2(slot, 0);
2942
2943 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2944
2945 /* Back reference */
2946
2947 previous = code;
2948 *code++ = OP_REF;
2949 PUT2INC(code, 0, recno);
2950 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2951 if (recno > cd->top_backref) cd->top_backref = recno;
2952 continue;
2953 }
2954
2955 /* Should never happen */
2956 break;
2957
2958 case 'R': /* Pattern recursion */
2959 ptr++; /* Same as (?0) */
2960 /* Fall through */
2961
2962 /* Recursion or "subroutine" call */
2963
2964 case '0': case '1': case '2': case '3': case '4':
2965 case '5': case '6': case '7': case '8': case '9':
2966 {
2967 const uschar *called;
2968 recno = 0;
2969 while((digitab[*ptr] & ctype_digit) != 0)
2970 recno = recno * 10 + *ptr++ - '0';
2971
2972 /* Come here from code above that handles a named recursion */
2973
2974 HANDLE_RECURSION:
2975
2976 previous = code;
2977
2978 /* Find the bracket that is being referenced. Temporarily end the
2979 regex in case it doesn't exist. */
2980
2981 *code = OP_END;
2982 called = (recno == 0)?
2983 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2984
2985 if (called == NULL)
2986 {
2987 *errorcodeptr = ERR15;
2988 goto FAILED;
2989 }
2990
2991 /* If the subpattern is still open, this is a recursive call. We
2992 check to see if this is a left recursion that could loop for ever,
2993 and diagnose that case. */
2994
2995 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2996 {
2997 *errorcodeptr = ERR40;
2998 goto FAILED;
2999 }
3000
3001 /* Insert the recursion/subroutine item */
3002
3003 *code = OP_RECURSE;
3004 PUT(code, 1, called - cd->start_code);
3005 code += 1 + LINK_SIZE;
3006 }
3007 continue;
3008
3009 /* Character after (? not specially recognized */
3010
3011 default: /* Option setting */
3012 set = unset = 0;
3013 optset = &set;
3014
3015 while (*ptr != ')' && *ptr != ':')
3016 {
3017 switch (*ptr++)
3018 {
3019 case '-': optset = &unset; break;
3020
3021 case 'i': *optset |= PCRE_CASELESS; break;
3022 case 'm': *optset |= PCRE_MULTILINE; break;
3023 case 's': *optset |= PCRE_DOTALL; break;
3024 case 'x': *optset |= PCRE_EXTENDED; break;
3025 case 'U': *optset |= PCRE_UNGREEDY; break;
3026 case 'X': *optset |= PCRE_EXTRA; break;
3027 }
3028 }
3029
3030 /* Set up the changed option bits, but don't change anything yet. */
3031
3032 newoptions = (options | set) & (~unset);
3033
3034 /* If the options ended with ')' this is not the start of a nested
3035 group with option changes, so the options change at this level. Compile
3036 code to change the ims options if this setting actually changes any of
3037 them. We also pass the new setting back so that it can be put at the
3038 start of any following branches, and when this group ends (if we are in
3039 a group), a resetting item can be compiled.
3040
3041 Note that if this item is right at the start of the pattern, the
3042 options will have been abstracted and made global, so there will be no
3043 change to compile. */
3044
3045 if (*ptr == ')')
3046 {
3047 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3048 {
3049 *code++ = OP_OPT;
3050 *code++ = newoptions & PCRE_IMS;
3051 }
3052
3053 /* Change options at this level, and pass them back for use
3054 in subsequent branches. Reset the greedy defaults and the case
3055 value for firstbyte and reqbyte. */
3056
3057 *optionsptr = options = newoptions;
3058 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3059 greedy_non_default = greedy_default ^ 1;
3060 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3061
3062 previous = NULL; /* This item can't be repeated */
3063 continue; /* It is complete */
3064 }
3065
3066 /* If the options ended with ':' we are heading into a nested group
3067 with possible change of options. Such groups are non-capturing and are
3068 not assertions of any kind. All we need to do is skip over the ':';
3069 the newoptions value is handled below. */
3070
3071 bravalue = OP_BRA;
3072 ptr++;
3073 }
3074 }
3075
3076 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3077 non-capturing and behave like (?:...) brackets */
3078
3079 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3080 {
3081 bravalue = OP_BRA;
3082 }
3083
3084 /* Else we have a referencing group; adjust the opcode. If the bracket
3085 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3086 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3087
3088 else
3089 {
3090 NUMBERED_GROUP:
3091 if (++(*brackets) > EXTRACT_BASIC_MAX)
3092 {
3093 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3094 code[1+LINK_SIZE] = OP_BRANUMBER;
3095 PUT2(code, 2+LINK_SIZE, *brackets);
3096 skipbytes = 3;
3097 }
3098 else bravalue = OP_BRA + *brackets;
3099 }
3100
3101 /* Process nested bracketed re. Assertions may not be repeated, but other
3102 kinds can be. We copy code into a non-register variable in order to be able
3103 to pass its address because some compilers complain otherwise. Pass in a
3104 new setting for the ims options if they have changed. */
3105
3106 previous = (bravalue >= OP_ONCE)? code : NULL;
3107 *code = bravalue;
3108 tempcode = code;
3109 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3110
3111 if (!compile_regex(
3112 newoptions, /* The complete new option state */
3113 options & PCRE_IMS, /* The previous ims option state */
3114 brackets, /* Extracting bracket count */
3115 &tempcode, /* Where to put code (updated) */
3116 &ptr, /* Input pointer (updated) */
3117 errorcodeptr, /* Where to put an error message */
3118 (bravalue == OP_ASSERTBACK ||
3119 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3120 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3121 &subfirstbyte, /* For possible first char */
3122 &subreqbyte, /* For possible last char */
3123 bcptr, /* Current branch chain */
3124 cd)) /* Tables block */
3125 goto FAILED;
3126
3127 /* At the end of compiling, code is still pointing to the start of the
3128 group, while tempcode has been updated to point past the end of the group
3129 and any option resetting that may follow it. The pattern pointer (ptr)
3130 is on the bracket. */
3131
3132 /* If this is a conditional bracket, check that there are no more than
3133 two branches in the group. */
3134
3135 else if (bravalue == OP_COND)
3136 {
3137 uschar *tc = code;
3138 condcount = 0;
3139
3140 do {
3141 condcount++;
3142 tc += GET(tc,1);
3143 }
3144 while (*tc != OP_KET);
3145
3146 if (condcount > 2)
3147 {
3148 *errorcodeptr = ERR27;
3149 goto FAILED;
3150 }
3151
3152 /* If there is just one branch, we must not make use of its firstbyte or
3153 reqbyte, because this is equivalent to an empty second branch. */
3154
3155 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3156 }
3157
3158 /* Handle updating of the required and first characters. Update for normal
3159 brackets of all kinds, and conditions with two branches (see code above).
3160 If the bracket is followed by a quantifier with zero repeat, we have to
3161 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3162 main loop so that they can be accessed for the back off. */
3163
3164 zeroreqbyte = reqbyte;
3165 zerofirstbyte = firstbyte;
3166 groupsetfirstbyte = FALSE;
3167
3168 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3169 {
3170 /* If we have not yet set a firstbyte in this branch, take it from the
3171 subpattern, remembering that it was set here so that a repeat of more
3172 than one can replicate it as reqbyte if necessary. If the subpattern has
3173 no firstbyte, set "none" for the whole branch. In both cases, a zero
3174 repeat forces firstbyte to "none". */
3175
3176 if (firstbyte == REQ_UNSET)
3177 {
3178 if (subfirstbyte >= 0)
3179 {
3180 firstbyte = subfirstbyte;
3181 groupsetfirstbyte = TRUE;
3182 }
3183 else firstbyte = REQ_NONE;
3184 zerofirstbyte = REQ_NONE;
3185 }
3186
3187 /* If firstbyte was previously set, convert the subpattern's firstbyte
3188 into reqbyte if there wasn't one, using the vary flag that was in
3189 existence beforehand. */
3190
3191 else if (subfirstbyte >= 0 && subreqbyte < 0)
3192 subreqbyte = subfirstbyte | tempreqvary;
3193
3194 /* If the subpattern set a required byte (or set a first byte that isn't
3195 really the first byte - see above), set it. */
3196
3197 if (subreqbyte >= 0) reqbyte = subreqbyte;
3198 }
3199
3200 /* For a forward assertion, we take the reqbyte, if set. This can be
3201 helpful if the pattern that follows the assertion doesn't set a different
3202 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3203 for an assertion, however because it leads to incorrect effect for patterns
3204 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3205 of a firstbyte. This is overcome by a scan at the end if there's no
3206 firstbyte, looking for an asserted first char. */
3207
3208 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3209
3210 /* Now update the main code pointer to the end of the group. */
3211
3212 code = tempcode;
3213
3214 /* Error if hit end of pattern */
3215
3216 if (*ptr != ')')
3217 {
3218 *errorcodeptr = ERR14;
3219 goto FAILED;
3220 }
3221 break;
3222
3223 /* Check \ for being a real metacharacter; if not, fall through and handle
3224 it as a data character at the start of a string. Escape items are checked
3225 for validity in the pre-compiling pass. */
3226
3227 case '\\':
3228 tempptr = ptr;
3229 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3230
3231 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3232 are arranged to be the negation of the corresponding OP_values. For the
3233 back references, the values are ESC_REF plus the reference number. Only
3234 back references and those types that consume a character may be repeated.
3235 We can test for values between ESC_b and ESC_Z for the latter; this may
3236 have to change if any new ones are ever created. */
3237
3238 if (c < 0)
3239 {
3240 if (-c == ESC_Q) /* Handle start of quoted string */
3241 {
3242 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3243 else inescq = TRUE;
3244 continue;
3245 }
3246
3247 /* For metasequences that actually match a character, we disable the
3248 setting of a first character if it hasn't already been set. */
3249
3250 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3251 firstbyte = REQ_NONE;
3252
3253 /* Set values to reset to if this is followed by a zero repeat. */
3254
3255 zerofirstbyte = firstbyte;
3256 zeroreqbyte = reqbyte;
3257
3258 /* Back references are handled specially */
3259
3260 if (-c >= ESC_REF)
3261 {
3262 int number = -c - ESC_REF;
3263 previous = code;
3264 *code++ = OP_REF;
3265 PUT2INC(code, 0, number);
3266 }
3267
3268 /* So are Unicode property matches, if supported. We know that get_ucp
3269 won't fail because it was tested in the pre-pass. */
3270
3271 #ifdef SUPPORT_UCP
3272 else if (-c == ESC_P || -c == ESC_p)
3273 {
3274 BOOL negated;
3275 int value = get_ucp(&ptr, &negated, errorcodeptr);
3276 previous = code;
3277 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3278 *code++ = value;
3279 }
3280 #endif
3281
3282 /* For the rest, we can obtain the OP value by negating the escape
3283 value */
3284
3285 else
3286 {
3287 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3288 *code++ = -c;
3289 }
3290 continue;
3291 }
3292
3293 /* We have a data character whose value is in c. In UTF-8 mode it may have
3294 a value > 127. We set its representation in the length/buffer, and then
3295 handle it as a data character. */
3296
3297 #ifdef SUPPORT_UTF8
3298 if (utf8 && c > 127)
3299 mclength = _pcre_ord2utf8(c, mcbuffer);
3300 else
3301 #endif
3302
3303 {
3304 mcbuffer[0] = c;
3305 mclength = 1;
3306 }
3307
3308 goto ONE_CHAR;
3309
3310 /* Handle a literal character. It is guaranteed not to be whitespace or #
3311 when the extended flag is set. If we are in UTF-8 mode, it may be a
3312 multi-byte literal character. */
3313
3314 default:
3315 NORMAL_CHAR:
3316 mclength = 1;
3317 mcbuffer[0] = c;
3318
3319 #ifdef SUPPORT_UTF8
3320 if (utf8 && (c & 0xc0) == 0xc0)
3321 {
3322 while ((ptr[1] & 0xc0) == 0x80)
3323 mcbuffer[mclength++] = *(++ptr);
3324 }
3325 #endif
3326
3327 /* At this point we have the character's bytes in mcbuffer, and the length
3328 in mclength. When not in UTF-8 mode, the length is always 1. */
3329
3330 ONE_CHAR:
3331 previous = code;
3332 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3333 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3334
3335 /* Set the first and required bytes appropriately. If no previous first
3336 byte, set it from this character, but revert to none on a zero repeat.
3337 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3338 repeat. */
3339
3340 if (firstbyte == REQ_UNSET)
3341 {
3342 zerofirstbyte = REQ_NONE;
3343 zeroreqbyte = reqbyte;
3344
3345 /* If the character is more than one byte long, we can set firstbyte
3346 only if it is not to be matched caselessly. */
3347
3348 if (mclength == 1 || req_caseopt == 0)
3349 {
3350 firstbyte = mcbuffer[0] | req_caseopt;
3351 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3352 }
3353 else firstbyte = reqbyte = REQ_NONE;
3354 }
3355
3356 /* firstbyte was previously set; we can set reqbyte only if the length is
3357 1 or the matching is caseful. */
3358
3359 else
3360 {
3361 zerofirstbyte = firstbyte;
3362 zeroreqbyte = reqbyte;
3363 if (mclength == 1 || req_caseopt == 0)
3364 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3365 }
3366
3367 break; /* End of literal character handling */
3368 }
3369 } /* end of big loop */
3370
3371 /* Control never reaches here by falling through, only by a goto for all the
3372 error states. Pass back the position in the pattern so that it can be displayed
3373 to the user for diagnosing the error. */
3374
3375 FAILED:
3376 *ptrptr = ptr;
3377 return FALSE;
3378 }
3379
3380
3381
3382
3383 /*************************************************
3384 * Compile sequence of alternatives *
3385 *************************************************/
3386
3387 /* On entry, ptr is pointing past the bracket character, but on return
3388 it points to the closing bracket, or vertical bar, or end of string.
3389 The code variable is pointing at the byte into which the BRA operator has been
3390 stored. If the ims options are changed at the start (for a (?ims: group) or
3391 during any branch, we need to insert an OP_OPT item at the start of every
3392 following branch to ensure they get set correctly at run time, and also pass
3393 the new options into every subsequent branch compile.
3394
3395 Argument:
3396 options option bits, including any changes for this subpattern
3397 oldims previous settings of ims option bits
3398 brackets -> int containing the number of extracting brackets used
3399 codeptr -> the address of the current code pointer
3400 ptrptr -> the address of the current pattern pointer
3401 errorcodeptr -> pointer to error code variable
3402 lookbehind TRUE if this is a lookbehind assertion
3403 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3404 firstbyteptr place to put the first required character, or a negative number
3405 reqbyteptr place to put the last required character, or a negative number
3406 bcptr pointer to the chain of currently open branches
3407 cd points to the data block with tables pointers etc.
3408
3409 Returns: TRUE on success
3410 */
3411
3412 static BOOL
3413 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3414 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3415 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3416 {
3417 const uschar *ptr = *ptrptr;
3418 uschar *code = *codeptr;
3419 uschar *last_branch = code;
3420 uschar *start_bracket = code;
3421 uschar *reverse_count = NULL;
3422 int firstbyte, reqbyte;
3423 int branchfirstbyte, branchreqbyte;
3424 branch_chain bc;
3425
3426 bc.outer = bcptr;
3427 bc.current = code;
3428
3429 firstbyte = reqbyte = REQ_UNSET;
3430
3431 /* Offset is set zero to mark that this bracket is still open */
3432
3433 PUT(code, 1, 0);
3434 code += 1 + LINK_SIZE + skipbytes;
3435
3436 /* Loop for each alternative branch */
3437
3438 for (;;)
3439 {
3440 /* Handle a change of ims options at the start of the branch */
3441
3442 if ((options & PCRE_IMS) != oldims)
3443 {
3444 *code++ = OP_OPT;
3445 *code++ = options & PCRE_IMS;
3446 }
3447
3448 /* Set up dummy OP_REVERSE if lookbehind assertion */
3449
3450 if (lookbehind)
3451 {
3452 *code++ = OP_REVERSE;
3453 reverse_count = code;
3454 PUTINC(code, 0, 0);
3455 }
3456
3457 /* Now compile the branch */
3458
3459 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3460 &branchfirstbyte, &branchreqbyte, &bc, cd))
3461 {
3462 *ptrptr = ptr;
3463 return FALSE;
3464 }
3465
3466 /* If this is the first branch, the firstbyte and reqbyte values for the
3467 branch become the values for the regex. */
3468
3469 if (*last_branch != OP_ALT)
3470 {
3471 firstbyte = branchfirstbyte;
3472 reqbyte = branchreqbyte;
3473 }
3474
3475 /* If this is not the first branch, the first char and reqbyte have to
3476 match the values from all the previous branches, except that if the previous
3477 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3478 REQ_VARY for the regex. */
3479
3480 else
3481 {
3482 /* If we previously had a firstbyte, but it doesn't match the new branch,
3483 we have to abandon the firstbyte for the regex, but if there was previously
3484 no reqbyte, it takes on the value of the old firstbyte. */
3485
3486 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3487 {
3488 if (reqbyte < 0) reqbyte = firstbyte;
3489 firstbyte = REQ_NONE;
3490 }
3491
3492 /* If we (now or from before) have no firstbyte, a firstbyte from the
3493 branch becomes a reqbyte if there isn't a branch reqbyte. */
3494
3495 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3496 branchreqbyte = branchfirstbyte;
3497
3498 /* Now ensure that the reqbytes match */
3499
3500 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3501 reqbyte = REQ_NONE;
3502 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3503 }
3504
3505 /* If lookbehind, check that this branch matches a fixed-length string,
3506 and put the length into the OP_REVERSE item. Temporarily mark the end of
3507 the branch with OP_END. */
3508
3509 if (lookbehind)
3510 {
3511 int length;
3512 *code = OP_END;
3513 length = find_fixedlength(last_branch, options);
3514 DPRINTF(("fixed length = %d\n", length));
3515 if (length < 0)
3516 {
3517 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3518 *ptrptr = ptr;
3519 return FALSE;
3520 }
3521 PUT(reverse_count, 0, length);
3522 }
3523
3524 /* Reached end of expression, either ')' or end of pattern. Go back through
3525 the alternative branches and reverse the chain of offsets, with the field in
3526 the BRA item now becoming an offset to the first alternative. If there are
3527 no alternatives, it points to the end of the group. The length in the
3528 terminating ket is always the length of the whole bracketed item. If any of
3529 the ims options were changed inside the group, compile a resetting op-code
3530 following, except at the very end of the pattern. Return leaving the pointer
3531 at the terminating char. */
3532
3533 if (*ptr != '|')
3534 {
3535 int length = code - last_branch;
3536 do
3537 {
3538 int prev_length = GET(last_branch, 1);
3539 PUT(last_branch, 1, length);
3540 length = prev_length;
3541 last_branch -= length;
3542 }
3543 while (length > 0);
3544
3545 /* Fill in the ket */
3546
3547 *code = OP_KET;
3548 PUT(code, 1, code - start_bracket);
3549 code += 1 + LINK_SIZE;
3550
3551 /* Resetting option if needed */
3552
3553 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3554 {
3555 *code++ = OP_OPT;
3556 *code++ = oldims;
3557 }
3558
3559 /* Set values to pass back */
3560
3561 *codeptr = code;
3562 *ptrptr = ptr;
3563 *firstbyteptr = firstbyte;
3564 *reqbyteptr = reqbyte;
3565 return TRUE;
3566 }
3567
3568 /* Another branch follows; insert an "or" node. Its length field points back
3569 to the previous branch while the bracket remains open. At the end the chain
3570 is reversed. It's done like this so that the start of the bracket has a
3571 zero offset until it is closed, making it possible to detect recursion. */
3572
3573 *code = OP_ALT;
3574 PUT(code, 1, code - last_branch);
3575 bc.current = last_branch = code;
3576 code += 1 + LINK_SIZE;
3577 ptr++;
3578 }
3579 /* Control never reaches here */
3580 }
3581
3582
3583
3584
3585 /*************************************************
3586 * Check for anchored expression *
3587 *************************************************/
3588
3589 /* Try to find out if this is an anchored regular expression. Consider each
3590 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3591 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3592 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3593 counts, since OP_CIRC can match in the middle.
3594
3595 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3596 This is the code for \G, which means "match at start of match position, taking
3597 into account the match offset".
3598
3599 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3600 because that will try the rest of the pattern at all possible matching points,
3601 so there is no point trying again.... er ....
3602
3603 .... except when the .* appears inside capturing parentheses, and there is a
3604 subsequent back reference to those parentheses. We haven't enough information
3605 to catch that case precisely.
3606
3607 At first, the best we could do was to detect when .* was in capturing brackets
3608 and the highest back reference was greater than or equal to that level.
3609 However, by keeping a bitmap of the first 31 back references, we can catch some
3610 of the more common cases more precisely.
3611
3612 Arguments:
3613 code points to start of expression (the bracket)
3614 options points to the options setting
3615 bracket_map a bitmap of which brackets we are inside while testing; this
3616 handles up to substring 31; after that we just have to take
3617 the less precise approach
3618 backref_map the back reference bitmap
3619
3620 Returns: TRUE or FALSE
3621 */
3622
3623 static BOOL
3624 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3625 unsigned int backref_map)
3626 {
3627 do {
3628 const uschar *scode =
3629 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3630 register int op = *scode;
3631
3632 /* Capturing brackets */
3633
3634 if (op > OP_BRA)
3635 {
3636 int new_map;
3637 op -= OP_BRA;
3638 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3639 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3640 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3641 }
3642
3643 /* Other brackets */
3644
3645 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3646 {
3647 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3648 }
3649
3650 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3651 are or may be referenced. */
3652
3653 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3654 (*options & PCRE_DOTALL) != 0)
3655 {
3656 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3657 }
3658
3659 /* Check for explicit anchoring */
3660
3661 else if (op != OP_SOD && op != OP_SOM &&
3662 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3663 return FALSE;
3664 code += GET(code, 1);
3665 }
3666 while (*code == OP_ALT); /* Loop for each alternative */
3667 return TRUE;
3668 }
3669
3670
3671
3672 /*************************************************
3673 * Check for starting with ^ or .* *
3674 *************************************************/
3675
3676 /* This is called to find out if every branch starts with ^ or .* so that
3677 "first char" processing can be done to speed things up in multiline
3678 matching and for non-DOTALL patterns that start with .* (which must start at
3679 the beginning or after \n). As in the case of is_anchored() (see above), we
3680 have to take account of back references to capturing brackets that contain .*
3681 because in that case we can't make the assumption.
3682
3683 Arguments:
3684 code points to start of expression (the bracket)
3685 bracket_map a bitmap of which brackets we are inside while testing; this
3686 handles up to substring 31; after that we just have to take
3687 the less precise approach
3688 backref_map the back reference bitmap
3689
3690 Returns: TRUE or FALSE
3691 */
3692
3693 static BOOL
3694 is_startline(const uschar *code, unsigned int bracket_map,
3695 unsigned int backref_map)
3696 {
3697 do {
3698 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3699 FALSE);
3700 register int op = *scode;
3701
3702 /* Capturing brackets */
3703
3704 if (op > OP_BRA)
3705 {
3706 int new_map;
3707 op -= OP_BRA;
3708 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3709 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3710 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3711 }
3712
3713 /* Other brackets */
3714
3715 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3716 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3717
3718 /* .* means "start at start or after \n" if it isn't in brackets that
3719 may be referenced. */
3720
3721 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3722 {
3723 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3724 }
3725
3726 /* Check for explicit circumflex */
3727
3728 else if (op != OP_CIRC) return FALSE;
3729
3730 /* Move on to the next alternative */
3731
3732 code += GET(code, 1);
3733 }
3734 while (*code == OP_ALT); /* Loop for each alternative */
3735 return TRUE;
3736 }
3737
3738
3739
3740 /*************************************************
3741 * Check for asserted fixed first char *
3742 *************************************************/
3743
3744 /* During compilation, the "first char" settings from forward assertions are
3745 discarded, because they can cause conflicts with actual literals that follow.
3746 However, if we end up without a first char setting for an unanchored pattern,
3747 it is worth scanning the regex to see if there is an initial asserted first
3748 char. If all branches start with the same asserted char, or with a bracket all
3749 of whose alternatives start with the same asserted char (recurse ad lib), then
3750 we return that char, otherwise -1.
3751
3752 Arguments:
3753 code points to start of expression (the bracket)
3754 options pointer to the options (used to check casing changes)
3755 inassert TRUE if in an assertion
3756
3757 Returns: -1 or the fixed first char
3758 */
3759
3760 static int
3761 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3762 {
3763 register int c = -1;
3764 do {
3765 int d;
3766 const uschar *scode =
3767 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3768 register int op = *scode;
3769
3770 if (op >= OP_BRA) op = OP_BRA;
3771
3772 switch(op)
3773 {
3774 default:
3775 return -1;
3776
3777 case OP_BRA:
3778 case OP_ASSERT:
3779 case OP_ONCE:
3780 case OP_COND:
3781 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3782 return -1;
3783 if (c < 0) c = d; else if (c != d) return -1;
3784 break;
3785
3786 case OP_EXACT: /* Fall through */
3787 scode += 2;
3788
3789 case OP_CHAR:
3790 case OP_CHARNC:
3791 case OP_PLUS:
3792 case OP_MINPLUS:
3793 if (!inassert) return -1;
3794 if (c < 0)
3795 {
3796 c = scode[1];
3797 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3798 }
3799 else if (c != scode[1]) return -1;
3800 break;
3801 }
3802
3803 code += GET(code, 1);
3804 }
3805 while (*code == OP_ALT);
3806 return c;