eaa9550a7fc40a1cbab8a7cd4cc1cede104118cb
[exim.git] / src / src / pcre / pcre_compile.c
1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.3 2006/11/07 16:50:36 ph10 Exp $ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
11 Copyright (c) 1997-2006 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
45
46
47 #define NLBLOCK cd /* The block containing newline information */
48 #include "pcre_internal.h"
49
50
51 /* When DEBUG is defined, we need the pcre_printint() function, which is also
52 used by pcretest. DEBUG is not defined when building a production library. */
53
54 #ifdef DEBUG
55 #include "pcre_printint.src"
56 #endif
57
58
59
60 /*************************************************
61 * Code parameters and static tables *
62 *************************************************/
63
64 /* Maximum number of items on the nested bracket stacks at compile time. This
65 applies to the nesting of all kinds of parentheses. It does not limit
66 un-nested, non-capturing parentheses. This number can be made bigger if
67 necessary - it is used to dimension one int and one unsigned char vector at
68 compile time. */
69
70 #define BRASTACK_SIZE 200
71
72
73 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
74 are simple data values; negative values are for special things like \d and so
75 on. Zero means further processing is needed (for things like \x), or the escape
76 is invalid. */
77
78 #if !EBCDIC /* This is the "normal" table for ASCII systems */
79 static const short int escapes[] = {
80 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
81 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
82 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
83 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
84 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
85 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
86 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
87 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
88 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
89 0, 0, -ESC_z /* x - z */
90 };
91
92 #else /* This is the "abnormal" table for EBCDIC systems */
93 static const short int escapes[] = {
94 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
95 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
96 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
97 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
98 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
99 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
100 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
101 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
102 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
103 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
104 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
105 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
106 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
107 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
108 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
109 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
110 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
111 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
112 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
113 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
114 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
115 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
116 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
117 };
118 #endif
119
120
121 /* Tables of names of POSIX character classes and their lengths. The list is
122 terminated by a zero length entry. The first three must be alpha, lower, upper,
123 as this is assumed for handling case independence. */
124
125 static const char *const posix_names[] = {
126 "alpha", "lower", "upper",
127 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
128 "print", "punct", "space", "word", "xdigit" };
129
130 static const uschar posix_name_lengths[] = {
131 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
132
133 /* Table of class bit maps for each POSIX class. Each class is formed from a
134 base map, with an optional addition or removal of another map. Then, for some
135 classes, there is some additional tweaking: for [:blank:] the vertical space
136 characters are removed, and for [:alpha:] and [:alnum:] the underscore
137 character is removed. The triples in the table consist of the base map offset,
138 second map offset or -1 if no second map, and a non-negative value for map
139 addition or a negative value for map subtraction (if there are two maps). The
140 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
141 remove vertical space characters, 2 => remove underscore. */
142
143 static const int posix_class_maps[] = {
144 cbit_word, cbit_digit, -2, /* alpha */
145 cbit_lower, -1, 0, /* lower */
146 cbit_upper, -1, 0, /* upper */
147 cbit_word, -1, 2, /* alnum - word without underscore */
148 cbit_print, cbit_cntrl, 0, /* ascii */
149 cbit_space, -1, 1, /* blank - a GNU extension */
150 cbit_cntrl, -1, 0, /* cntrl */
151 cbit_digit, -1, 0, /* digit */
152 cbit_graph, -1, 0, /* graph */
153 cbit_print, -1, 0, /* print */
154 cbit_punct, -1, 0, /* punct */
155 cbit_space, -1, 0, /* space */
156 cbit_word, -1, 0, /* word - a Perl extension */
157 cbit_xdigit,-1, 0 /* xdigit */
158 };
159
160
161 /* The texts of compile-time error messages. These are "char *" because they
162 are passed to the outside world. */
163
164 static const char *error_texts[] = {
165 "no error",
166 "\\ at end of pattern",
167 "\\c at end of pattern",
168 "unrecognized character follows \\",
169 "numbers out of order in {} quantifier",
170 /* 5 */
171 "number too big in {} quantifier",
172 "missing terminating ] for character class",
173 "invalid escape sequence in character class",
174 "range out of order in character class",
175 "nothing to repeat",
176 /* 10 */
177 "operand of unlimited repeat could match the empty string",
178 "internal error: unexpected repeat",
179 "unrecognized character after (?",
180 "POSIX named classes are supported only within a class",
181 "missing )",
182 /* 15 */
183 "reference to non-existent subpattern",
184 "erroffset passed as NULL",
185 "unknown option bit(s) set",
186 "missing ) after comment",
187 "parentheses nested too deeply",
188 /* 20 */
189 "regular expression too large",
190 "failed to get memory",
191 "unmatched parentheses",
192 "internal error: code overflow",
193 "unrecognized character after (?<",
194 /* 25 */
195 "lookbehind assertion is not fixed length",
196 "malformed number or name after (?(",
197 "conditional group contains more than two branches",
198 "assertion expected after (?(",
199 "(?R or (?digits must be followed by )",
200 /* 30 */
201 "unknown POSIX class name",
202 "POSIX collating elements are not supported",
203 "this version of PCRE is not compiled with PCRE_UTF8 support",
204 "spare error",
205 "character value in \\x{...} sequence is too large",
206 /* 35 */
207 "invalid condition (?(0)",
208 "\\C not allowed in lookbehind assertion",
209 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
210 "number after (?C is > 255",
211 "closing ) for (?C expected",
212 /* 40 */
213 "recursive call could loop indefinitely",
214 "unrecognized character after (?P",
215 "syntax error after (?P",
216 "two named subpatterns have the same name",
217 "invalid UTF-8 string",
218 /* 45 */
219 "support for \\P, \\p, and \\X has not been compiled",
220 "malformed \\P or \\p sequence",
221 "unknown property name after \\P or \\p",
222 "subpattern name is too long (maximum 32 characters)",
223 "too many named subpatterns (maximum 10,000)",
224 /* 50 */
225 "repeated subpattern is too long",
226 "octal value is greater than \\377 (not in UTF-8 mode)"
227 };
228
229
230 /* Table to identify digits and hex digits. This is used when compiling
231 patterns. Note that the tables in chartables are dependent on the locale, and
232 may mark arbitrary characters as digits - but the PCRE compiling code expects
233 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
234 a private table here. It costs 256 bytes, but it is a lot faster than doing
235 character value tests (at least in some simple cases I timed), and in some
236 applications one wants PCRE to compile efficiently as well as match
237 efficiently.
238
239 For convenience, we use the same bit definitions as in chartables:
240
241 0x04 decimal digit
242 0x08 hexadecimal digit
243
244 Then we can use ctype_digit and ctype_xdigit in the code. */
245
246 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
247 static const unsigned char digitab[] =
248 {
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
255 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
256 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
257 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
261 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
262 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
281
282 #else /* This is the "abnormal" case, for EBCDIC systems */
283 static const unsigned char digitab[] =
284 {
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
301 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
309 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
315 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
316 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
317
318 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
319 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
320 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
321 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
323 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
327 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
328 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
330 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
332 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
335 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
336 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
337 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
338 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
339 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
340 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
341 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
342 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
343 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
344 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
345 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
346 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
347 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
348 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
349 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
350 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
351 #endif
352
353
354 /* Definition to allow mutual recursion */
355
356 static BOOL
357 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
358 int *, int *, branch_chain *, compile_data *);
359
360
361
362 /*************************************************
363 * Handle escapes *
364 *************************************************/
365
366 /* This function is called when a \ has been encountered. It either returns a
367 positive value for a simple escape such as \n, or a negative value which
368 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
369 a positive value greater than 255 may be returned. On entry, ptr is pointing at
370 the \. On exit, it is on the final character of the escape sequence.
371
372 Arguments:
373 ptrptr points to the pattern position pointer
374 errorcodeptr points to the errorcode variable
375 bracount number of previous extracting brackets
376 options the options bits
377 isclass TRUE if inside a character class
378
379 Returns: zero or positive => a data character
380 negative => a special escape sequence
381 on error, errorptr is set
382 */
383
384 static int
385 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
386 int options, BOOL isclass)
387 {
388 BOOL utf8 = (options & PCRE_UTF8) != 0;
389 const uschar *ptr = *ptrptr + 1;
390 int c, i;
391
392 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
393 ptr--; /* Set pointer back to the last byte */
394
395 /* If backslash is at the end of the pattern, it's an error. */
396
397 if (c == 0) *errorcodeptr = ERR1;
398
399 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
400 a table. A non-zero result is something that can be returned immediately.
401 Otherwise further processing may be required. */
402
403 #if !EBCDIC /* ASCII coding */
404 else if (c < '0' || c > 'z') {} /* Not alphameric */
405 else if ((i = escapes[c - '0']) != 0) c = i;
406
407 #else /* EBCDIC coding */
408 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
409 else if ((i = escapes[c - 0x48]) != 0) c = i;
410 #endif
411
412 /* Escapes that need further processing, or are illegal. */
413
414 else
415 {
416 const uschar *oldptr;
417 switch (c)
418 {
419 /* A number of Perl escapes are not handled by PCRE. We give an explicit
420 error. */
421
422 case 'l':
423 case 'L':
424 case 'N':
425 case 'u':
426 case 'U':
427 *errorcodeptr = ERR37;
428 break;
429
430 /* The handling of escape sequences consisting of a string of digits
431 starting with one that is not zero is not straightforward. By experiment,
432 the way Perl works seems to be as follows:
433
434 Outside a character class, the digits are read as a decimal number. If the
435 number is less than 10, or if there are that many previous extracting
436 left brackets, then it is a back reference. Otherwise, up to three octal
437 digits are read to form an escaped byte. Thus \123 is likely to be octal
438 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
439 value is greater than 377, the least significant 8 bits are taken. Inside a
440 character class, \ followed by a digit is always an octal number. */
441
442 case '1': case '2': case '3': case '4': case '5':
443 case '6': case '7': case '8': case '9':
444
445 if (!isclass)
446 {
447 oldptr = ptr;
448 c -= '0';
449 while ((digitab[ptr[1]] & ctype_digit) != 0)
450 c = c * 10 + *(++ptr) - '0';
451 if (c < 10 || c <= bracount)
452 {
453 c = -(ESC_REF + c);
454 break;
455 }
456 ptr = oldptr; /* Put the pointer back and fall through */
457 }
458
459 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
460 generates a binary zero byte and treats the digit as a following literal.
461 Thus we have to pull back the pointer by one. */
462
463 if ((c = *ptr) >= '8')
464 {
465 ptr--;
466 c = 0;
467 break;
468 }
469
470 /* \0 always starts an octal number, but we may drop through to here with a
471 larger first octal digit. The original code used just to take the least
472 significant 8 bits of octal numbers (I think this is what early Perls used
473 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
474 than 3 octal digits. */
475
476 case '0':
477 c -= '0';
478 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
479 c = c * 8 + *(++ptr) - '0';
480 if (!utf8 && c > 255) *errorcodeptr = ERR51;
481 break;
482
483 /* \x is complicated. \x{ddd} is a character number which can be greater
484 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
485 treated as a data character. */
486
487 case 'x':
488 if (ptr[1] == '{')
489 {
490 const uschar *pt = ptr + 2;
491 int count = 0;
492
493 c = 0;
494 while ((digitab[*pt] & ctype_xdigit) != 0)
495 {
496 register int cc = *pt++;
497 if (c == 0 && cc == '0') continue; /* Leading zeroes */
498 count++;
499
500 #if !EBCDIC /* ASCII coding */
501 if (cc >= 'a') cc -= 32; /* Convert to upper case */
502 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
503 #else /* EBCDIC coding */
504 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
505 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
506 #endif
507 }
508
509 if (*pt == '}')
510 {
511 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
512 ptr = pt;
513 break;
514 }
515
516 /* If the sequence of hex digits does not end with '}', then we don't
517 recognize this construct; fall through to the normal \x handling. */
518 }
519
520 /* Read just a single-byte hex-defined char */
521
522 c = 0;
523 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
524 {
525 int cc; /* Some compilers don't like ++ */
526 cc = *(++ptr); /* in initializers */
527 #if !EBCDIC /* ASCII coding */
528 if (cc >= 'a') cc -= 32; /* Convert to upper case */
529 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
530 #else /* EBCDIC coding */
531 if (cc <= 'z') cc += 64; /* Convert to upper case */
532 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
533 #endif
534 }
535 break;
536
537 /* Other special escapes not starting with a digit are straightforward */
538
539 case 'c':
540 c = *(++ptr);
541 if (c == 0)
542 {
543 *errorcodeptr = ERR2;
544 return 0;
545 }
546
547 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
548 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
549 (However, an EBCDIC equivalent has now been added.) */
550
551 #if !EBCDIC /* ASCII coding */
552 if (c >= 'a' && c <= 'z') c -= 32;
553 c ^= 0x40;
554 #else /* EBCDIC coding */
555 if (c >= 'a' && c <= 'z') c += 64;
556 c ^= 0xC0;
557 #endif
558 break;
559
560 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
561 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
562 for Perl compatibility, it is a literal. This code looks a bit odd, but
563 there used to be some cases other than the default, and there may be again
564 in future, so I haven't "optimized" it. */
565
566 default:
567 if ((options & PCRE_EXTRA) != 0) switch(c)
568 {
569 default:
570 *errorcodeptr = ERR3;
571 break;
572 }
573 break;
574 }
575 }
576
577 *ptrptr = ptr;
578 return c;
579 }
580
581
582
583 #ifdef SUPPORT_UCP
584 /*************************************************
585 * Handle \P and \p *
586 *************************************************/
587
588 /* This function is called after \P or \p has been encountered, provided that
589 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
590 pointing at the P or p. On exit, it is pointing at the final character of the
591 escape sequence.
592
593 Argument:
594 ptrptr points to the pattern position pointer
595 negptr points to a boolean that is set TRUE for negation else FALSE
596 dptr points to an int that is set to the detailed property value
597 errorcodeptr points to the error code variable
598
599 Returns: type value from ucp_type_table, or -1 for an invalid type
600 */
601
602 static int
603 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
604 {
605 int c, i, bot, top;
606 const uschar *ptr = *ptrptr;
607 char name[32];
608
609 c = *(++ptr);
610 if (c == 0) goto ERROR_RETURN;
611
612 *negptr = FALSE;
613
614 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
615 negation. */
616
617 if (c == '{')
618 {
619 if (ptr[1] == '^')
620 {
621 *negptr = TRUE;
622 ptr++;
623 }
624 for (i = 0; i < sizeof(name) - 1; i++)
625 {
626 c = *(++ptr);
627 if (c == 0) goto ERROR_RETURN;
628 if (c == '}') break;
629 name[i] = c;
630 }
631 if (c !='}') goto ERROR_RETURN;
632 name[i] = 0;
633 }
634
635 /* Otherwise there is just one following character */
636
637 else
638 {
639 name[0] = c;
640 name[1] = 0;
641 }
642
643 *ptrptr = ptr;
644
645 /* Search for a recognized property name using binary chop */
646
647 bot = 0;
648 top = _pcre_utt_size;
649
650 while (bot < top)
651 {
652 i = (bot + top) >> 1;
653 c = strcmp(name, _pcre_utt[i].name);
654 if (c == 0)
655 {
656 *dptr = _pcre_utt[i].value;
657 return _pcre_utt[i].type;
658 }
659 if (c > 0) bot = i + 1; else top = i;
660 }
661
662 *errorcodeptr = ERR47;
663 *ptrptr = ptr;
664 return -1;
665
666 ERROR_RETURN:
667 *errorcodeptr = ERR46;
668 *ptrptr = ptr;
669 return -1;
670 }
671 #endif
672
673
674
675
676 /*************************************************
677 * Check for counted repeat *
678 *************************************************/
679
680 /* This function is called when a '{' is encountered in a place where it might
681 start a quantifier. It looks ahead to see if it really is a quantifier or not.
682 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
683 where the ddds are digits.
684
685 Arguments:
686 p pointer to the first char after '{'
687
688 Returns: TRUE or FALSE
689 */
690
691 static BOOL
692 is_counted_repeat(const uschar *p)
693 {
694 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
695 while ((digitab[*p] & ctype_digit) != 0) p++;
696 if (*p == '}') return TRUE;
697
698 if (*p++ != ',') return FALSE;
699 if (*p == '}') return TRUE;
700
701 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
702 while ((digitab[*p] & ctype_digit) != 0) p++;
703
704 return (*p == '}');
705 }
706
707
708
709 /*************************************************
710 * Read repeat counts *
711 *************************************************/
712
713 /* Read an item of the form {n,m} and return the values. This is called only
714 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
715 so the syntax is guaranteed to be correct, but we need to check the values.
716
717 Arguments:
718 p pointer to first char after '{'
719 minp pointer to int for min
720 maxp pointer to int for max
721 returned as -1 if no max
722 errorcodeptr points to error code variable
723
724 Returns: pointer to '}' on success;
725 current ptr on error, with errorcodeptr set non-zero
726 */
727
728 static const uschar *
729 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
730 {
731 int min = 0;
732 int max = -1;
733
734 /* Read the minimum value and do a paranoid check: a negative value indicates
735 an integer overflow. */
736
737 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
738 if (min < 0 || min > 65535)
739 {
740 *errorcodeptr = ERR5;
741 return p;
742 }
743
744 /* Read the maximum value if there is one, and again do a paranoid on its size.
745 Also, max must not be less than min. */
746
747 if (*p == '}') max = min; else
748 {
749 if (*(++p) != '}')
750 {
751 max = 0;
752 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
753 if (max < 0 || max > 65535)
754 {
755 *errorcodeptr = ERR5;
756 return p;
757 }
758 if (max < min)
759 {
760 *errorcodeptr = ERR4;
761 return p;
762 }
763 }
764 }
765
766 /* Fill in the required variables, and pass back the pointer to the terminating
767 '}'. */
768
769 *minp = min;
770 *maxp = max;
771 return p;
772 }
773
774
775
776 /*************************************************
777 * Find forward referenced named subpattern *
778 *************************************************/
779
780 /* This function scans along a pattern looking for capturing subpatterns, and
781 counting them. If it finds a named pattern that matches the name it is given,
782 it returns its number. This is used for forward references to named
783 subpatterns. We know that if (?P< is encountered, the name will be terminated
784 by '>' because that is checked in the first pass.
785
786 Arguments:
787 pointer current position in the pattern
788 count current count of capturing parens
789 name name to seek
790 namelen name length
791
792 Returns: the number of the named subpattern, or -1 if not found
793 */
794
795 static int
796 find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)
797 {
798 const uschar *thisname;
799 for (; *ptr != 0; ptr++)
800 {
801 if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }
802 if (*ptr != '(') continue;
803 if (ptr[1] != '?') { count++; continue; }
804 if (ptr[2] == '(') { ptr += 2; continue; }
805 if (ptr[2] != 'P' || ptr[3] != '<') continue;
806 count++;
807 ptr += 4;
808 thisname = ptr;
809 while (*ptr != '>') ptr++;
810 if (namelen == ptr - thisname &&
811 strncmp((char *)name, (char*)thisname, namelen) == 0)
812 return count;
813 }
814 return -1;
815 }
816
817
818
819 /*************************************************
820 * Find first significant op code *
821 *************************************************/
822
823 /* This is called by several functions that scan a compiled expression looking
824 for a fixed first character, or an anchoring op code etc. It skips over things
825 that do not influence this. For some calls, a change of option is important.
826 For some calls, it makes sense to skip negative forward and all backward
827 assertions, and also the \b assertion; for others it does not.
828
829 Arguments:
830 code pointer to the start of the group
831 options pointer to external options
832 optbit the option bit whose changing is significant, or
833 zero if none are
834 skipassert TRUE if certain assertions are to be skipped
835
836 Returns: pointer to the first significant opcode
837 */
838
839 static const uschar*
840 first_significant_code(const uschar *code, int *options, int optbit,
841 BOOL skipassert)
842 {
843 for (;;)
844 {
845 switch ((int)*code)
846 {
847 case OP_OPT:
848 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
849 *options = (int)code[1];
850 code += 2;
851 break;
852
853 case OP_ASSERT_NOT:
854 case OP_ASSERTBACK:
855 case OP_ASSERTBACK_NOT:
856 if (!skipassert) return code;
857 do code += GET(code, 1); while (*code == OP_ALT);
858 code += _pcre_OP_lengths[*code];
859 break;
860
861 case OP_WORD_BOUNDARY:
862 case OP_NOT_WORD_BOUNDARY:
863 if (!skipassert) return code;
864 /* Fall through */
865
866 case OP_CALLOUT:
867 case OP_CREF:
868 case OP_BRANUMBER:
869 code += _pcre_OP_lengths[*code];
870 break;
871
872 default:
873 return code;
874 }
875 }
876 /* Control never reaches here */
877 }
878
879
880
881
882 /*************************************************
883 * Find the fixed length of a pattern *
884 *************************************************/
885
886 /* Scan a pattern and compute the fixed length of subject that will match it,
887 if the length is fixed. This is needed for dealing with backward assertions.
888 In UTF8 mode, the result is in characters rather than bytes.
889
890 Arguments:
891 code points to the start of the pattern (the bracket)
892 options the compiling options
893
894 Returns: the fixed length, or -1 if there is no fixed length,
895 or -2 if \C was encountered
896 */
897
898 static int
899 find_fixedlength(uschar *code, int options)
900 {
901 int length = -1;
902
903 register int branchlength = 0;
904 register uschar *cc = code + 1 + LINK_SIZE;
905
906 /* Scan along the opcodes for this branch. If we get to the end of the
907 branch, check the length against that of the other branches. */
908
909 for (;;)
910 {
911 int d;
912 register int op = *cc;
913 if (op >= OP_BRA) op = OP_BRA;
914
915 switch (op)
916 {
917 case OP_BRA:
918 case OP_ONCE:
919 case OP_COND:
920 d = find_fixedlength(cc, options);
921 if (d < 0) return d;
922 branchlength += d;
923 do cc += GET(cc, 1); while (*cc == OP_ALT);
924 cc += 1 + LINK_SIZE;
925 break;
926
927 /* Reached end of a branch; if it's a ket it is the end of a nested
928 call. If it's ALT it is an alternation in a nested call. If it is
929 END it's the end of the outer call. All can be handled by the same code. */
930
931 case OP_ALT:
932 case OP_KET:
933 case OP_KETRMAX:
934 case OP_KETRMIN:
935 case OP_END:
936 if (length < 0) length = branchlength;
937 else if (length != branchlength) return -1;
938 if (*cc != OP_ALT) return length;
939 cc += 1 + LINK_SIZE;
940 branchlength = 0;
941 break;
942
943 /* Skip over assertive subpatterns */
944
945 case OP_ASSERT:
946 case OP_ASSERT_NOT:
947 case OP_ASSERTBACK:
948 case OP_ASSERTBACK_NOT:
949 do cc += GET(cc, 1); while (*cc == OP_ALT);
950 /* Fall through */
951
952 /* Skip over things that don't match chars */
953
954 case OP_REVERSE:
955 case OP_BRANUMBER:
956 case OP_CREF:
957 case OP_OPT:
958 case OP_CALLOUT:
959 case OP_SOD:
960 case OP_SOM:
961 case OP_EOD:
962 case OP_EODN:
963 case OP_CIRC:
964 case OP_DOLL:
965 case OP_NOT_WORD_BOUNDARY:
966 case OP_WORD_BOUNDARY:
967 cc += _pcre_OP_lengths[*cc];
968 break;
969
970 /* Handle literal characters */
971
972 case OP_CHAR:
973 case OP_CHARNC:
974 case OP_NOT:
975 branchlength++;
976 cc += 2;
977 #ifdef SUPPORT_UTF8
978 if ((options & PCRE_UTF8) != 0)
979 {
980 while ((*cc & 0xc0) == 0x80) cc++;
981 }
982 #endif
983 break;
984
985 /* Handle exact repetitions. The count is already in characters, but we
986 need to skip over a multibyte character in UTF8 mode. */
987
988 case OP_EXACT:
989 branchlength += GET2(cc,1);
990 cc += 4;
991 #ifdef SUPPORT_UTF8
992 if ((options & PCRE_UTF8) != 0)
993 {
994 while((*cc & 0x80) == 0x80) cc++;
995 }
996 #endif
997 break;
998
999 case OP_TYPEEXACT:
1000 branchlength += GET2(cc,1);
1001 cc += 4;
1002 break;
1003
1004 /* Handle single-char matchers */
1005
1006 case OP_PROP:
1007 case OP_NOTPROP:
1008 cc += 2;
1009 /* Fall through */
1010
1011 case OP_NOT_DIGIT:
1012 case OP_DIGIT:
1013 case OP_NOT_WHITESPACE:
1014 case OP_WHITESPACE:
1015 case OP_NOT_WORDCHAR:
1016 case OP_WORDCHAR:
1017 case OP_ANY:
1018 branchlength++;
1019 cc++;
1020 break;
1021
1022 /* The single-byte matcher isn't allowed */
1023
1024 case OP_ANYBYTE:
1025 return -2;
1026
1027 /* Check a class for variable quantification */
1028
1029 #ifdef SUPPORT_UTF8
1030 case OP_XCLASS:
1031 cc += GET(cc, 1) - 33;
1032 /* Fall through */
1033 #endif
1034
1035 case OP_CLASS:
1036 case OP_NCLASS:
1037 cc += 33;
1038
1039 switch (*cc)
1040 {
1041 case OP_CRSTAR:
1042 case OP_CRMINSTAR:
1043 case OP_CRQUERY:
1044 case OP_CRMINQUERY:
1045 return -1;
1046
1047 case OP_CRRANGE:
1048 case OP_CRMINRANGE:
1049 if (GET2(cc,1) != GET2(cc,3)) return -1;
1050 branchlength += GET2(cc,1);
1051 cc += 5;
1052 break;
1053
1054 default:
1055 branchlength++;
1056 }
1057 break;
1058
1059 /* Anything else is variable length */
1060
1061 default:
1062 return -1;
1063 }
1064 }
1065 /* Control never gets here */
1066 }
1067
1068
1069
1070
1071 /*************************************************
1072 * Scan compiled regex for numbered bracket *
1073 *************************************************/
1074
1075 /* This little function scans through a compiled pattern until it finds a
1076 capturing bracket with the given number.
1077
1078 Arguments:
1079 code points to start of expression
1080 utf8 TRUE in UTF-8 mode
1081 number the required bracket number
1082
1083 Returns: pointer to the opcode for the bracket, or NULL if not found
1084 */
1085
1086 static const uschar *
1087 find_bracket(const uschar *code, BOOL utf8, int number)
1088 {
1089 for (;;)
1090 {
1091 register int c = *code;
1092 if (c == OP_END) return NULL;
1093
1094 /* XCLASS is used for classes that cannot be represented just by a bit
1095 map. This includes negated single high-valued characters. The length in
1096 the table is zero; the actual length is stored in the compiled code. */
1097
1098 if (c == OP_XCLASS) code += GET(code, 1);
1099
1100 /* Handle bracketed group */
1101
1102 else if (c > OP_BRA)
1103 {
1104 int n = c - OP_BRA;
1105 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1106 if (n == number) return (uschar *)code;
1107 code += _pcre_OP_lengths[OP_BRA];
1108 }
1109
1110 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1111 that are followed by a character may be followed by a multi-byte character.
1112 The length in the table is a minimum, so we have to scan along to skip the
1113 extra bytes. All opcodes are less than 128, so we can use relatively
1114 efficient code. */
1115
1116 else
1117 {
1118 code += _pcre_OP_lengths[c];
1119 if (utf8) switch(c)
1120 {
1121 case OP_CHAR:
1122 case OP_CHARNC:
1123 case OP_EXACT:
1124 case OP_UPTO:
1125 case OP_MINUPTO:
1126 case OP_STAR:
1127 case OP_MINSTAR:
1128 case OP_PLUS:
1129 case OP_MINPLUS:
1130 case OP_QUERY:
1131 case OP_MINQUERY:
1132 while ((*code & 0xc0) == 0x80) code++;
1133 break;
1134 }
1135 }
1136 }
1137 }
1138
1139
1140
1141 /*************************************************
1142 * Scan compiled regex for recursion reference *
1143 *************************************************/
1144
1145 /* This little function scans through a compiled pattern until it finds an
1146 instance of OP_RECURSE.
1147
1148 Arguments:
1149 code points to start of expression
1150 utf8 TRUE in UTF-8 mode
1151
1152 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1153 */
1154
1155 static const uschar *
1156 find_recurse(const uschar *code, BOOL utf8)
1157 {
1158 for (;;)
1159 {
1160 register int c = *code;
1161 if (c == OP_END) return NULL;
1162 if (c == OP_RECURSE) return code;
1163
1164 /* XCLASS is used for classes that cannot be represented just by a bit
1165 map. This includes negated single high-valued characters. The length in
1166 the table is zero; the actual length is stored in the compiled code. */
1167
1168 if (c == OP_XCLASS) code += GET(code, 1);
1169
1170 /* All bracketed groups have the same length. */
1171
1172 else if (c > OP_BRA)
1173 {
1174 code += _pcre_OP_lengths[OP_BRA];
1175 }
1176
1177 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1178 that are followed by a character may be followed by a multi-byte character.
1179 The length in the table is a minimum, so we have to scan along to skip the
1180 extra bytes. All opcodes are less than 128, so we can use relatively
1181 efficient code. */
1182
1183 else
1184 {
1185 code += _pcre_OP_lengths[c];
1186 if (utf8) switch(c)
1187 {
1188 case OP_CHAR:
1189 case OP_CHARNC:
1190 case OP_EXACT:
1191 case OP_UPTO:
1192 case OP_MINUPTO:
1193 case OP_STAR:
1194 case OP_MINSTAR:
1195 case OP_PLUS:
1196 case OP_MINPLUS:
1197 case OP_QUERY:
1198 case OP_MINQUERY:
1199 while ((*code & 0xc0) == 0x80) code++;
1200 break;
1201 }
1202 }
1203 }
1204 }
1205
1206
1207
1208 /*************************************************
1209 * Scan compiled branch for non-emptiness *
1210 *************************************************/
1211
1212 /* This function scans through a branch of a compiled pattern to see whether it
1213 can match the empty string or not. It is called only from could_be_empty()
1214 below. Note that first_significant_code() skips over assertions. If we hit an
1215 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1216 whose current branch will already have been scanned.
1217
1218 Arguments:
1219 code points to start of search
1220 endcode points to where to stop
1221 utf8 TRUE if in UTF8 mode
1222
1223 Returns: TRUE if what is matched could be empty
1224 */
1225
1226 static BOOL
1227 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1228 {
1229 register int c;
1230 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1231 code < endcode;
1232 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1233 {
1234 const uschar *ccode;
1235
1236 c = *code;
1237
1238 if (c >= OP_BRA)
1239 {
1240 BOOL empty_branch;
1241 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1242
1243 /* Scan a closed bracket */
1244
1245 empty_branch = FALSE;
1246 do
1247 {
1248 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1249 empty_branch = TRUE;
1250 code += GET(code, 1);
1251 }
1252 while (*code == OP_ALT);
1253 if (!empty_branch) return FALSE; /* All branches are non-empty */
1254 code += 1 + LINK_SIZE;
1255 c = *code;
1256 }
1257
1258 else switch (c)
1259 {
1260 /* Check for quantifiers after a class */
1261
1262 #ifdef SUPPORT_UTF8
1263 case OP_XCLASS:
1264 ccode = code + GET(code, 1);
1265 goto CHECK_CLASS_REPEAT;
1266 #endif
1267
1268 case OP_CLASS:
1269 case OP_NCLASS:
1270 ccode = code + 33;
1271
1272 #ifdef SUPPORT_UTF8
1273 CHECK_CLASS_REPEAT:
1274 #endif
1275
1276 switch (*ccode)
1277 {
1278 case OP_CRSTAR: /* These could be empty; continue */
1279 case OP_CRMINSTAR:
1280 case OP_CRQUERY:
1281 case OP_CRMINQUERY:
1282 break;
1283
1284 default: /* Non-repeat => class must match */
1285 case OP_CRPLUS: /* These repeats aren't empty */
1286 case OP_CRMINPLUS:
1287 return FALSE;
1288
1289 case OP_CRRANGE:
1290 case OP_CRMINRANGE:
1291 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1292 break;
1293 }
1294 break;
1295
1296 /* Opcodes that must match a character */
1297
1298 case OP_PROP:
1299 case OP_NOTPROP:
1300 case OP_EXTUNI:
1301 case OP_NOT_DIGIT:
1302 case OP_DIGIT:
1303 case OP_NOT_WHITESPACE:
1304 case OP_WHITESPACE:
1305 case OP_NOT_WORDCHAR:
1306 case OP_WORDCHAR:
1307 case OP_ANY:
1308 case OP_ANYBYTE:
1309 case OP_CHAR:
1310 case OP_CHARNC:
1311 case OP_NOT:
1312 case OP_PLUS:
1313 case OP_MINPLUS:
1314 case OP_EXACT:
1315 case OP_NOTPLUS:
1316 case OP_NOTMINPLUS:
1317 case OP_NOTEXACT:
1318 case OP_TYPEPLUS:
1319 case OP_TYPEMINPLUS:
1320 case OP_TYPEEXACT:
1321 return FALSE;
1322
1323 /* End of branch */
1324
1325 case OP_KET:
1326 case OP_KETRMAX:
1327 case OP_KETRMIN:
1328 case OP_ALT:
1329 return TRUE;
1330
1331 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1332 followed by a multibyte character */
1333
1334 #ifdef SUPPORT_UTF8
1335 case OP_STAR:
1336 case OP_MINSTAR:
1337 case OP_QUERY:
1338 case OP_MINQUERY:
1339 case OP_UPTO:
1340 case OP_MINUPTO:
1341 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1342 break;
1343 #endif
1344 }
1345 }
1346
1347 return TRUE;
1348 }
1349
1350
1351
1352 /*************************************************
1353 * Scan compiled regex for non-emptiness *
1354 *************************************************/
1355
1356 /* This function is called to check for left recursive calls. We want to check
1357 the current branch of the current pattern to see if it could match the empty
1358 string. If it could, we must look outwards for branches at other levels,
1359 stopping when we pass beyond the bracket which is the subject of the recursion.
1360
1361 Arguments:
1362 code points to start of the recursion
1363 endcode points to where to stop (current RECURSE item)
1364 bcptr points to the chain of current (unclosed) branch starts
1365 utf8 TRUE if in UTF-8 mode
1366
1367 Returns: TRUE if what is matched could be empty
1368 */
1369
1370 static BOOL
1371 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1372 BOOL utf8)
1373 {
1374 while (bcptr != NULL && bcptr->current >= code)
1375 {
1376 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1377 bcptr = bcptr->outer;
1378 }
1379 return TRUE;
1380 }
1381
1382
1383
1384 /*************************************************
1385 * Check for POSIX class syntax *
1386 *************************************************/
1387
1388 /* This function is called when the sequence "[:" or "[." or "[=" is
1389 encountered in a character class. It checks whether this is followed by an
1390 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1391 ".]" or "=]".
1392
1393 Argument:
1394 ptr pointer to the initial [
1395 endptr where to return the end pointer
1396 cd pointer to compile data
1397
1398 Returns: TRUE or FALSE
1399 */
1400
1401 static BOOL
1402 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1403 {
1404 int terminator; /* Don't combine these lines; the Solaris cc */
1405 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1406 if (*(++ptr) == '^') ptr++;
1407 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1408 if (*ptr == terminator && ptr[1] == ']')
1409 {
1410 *endptr = ptr;
1411 return TRUE;
1412 }
1413 return FALSE;
1414 }
1415
1416
1417
1418
1419 /*************************************************
1420 * Check POSIX class name *
1421 *************************************************/
1422
1423 /* This function is called to check the name given in a POSIX-style class entry
1424 such as [:alnum:].
1425
1426 Arguments:
1427 ptr points to the first letter
1428 len the length of the name
1429
1430 Returns: a value representing the name, or -1 if unknown
1431 */
1432
1433 static int
1434 check_posix_name(const uschar *ptr, int len)
1435 {
1436 register int yield = 0;
1437 while (posix_name_lengths[yield] != 0)
1438 {
1439 if (len == posix_name_lengths[yield] &&
1440 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1441 yield++;
1442 }
1443 return -1;
1444 }
1445
1446
1447 /*************************************************
1448 * Adjust OP_RECURSE items in repeated group *
1449 *************************************************/
1450
1451 /* OP_RECURSE items contain an offset from the start of the regex to the group
1452 that is referenced. This means that groups can be replicated for fixed
1453 repetition simply by copying (because the recursion is allowed to refer to
1454 earlier groups that are outside the current group). However, when a group is
1455 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1456 it, after it has been compiled. This means that any OP_RECURSE items within it
1457 that refer to the group itself or any contained groups have to have their
1458 offsets adjusted. That is the job of this function. Before it is called, the
1459 partially compiled regex must be temporarily terminated with OP_END.
1460
1461 Arguments:
1462 group points to the start of the group
1463 adjust the amount by which the group is to be moved
1464 utf8 TRUE in UTF-8 mode
1465 cd contains pointers to tables etc.
1466
1467 Returns: nothing
1468 */
1469
1470 static void
1471 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1472 {
1473 uschar *ptr = group;
1474 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1475 {
1476 int offset = GET(ptr, 1);
1477 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1478 ptr += 1 + LINK_SIZE;
1479 }
1480 }
1481
1482
1483
1484 /*************************************************
1485 * Insert an automatic callout point *
1486 *************************************************/
1487
1488 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1489 callout points before each pattern item.
1490
1491 Arguments:
1492 code current code pointer
1493 ptr current pattern pointer
1494 cd pointers to tables etc
1495
1496 Returns: new code pointer
1497 */
1498
1499 static uschar *
1500 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1501 {
1502 *code++ = OP_CALLOUT;
1503 *code++ = 255;
1504 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1505 PUT(code, LINK_SIZE, 0); /* Default length */
1506 return code + 2*LINK_SIZE;
1507 }
1508
1509
1510
1511 /*************************************************
1512 * Complete a callout item *
1513 *************************************************/
1514
1515 /* A callout item contains the length of the next item in the pattern, which
1516 we can't fill in till after we have reached the relevant point. This is used
1517 for both automatic and manual callouts.
1518
1519 Arguments:
1520 previous_callout points to previous callout item
1521 ptr current pattern pointer
1522 cd pointers to tables etc
1523
1524 Returns: nothing
1525 */
1526
1527 static void
1528 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1529 {
1530 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1531 PUT(previous_callout, 2 + LINK_SIZE, length);
1532 }
1533
1534
1535
1536 #ifdef SUPPORT_UCP
1537 /*************************************************
1538 * Get othercase range *
1539 *************************************************/
1540
1541 /* This function is passed the start and end of a class range, in UTF-8 mode
1542 with UCP support. It searches up the characters, looking for internal ranges of
1543 characters in the "other" case. Each call returns the next one, updating the
1544 start address.
1545
1546 Arguments:
1547 cptr points to starting character value; updated
1548 d end value
1549 ocptr where to put start of othercase range
1550 odptr where to put end of othercase range
1551
1552 Yield: TRUE when range returned; FALSE when no more
1553 */
1554
1555 static BOOL
1556 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1557 {
1558 int c, othercase, next;
1559
1560 for (c = *cptr; c <= d; c++)
1561 { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }
1562
1563 if (c > d) return FALSE;
1564
1565 *ocptr = othercase;
1566 next = othercase + 1;
1567
1568 for (++c; c <= d; c++)
1569 {
1570 if (_pcre_ucp_othercase(c) != next) break;
1571 next++;
1572 }
1573
1574 *odptr = next - 1;
1575 *cptr = c;
1576
1577 return TRUE;
1578 }
1579 #endif /* SUPPORT_UCP */
1580
1581
1582 /*************************************************
1583 * Compile one branch *
1584 *************************************************/
1585
1586 /* Scan the pattern, compiling it into the code vector. If the options are
1587 changed during the branch, the pointer is used to change the external options
1588 bits.
1589
1590 Arguments:
1591 optionsptr pointer to the option bits
1592 brackets points to number of extracting brackets used
1593 codeptr points to the pointer to the current code point
1594 ptrptr points to the current pattern pointer
1595 errorcodeptr points to error code variable
1596 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1597 reqbyteptr set to the last literal character required, else < 0
1598 bcptr points to current branch chain
1599 cd contains pointers to tables etc.
1600
1601 Returns: TRUE on success
1602 FALSE, with *errorcodeptr set non-zero on error
1603 */
1604
1605 static BOOL
1606 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1607 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1608 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1609 {
1610 int repeat_type, op_type;
1611 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1612 int bravalue = 0;
1613 int greedy_default, greedy_non_default;
1614 int firstbyte, reqbyte;
1615 int zeroreqbyte, zerofirstbyte;
1616 int req_caseopt, reqvary, tempreqvary;
1617 int options = *optionsptr;
1618 int after_manual_callout = 0;
1619 register int c;
1620 register uschar *code = *codeptr;
1621 uschar *tempcode;
1622 BOOL inescq = FALSE;
1623 BOOL groupsetfirstbyte = FALSE;
1624 const uschar *ptr = *ptrptr;
1625 const uschar *tempptr;
1626 uschar *previous = NULL;
1627 uschar *previous_callout = NULL;
1628 uschar classbits[32];
1629
1630 #ifdef SUPPORT_UTF8
1631 BOOL class_utf8;
1632 BOOL utf8 = (options & PCRE_UTF8) != 0;
1633 uschar *class_utf8data;
1634 uschar utf8_char[6];
1635 #else
1636 BOOL utf8 = FALSE;
1637 #endif
1638
1639 /* Set up the default and non-default settings for greediness */
1640
1641 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1642 greedy_non_default = greedy_default ^ 1;
1643
1644 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1645 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1646 matches a non-fixed char first char; reqbyte just remains unset if we never
1647 find one.
1648
1649 When we hit a repeat whose minimum is zero, we may have to adjust these values
1650 to take the zero repeat into account. This is implemented by setting them to
1651 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1652 item types that can be repeated set these backoff variables appropriately. */
1653
1654 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1655
1656 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1657 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1658 value > 255. It is added into the firstbyte or reqbyte variables to record the
1659 case status of the value. This is used only for ASCII characters. */
1660
1661 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1662
1663 /* Switch on next character until the end of the branch */
1664
1665 for (;; ptr++)
1666 {
1667 BOOL negate_class;
1668 BOOL possessive_quantifier;
1669 BOOL is_quantifier;
1670 int class_charcount;
1671 int class_lastchar;
1672 int newoptions;
1673 int recno;
1674 int skipbytes;
1675 int subreqbyte;
1676 int subfirstbyte;
1677 int mclength;
1678 uschar mcbuffer[8];
1679
1680 /* Next byte in the pattern */
1681
1682 c = *ptr;
1683
1684 /* If in \Q...\E, check for the end; if not, we have a literal */
1685
1686 if (inescq && c != 0)
1687 {
1688 if (c == '\\' && ptr[1] == 'E')
1689 {
1690 inescq = FALSE;
1691 ptr++;
1692 continue;
1693 }
1694 else
1695 {
1696 if (previous_callout != NULL)
1697 {
1698 complete_callout(previous_callout, ptr, cd);
1699 previous_callout = NULL;
1700 }
1701 if ((options & PCRE_AUTO_CALLOUT) != 0)
1702 {
1703 previous_callout = code;
1704 code = auto_callout(code, ptr, cd);
1705 }
1706 goto NORMAL_CHAR;
1707 }
1708 }
1709
1710 /* Fill in length of a previous callout, except when the next thing is
1711 a quantifier. */
1712
1713 is_quantifier = c == '*' || c == '+' || c == '?' ||
1714 (c == '{' && is_counted_repeat(ptr+1));
1715
1716 if (!is_quantifier && previous_callout != NULL &&
1717 after_manual_callout-- <= 0)
1718 {
1719 complete_callout(previous_callout, ptr, cd);
1720 previous_callout = NULL;
1721 }
1722
1723 /* In extended mode, skip white space and comments */
1724
1725 if ((options & PCRE_EXTENDED) != 0)
1726 {
1727 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1728 if (c == '#')
1729 {
1730 while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
1731 if (*ptr != 0)
1732 {
1733 ptr += cd->nllen - 1;
1734 continue;
1735 }
1736 /* Else fall through to handle end of string */
1737 c = 0;
1738 }
1739 }
1740
1741 /* No auto callout for quantifiers. */
1742
1743 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1744 {
1745 previous_callout = code;
1746 code = auto_callout(code, ptr, cd);
1747 }
1748
1749 switch(c)
1750 {
1751 /* The branch terminates at end of string, |, or ). */
1752
1753 case 0:
1754 case '|':
1755 case ')':
1756 *firstbyteptr = firstbyte;
1757 *reqbyteptr = reqbyte;
1758 *codeptr = code;
1759 *ptrptr = ptr;
1760 return TRUE;
1761
1762 /* Handle single-character metacharacters. In multiline mode, ^ disables
1763 the setting of any following char as a first character. */
1764
1765 case '^':
1766 if ((options & PCRE_MULTILINE) != 0)
1767 {
1768 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1769 }
1770 previous = NULL;
1771 *code++ = OP_CIRC;
1772 break;
1773
1774 case '$':
1775 previous = NULL;
1776 *code++ = OP_DOLL;
1777 break;
1778
1779 /* There can never be a first char if '.' is first, whatever happens about
1780 repeats. The value of reqbyte doesn't change either. */
1781
1782 case '.':
1783 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1784 zerofirstbyte = firstbyte;
1785 zeroreqbyte = reqbyte;
1786 previous = code;
1787 *code++ = OP_ANY;
1788 break;
1789
1790 /* Character classes. If the included characters are all < 256, we build a
1791 32-byte bitmap of the permitted characters, except in the special case
1792 where there is only one such character. For negated classes, we build the
1793 map as usual, then invert it at the end. However, we use a different opcode
1794 so that data characters > 255 can be handled correctly.
1795
1796 If the class contains characters outside the 0-255 range, a different
1797 opcode is compiled. It may optionally have a bit map for characters < 256,
1798 but those above are are explicitly listed afterwards. A flag byte tells
1799 whether the bitmap is present, and whether this is a negated class or not.
1800 */
1801
1802 case '[':
1803 previous = code;
1804
1805 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1806 they are encountered at the top level, so we'll do that too. */
1807
1808 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1809 check_posix_syntax(ptr, &tempptr, cd))
1810 {
1811 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1812 goto FAILED;
1813 }
1814
1815 /* If the first character is '^', set the negation flag and skip it. */
1816
1817 if ((c = *(++ptr)) == '^')
1818 {
1819 negate_class = TRUE;
1820 c = *(++ptr);
1821 }
1822 else
1823 {
1824 negate_class = FALSE;
1825 }
1826
1827 /* Keep a count of chars with values < 256 so that we can optimize the case
1828 of just a single character (as long as it's < 256). For higher valued UTF-8
1829 characters, we don't yet do any optimization. */
1830
1831 class_charcount = 0;
1832 class_lastchar = -1;
1833
1834 #ifdef SUPPORT_UTF8
1835 class_utf8 = FALSE; /* No chars >= 256 */
1836 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1837 #endif
1838
1839 /* Initialize the 32-char bit map to all zeros. We have to build the
1840 map in a temporary bit of store, in case the class contains only 1
1841 character (< 256), because in that case the compiled code doesn't use the
1842 bit map. */
1843
1844 memset(classbits, 0, 32 * sizeof(uschar));
1845
1846 /* Process characters until ] is reached. By writing this as a "do" it
1847 means that an initial ] is taken as a data character. The first pass
1848 through the regex checked the overall syntax, so we don't need to be very
1849 strict here. At the start of the loop, c contains the first byte of the
1850 character. */
1851
1852 do
1853 {
1854 #ifdef SUPPORT_UTF8
1855 if (utf8 && c > 127)
1856 { /* Braces are required because the */
1857 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1858 }
1859 #endif
1860
1861 /* Inside \Q...\E everything is literal except \E */
1862
1863 if (inescq)
1864 {
1865 if (c == '\\' && ptr[1] == 'E')
1866 {
1867 inescq = FALSE;
1868 ptr++;
1869 continue;
1870 }
1871 else goto LONE_SINGLE_CHARACTER;
1872 }
1873
1874 /* Handle POSIX class names. Perl allows a negation extension of the
1875 form [:^name:]. A square bracket that doesn't match the syntax is
1876 treated as a literal. We also recognize the POSIX constructions
1877 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1878 5.6 and 5.8 do. */
1879
1880 if (c == '[' &&
1881 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1882 check_posix_syntax(ptr, &tempptr, cd))
1883 {
1884 BOOL local_negate = FALSE;
1885 int posix_class, taboffset, tabopt;
1886 register const uschar *cbits = cd->cbits;
1887 uschar pbits[32];
1888
1889 if (ptr[1] != ':')
1890 {
1891 *errorcodeptr = ERR31;
1892 goto FAILED;
1893 }
1894
1895 ptr += 2;
1896 if (*ptr == '^')
1897 {
1898 local_negate = TRUE;
1899 ptr++;
1900 }
1901
1902 posix_class = check_posix_name(ptr, tempptr - ptr);
1903 if (posix_class < 0)
1904 {
1905 *errorcodeptr = ERR30;
1906 goto FAILED;
1907 }
1908
1909 /* If matching is caseless, upper and lower are converted to
1910 alpha. This relies on the fact that the class table starts with
1911 alpha, lower, upper as the first 3 entries. */
1912
1913 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1914 posix_class = 0;
1915
1916 /* We build the bit map for the POSIX class in a chunk of local store
1917 because we may be adding and subtracting from it, and we don't want to
1918 subtract bits that may be in the main map already. At the end we or the
1919 result into the bit map that is being built. */
1920
1921 posix_class *= 3;
1922
1923 /* Copy in the first table (always present) */
1924
1925 memcpy(pbits, cbits + posix_class_maps[posix_class],
1926 32 * sizeof(uschar));
1927
1928 /* If there is a second table, add or remove it as required. */
1929
1930 taboffset = posix_class_maps[posix_class + 1];
1931 tabopt = posix_class_maps[posix_class + 2];
1932
1933 if (taboffset >= 0)
1934 {
1935 if (tabopt >= 0)
1936 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
1937 else
1938 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
1939 }
1940
1941 /* Not see if we need to remove any special characters. An option
1942 value of 1 removes vertical space and 2 removes underscore. */
1943
1944 if (tabopt < 0) tabopt = -tabopt;
1945 if (tabopt == 1) pbits[1] &= ~0x3c;
1946 else if (tabopt == 2) pbits[11] &= 0x7f;
1947
1948 /* Add the POSIX table or its complement into the main table that is
1949 being built and we are done. */
1950
1951 if (local_negate)
1952 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
1953 else
1954 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
1955
1956 ptr = tempptr + 1;
1957 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1958 continue; /* End of POSIX syntax handling */
1959 }
1960
1961 /* Backslash may introduce a single character, or it may introduce one
1962 of the specials, which just set a flag. Escaped items are checked for
1963 validity in the pre-compiling pass. The sequence \b is a special case.
1964 Inside a class (and only there) it is treated as backspace. Elsewhere
1965 it marks a word boundary. Other escapes have preset maps ready to
1966 or into the one we are building. We assume they have more than one
1967 character in them, so set class_charcount bigger than one. */
1968
1969 if (c == '\\')
1970 {
1971 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1972
1973 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1974 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1975 else if (-c == ESC_Q) /* Handle start of quoted string */
1976 {
1977 if (ptr[1] == '\\' && ptr[2] == 'E')
1978 {
1979 ptr += 2; /* avoid empty string */
1980 }
1981 else inescq = TRUE;
1982 continue;
1983 }
1984
1985 if (c < 0)
1986 {
1987 register const uschar *cbits = cd->cbits;
1988 class_charcount += 2; /* Greater than 1 is what matters */
1989 switch (-c)
1990 {
1991 case ESC_d:
1992 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1993 continue;
1994
1995 case ESC_D:
1996 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1997 continue;
1998
1999 case ESC_w:
2000 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2001 continue;
2002
2003 case ESC_W:
2004 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2005 continue;
2006
2007 case ESC_s:
2008 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2009 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2010 continue;
2011
2012 case ESC_S:
2013 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2014 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2015 continue;
2016
2017 #ifdef SUPPORT_UCP
2018 case ESC_p:
2019 case ESC_P:
2020 {
2021 BOOL negated;
2022 int pdata;
2023 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2024 if (ptype < 0) goto FAILED;
2025 class_utf8 = TRUE;
2026 *class_utf8data++ = ((-c == ESC_p) != negated)?
2027 XCL_PROP : XCL_NOTPROP;
2028 *class_utf8data++ = ptype;
2029 *class_utf8data++ = pdata;
2030 class_charcount -= 2; /* Not a < 256 character */
2031 }
2032 continue;
2033 #endif
2034
2035 /* Unrecognized escapes are faulted if PCRE is running in its
2036 strict mode. By default, for compatibility with Perl, they are
2037 treated as literals. */
2038
2039 default:
2040 if ((options & PCRE_EXTRA) != 0)
2041 {
2042 *errorcodeptr = ERR7;
2043 goto FAILED;
2044 }
2045 c = *ptr; /* The final character */
2046 class_charcount -= 2; /* Undo the default count from above */
2047 }
2048 }
2049
2050 /* Fall through if we have a single character (c >= 0). This may be
2051 > 256 in UTF-8 mode. */
2052
2053 } /* End of backslash handling */
2054
2055 /* A single character may be followed by '-' to form a range. However,
2056 Perl does not permit ']' to be the end of the range. A '-' character
2057 here is treated as a literal. */
2058
2059 if (ptr[1] == '-' && ptr[2] != ']')
2060 {
2061 int d;
2062 ptr += 2;
2063
2064 #ifdef SUPPORT_UTF8
2065 if (utf8)
2066 { /* Braces are required because the */
2067 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2068 }
2069 else
2070 #endif
2071 d = *ptr; /* Not UTF-8 mode */
2072
2073 /* The second part of a range can be a single-character escape, but
2074 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2075 in such circumstances. */
2076
2077 if (d == '\\')
2078 {
2079 const uschar *oldptr = ptr;
2080 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2081
2082 /* \b is backslash; \X is literal X; any other special means the '-'
2083 was literal */
2084
2085 if (d < 0)
2086 {
2087 if (d == -ESC_b) d = '\b';
2088 else if (d == -ESC_X) d = 'X'; else
2089 {
2090 ptr = oldptr - 2;
2091 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2092 }
2093 }
2094 }
2095
2096 /* The check that the two values are in the correct order happens in
2097 the pre-pass. Optimize one-character ranges */
2098
2099 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2100
2101 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2102 matching, we have to use an XCLASS with extra data items. Caseless
2103 matching for characters > 127 is available only if UCP support is
2104 available. */
2105
2106 #ifdef SUPPORT_UTF8
2107 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2108 {
2109 class_utf8 = TRUE;
2110
2111 /* With UCP support, we can find the other case equivalents of
2112 the relevant characters. There may be several ranges. Optimize how
2113 they fit with the basic range. */
2114
2115 #ifdef SUPPORT_UCP
2116 if ((options & PCRE_CASELESS) != 0)
2117 {
2118 int occ, ocd;
2119 int cc = c;
2120 int origd = d;
2121 while (get_othercase_range(&cc, origd, &occ, &ocd))
2122 {
2123 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2124
2125 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2126 { /* if there is overlap, */
2127 c = occ; /* noting that if occ < c */
2128 continue; /* we can't have ocd > d */
2129 } /* because a subrange is */
2130 if (ocd > d && occ <= d + 1) /* always shorter than */
2131 { /* the basic range. */
2132 d = ocd;
2133 continue;
2134 }
2135
2136 if (occ == ocd)
2137 {
2138 *class_utf8data++ = XCL_SINGLE;
2139 }
2140 else
2141 {
2142 *class_utf8data++ = XCL_RANGE;
2143 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2144 }
2145 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2146 }
2147 }
2148 #endif /* SUPPORT_UCP */
2149
2150 /* Now record the original range, possibly modified for UCP caseless
2151 overlapping ranges. */
2152
2153 *class_utf8data++ = XCL_RANGE;
2154 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2155 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2156
2157 /* With UCP support, we are done. Without UCP support, there is no
2158 caseless matching for UTF-8 characters > 127; we can use the bit map
2159 for the smaller ones. */
2160
2161 #ifdef SUPPORT_UCP
2162 continue; /* With next character in the class */
2163 #else
2164 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2165
2166 /* Adjust upper limit and fall through to set up the map */
2167
2168 d = 127;
2169
2170 #endif /* SUPPORT_UCP */
2171 }
2172 #endif /* SUPPORT_UTF8 */
2173
2174 /* We use the bit map for all cases when not in UTF-8 mode; else
2175 ranges that lie entirely within 0-127 when there is UCP support; else
2176 for partial ranges without UCP support. */
2177
2178 for (; c <= d; c++)
2179 {
2180 classbits[c/8] |= (1 << (c&7));
2181 if ((options & PCRE_CASELESS) != 0)
2182 {
2183 int uc = cd->fcc[c]; /* flip case */
2184 classbits[uc/8] |= (1 << (uc&7));
2185 }
2186 class_charcount++; /* in case a one-char range */
2187 class_lastchar = c;
2188 }
2189
2190 continue; /* Go get the next char in the class */
2191 }
2192
2193 /* Handle a lone single character - we can get here for a normal
2194 non-escape char, or after \ that introduces a single character or for an
2195 apparent range that isn't. */
2196
2197 LONE_SINGLE_CHARACTER:
2198
2199 /* Handle a character that cannot go in the bit map */
2200
2201 #ifdef SUPPORT_UTF8
2202 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2203 {
2204 class_utf8 = TRUE;
2205 *class_utf8data++ = XCL_SINGLE;
2206 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2207
2208 #ifdef SUPPORT_UCP
2209 if ((options & PCRE_CASELESS) != 0)
2210 {
2211 int othercase;
2212 if ((othercase = _pcre_ucp_othercase(c)) >= 0)
2213 {
2214 *class_utf8data++ = XCL_SINGLE;
2215 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2216 }
2217 }
2218 #endif /* SUPPORT_UCP */
2219
2220 }
2221 else
2222 #endif /* SUPPORT_UTF8 */
2223
2224 /* Handle a single-byte character */
2225 {
2226 classbits[c/8] |= (1 << (c&7));
2227 if ((options & PCRE_CASELESS) != 0)
2228 {
2229 c = cd->fcc[c]; /* flip case */
2230 classbits[c/8] |= (1 << (c&7));
2231 }
2232 class_charcount++;
2233 class_lastchar = c;
2234 }
2235 }
2236
2237 /* Loop until ']' reached; the check for end of string happens inside the
2238 loop. This "while" is the end of the "do" above. */
2239
2240 while ((c = *(++ptr)) != ']' || inescq);
2241
2242 /* If class_charcount is 1, we saw precisely one character whose value is
2243 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2244 can optimize the negative case only if there were no characters >= 128
2245 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2246 single-bytes only. This is an historical hangover. Maybe one day we can
2247 tidy these opcodes to handle multi-byte characters.
2248
2249 The optimization throws away the bit map. We turn the item into a
2250 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2251 that OP_NOT does not support multibyte characters. In the positive case, it
2252 can cause firstbyte to be set. Otherwise, there can be no first char if
2253 this item is first, whatever repeat count may follow. In the case of
2254 reqbyte, save the previous value for reinstating. */
2255
2256 #ifdef SUPPORT_UTF8
2257 if (class_charcount == 1 &&
2258 (!utf8 ||
2259 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2260
2261 #else
2262 if (class_charcount == 1)
2263 #endif
2264 {
2265 zeroreqbyte = reqbyte;
2266
2267 /* The OP_NOT opcode works on one-byte characters only. */
2268
2269 if (negate_class)
2270 {
2271 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2272 zerofirstbyte = firstbyte;
2273 *code++ = OP_NOT;
2274 *code++ = class_lastchar;
2275 break;
2276 }
2277
2278 /* For a single, positive character, get the value into mcbuffer, and
2279 then we can handle this with the normal one-character code. */
2280
2281 #ifdef SUPPORT_UTF8
2282 if (utf8 && class_lastchar > 127)
2283 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2284 else
2285 #endif
2286 {
2287 mcbuffer[0] = class_lastchar;
2288 mclength = 1;
2289 }
2290 goto ONE_CHAR;
2291 } /* End of 1-char optimization */
2292
2293 /* The general case - not the one-char optimization. If this is the first
2294 thing in the branch, there can be no first char setting, whatever the
2295 repeat count. Any reqbyte setting must remain unchanged after any kind of
2296 repeat. */
2297
2298 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2299 zerofirstbyte = firstbyte;
2300 zeroreqbyte = reqbyte;
2301
2302 /* If there are characters with values > 255, we have to compile an
2303 extended class, with its own opcode. If there are no characters < 256,
2304 we can omit the bitmap. */
2305
2306 #ifdef SUPPORT_UTF8
2307 if (class_utf8)
2308 {
2309 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2310 *code++ = OP_XCLASS;
2311 code += LINK_SIZE;
2312 *code = negate_class? XCL_NOT : 0;
2313
2314 /* If the map is required, install it, and move on to the end of
2315 the extra data */
2316
2317 if (class_charcount > 0)
2318 {
2319 *code++ |= XCL_MAP;
2320 memcpy(code, classbits, 32);
2321 code = class_utf8data;
2322 }
2323
2324 /* If the map is not required, slide down the extra data. */
2325
2326 else
2327 {
2328 int len = class_utf8data - (code + 33);
2329 memmove(code + 1, code + 33, len);
2330 code += len + 1;
2331 }
2332
2333 /* Now fill in the complete length of the item */
2334
2335 PUT(previous, 1, code - previous);
2336 break; /* End of class handling */
2337 }
2338 #endif
2339
2340 /* If there are no characters > 255, negate the 32-byte map if necessary,
2341 and copy it into the code vector. If this is the first thing in the branch,
2342 there can be no first char setting, whatever the repeat count. Any reqbyte
2343 setting must remain unchanged after any kind of repeat. */
2344
2345 if (negate_class)
2346 {
2347 *code++ = OP_NCLASS;
2348 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2349 }
2350 else
2351 {
2352 *code++ = OP_CLASS;
2353 memcpy(code, classbits, 32);
2354 }
2355 code += 32;
2356 break;
2357
2358 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2359 has been tested above. */
2360
2361 case '{':
2362 if (!is_quantifier) goto NORMAL_CHAR;
2363 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2364 if (*errorcodeptr != 0) goto FAILED;
2365 goto REPEAT;
2366
2367 case '*':
2368 repeat_min = 0;
2369 repeat_max = -1;
2370 goto REPEAT;
2371
2372 case '+':
2373 repeat_min = 1;
2374 repeat_max = -1;
2375 goto REPEAT;
2376
2377 case '?':
2378 repeat_min = 0;
2379 repeat_max = 1;
2380
2381 REPEAT:
2382 if (previous == NULL)
2383 {
2384 *errorcodeptr = ERR9;
2385 goto FAILED;
2386 }
2387
2388 if (repeat_min == 0)
2389 {
2390 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2391 reqbyte = zeroreqbyte; /* Ditto */
2392 }
2393
2394 /* Remember whether this is a variable length repeat */
2395
2396 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2397
2398 op_type = 0; /* Default single-char op codes */
2399 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2400
2401 /* Save start of previous item, in case we have to move it up to make space
2402 for an inserted OP_ONCE for the additional '+' extension. */
2403
2404 tempcode = previous;
2405
2406 /* If the next character is '+', we have a possessive quantifier. This
2407 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2408 If the next character is '?' this is a minimizing repeat, by default,
2409 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2410 repeat type to the non-default. */
2411
2412 if (ptr[1] == '+')
2413 {
2414 repeat_type = 0; /* Force greedy */
2415 possessive_quantifier = TRUE;
2416 ptr++;
2417 }
2418 else if (ptr[1] == '?')
2419 {
2420 repeat_type = greedy_non_default;
2421 ptr++;
2422 }
2423 else repeat_type = greedy_default;
2424
2425 /* If previous was a recursion, we need to wrap it inside brackets so that
2426 it can be replicated if necessary. */
2427
2428 if (*previous == OP_RECURSE)
2429 {
2430 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2431 code += 1 + LINK_SIZE;
2432 *previous = OP_BRA;
2433 PUT(previous, 1, code - previous);
2434 *code = OP_KET;
2435 PUT(code, 1, code - previous);
2436 code += 1 + LINK_SIZE;
2437 }
2438
2439 /* If previous was a character match, abolish the item and generate a
2440 repeat item instead. If a char item has a minumum of more than one, ensure
2441 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2442 the first thing in a branch because the x will have gone into firstbyte
2443 instead. */
2444
2445 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2446 {
2447 /* Deal with UTF-8 characters that take up more than one byte. It's
2448 easier to write this out separately than try to macrify it. Use c to
2449 hold the length of the character in bytes, plus 0x80 to flag that it's a
2450 length rather than a small character. */
2451
2452 #ifdef SUPPORT_UTF8
2453 if (utf8 && (code[-1] & 0x80) != 0)
2454 {
2455 uschar *lastchar = code - 1;
2456 while((*lastchar & 0xc0) == 0x80) lastchar--;
2457 c = code - lastchar; /* Length of UTF-8 character */
2458 memcpy(utf8_char, lastchar, c); /* Save the char */
2459 c |= 0x80; /* Flag c as a length */
2460 }
2461 else
2462 #endif
2463
2464 /* Handle the case of a single byte - either with no UTF8 support, or
2465 with UTF-8 disabled, or for a UTF-8 character < 128. */
2466
2467 {
2468 c = code[-1];
2469 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2470 }
2471
2472 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2473 }
2474
2475 /* If previous was a single negated character ([^a] or similar), we use
2476 one of the special opcodes, replacing it. The code is shared with single-
2477 character repeats by setting opt_type to add a suitable offset into
2478 repeat_type. OP_NOT is currently used only for single-byte chars. */
2479
2480 else if (*previous == OP_NOT)
2481 {
2482 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2483 c = previous[1];
2484 goto OUTPUT_SINGLE_REPEAT;
2485 }
2486
2487 /* If previous was a character type match (\d or similar), abolish it and
2488 create a suitable repeat item. The code is shared with single-character
2489 repeats by setting op_type to add a suitable offset into repeat_type. Note
2490 the the Unicode property types will be present only when SUPPORT_UCP is
2491 defined, but we don't wrap the little bits of code here because it just
2492 makes it horribly messy. */
2493
2494 else if (*previous < OP_EODN)
2495 {
2496 uschar *oldcode;
2497 int prop_type, prop_value;
2498 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2499 c = *previous;
2500
2501 OUTPUT_SINGLE_REPEAT:
2502 if (*previous == OP_PROP || *previous == OP_NOTPROP)
2503 {
2504 prop_type = previous[1];
2505 prop_value = previous[2];
2506 }
2507 else prop_type = prop_value = -1;
2508
2509 oldcode = code;
2510 code = previous; /* Usually overwrite previous item */
2511
2512 /* If the maximum is zero then the minimum must also be zero; Perl allows
2513 this case, so we do too - by simply omitting the item altogether. */
2514
2515 if (repeat_max == 0) goto END_REPEAT;
2516
2517 /* All real repeats make it impossible to handle partial matching (maybe
2518 one day we will be able to remove this restriction). */
2519
2520 if (repeat_max != 1) cd->nopartial = TRUE;
2521
2522 /* Combine the op_type with the repeat_type */
2523
2524 repeat_type += op_type;
2525
2526 /* A minimum of zero is handled either as the special case * or ?, or as
2527 an UPTO, with the maximum given. */
2528
2529 if (repeat_min == 0)
2530 {
2531 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2532 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2533 else
2534 {
2535 *code++ = OP_UPTO + repeat_type;
2536 PUT2INC(code, 0, repeat_max);
2537 }
2538 }
2539
2540 /* A repeat minimum of 1 is optimized into some special cases. If the
2541 maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
2542 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2543 one less than the maximum. */
2544
2545 else if (repeat_min == 1)
2546 {
2547 if (repeat_max == -1)
2548 *code++ = OP_PLUS + repeat_type;
2549 else
2550 {
2551 code = oldcode; /* leave previous item in place */
2552 if (repeat_max == 1) goto END_REPEAT;
2553 *code++ = OP_UPTO + repeat_type;
2554 PUT2INC(code, 0, repeat_max - 1);
2555 }
2556 }
2557
2558 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2559 handled as an EXACT followed by an UPTO. */
2560
2561 else
2562 {
2563 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2564 PUT2INC(code, 0, repeat_min);
2565
2566 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2567 we have to insert the character for the previous code. For a repeated
2568 Unicode property match, there are two extra bytes that define the
2569 required property. In UTF-8 mode, long characters have their length in
2570 c, with the 0x80 bit as a flag. */
2571
2572 if (repeat_max < 0)
2573 {
2574 #ifdef SUPPORT_UTF8
2575 if (utf8 && c >= 128)
2576 {
2577 memcpy(code, utf8_char, c & 7);
2578 code += c & 7;
2579 }
2580 else
2581 #endif
2582 {
2583 *code++ = c;
2584 if (prop_type >= 0)
2585 {
2586 *code++ = prop_type;
2587 *code++ = prop_value;
2588 }
2589 }
2590 *code++ = OP_STAR + repeat_type;
2591 }
2592
2593 /* Else insert an UPTO if the max is greater than the min, again
2594 preceded by the character, for the previously inserted code. */
2595
2596 else if (repeat_max != repeat_min)
2597 {
2598 #ifdef SUPPORT_UTF8
2599 if (utf8 && c >= 128)
2600 {
2601 memcpy(code, utf8_char, c & 7);
2602 code += c & 7;
2603 }
2604 else
2605 #endif
2606 *code++ = c;
2607 if (prop_type >= 0)
2608 {
2609 *code++ = prop_type;
2610 *code++ = prop_value;
2611 }
2612 repeat_max -= repeat_min;
2613 *code++ = OP_UPTO + repeat_type;
2614 PUT2INC(code, 0, repeat_max);
2615 }
2616 }
2617
2618 /* The character or character type itself comes last in all cases. */
2619
2620 #ifdef SUPPORT_UTF8
2621 if (utf8 && c >= 128)
2622 {
2623 memcpy(code, utf8_char, c & 7);
2624 code += c & 7;
2625 }
2626 else
2627 #endif
2628 *code++ = c;
2629
2630 /* For a repeated Unicode property match, there are two extra bytes that
2631 define the required property. */
2632
2633 #ifdef SUPPORT_UCP
2634 if (prop_type >= 0)
2635 {
2636 *code++ = prop_type;
2637 *code++ = prop_value;
2638 }
2639 #endif
2640 }
2641
2642 /* If previous was a character class or a back reference, we put the repeat
2643 stuff after it, but just skip the item if the repeat was {0,0}. */
2644
2645 else if (*previous == OP_CLASS ||
2646 *previous == OP_NCLASS ||
2647 #ifdef SUPPORT_UTF8
2648 *previous == OP_XCLASS ||
2649 #endif
2650 *previous == OP_REF)
2651 {
2652 if (repeat_max == 0)
2653 {
2654 code = previous;
2655 goto END_REPEAT;
2656 }
2657
2658 /* All real repeats make it impossible to handle partial matching (maybe
2659 one day we will be able to remove this restriction). */
2660
2661 if (repeat_max != 1) cd->nopartial = TRUE;
2662
2663 if (repeat_min == 0 && repeat_max == -1)
2664 *code++ = OP_CRSTAR + repeat_type;
2665 else if (repeat_min == 1 && repeat_max == -1)
2666 *code++ = OP_CRPLUS + repeat_type;
2667 else if (repeat_min == 0 && repeat_max == 1)
2668 *code++ = OP_CRQUERY + repeat_type;
2669 else
2670 {
2671 *code++ = OP_CRRANGE + repeat_type;
2672 PUT2INC(code, 0, repeat_min);
2673 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2674 PUT2INC(code, 0, repeat_max);
2675 }
2676 }
2677
2678 /* If previous was a bracket group, we may have to replicate it in certain
2679 cases. */
2680
2681 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2682 *previous == OP_COND)
2683 {
2684 register int i;
2685 int ketoffset = 0;
2686 int len = code - previous;
2687 uschar *bralink = NULL;
2688
2689 /* If the maximum repeat count is unlimited, find the end of the bracket
2690 by scanning through from the start, and compute the offset back to it
2691 from the current code pointer. There may be an OP_OPT setting following
2692 the final KET, so we can't find the end just by going back from the code
2693 pointer. */
2694
2695 if (repeat_max == -1)
2696 {
2697 register uschar *ket = previous;
2698 do ket += GET(ket, 1); while (*ket != OP_KET);
2699 ketoffset = code - ket;
2700 }
2701
2702 /* The case of a zero minimum is special because of the need to stick
2703 OP_BRAZERO in front of it, and because the group appears once in the
2704 data, whereas in other cases it appears the minimum number of times. For
2705 this reason, it is simplest to treat this case separately, as otherwise
2706 the code gets far too messy. There are several special subcases when the
2707 minimum is zero. */
2708
2709 if (repeat_min == 0)
2710 {
2711 /* If the maximum is also zero, we just omit the group from the output
2712 altogether. */
2713
2714 if (repeat_max == 0)
2715 {
2716 code = previous;
2717 goto END_REPEAT;
2718 }
2719
2720 /* If the maximum is 1 or unlimited, we just have to stick in the
2721 BRAZERO and do no more at this point. However, we do need to adjust
2722 any OP_RECURSE calls inside the group that refer to the group itself or
2723 any internal group, because the offset is from the start of the whole
2724 regex. Temporarily terminate the pattern while doing this. */
2725
2726 if (repeat_max <= 1)
2727 {
2728 *code = OP_END;
2729 adjust_recurse(previous, 1, utf8, cd);
2730 memmove(previous+1, previous, len);
2731 code++;
2732 *previous++ = OP_BRAZERO + repeat_type;
2733 }
2734
2735 /* If the maximum is greater than 1 and limited, we have to replicate
2736 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2737 The first one has to be handled carefully because it's the original
2738 copy, which has to be moved up. The remainder can be handled by code
2739 that is common with the non-zero minimum case below. We have to
2740 adjust the value or repeat_max, since one less copy is required. Once
2741 again, we may have to adjust any OP_RECURSE calls inside the group. */
2742
2743 else
2744 {
2745 int offset;
2746 *code = OP_END;
2747 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2748 memmove(previous + 2 + LINK_SIZE, previous, len);
2749 code += 2 + LINK_SIZE;
2750 *previous++ = OP_BRAZERO + repeat_type;
2751 *previous++ = OP_BRA;
2752
2753 /* We chain together the bracket offset fields that have to be
2754 filled in later when the ends of the brackets are reached. */
2755
2756 offset = (bralink == NULL)? 0 : previous - bralink;
2757 bralink = previous;
2758 PUTINC(previous, 0, offset);
2759 }
2760
2761 repeat_max--;
2762 }
2763
2764 /* If the minimum is greater than zero, replicate the group as many
2765 times as necessary, and adjust the maximum to the number of subsequent
2766 copies that we need. If we set a first char from the group, and didn't
2767 set a required char, copy the latter from the former. */
2768
2769 else
2770 {
2771 if (repeat_min > 1)
2772 {
2773 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2774 for (i = 1; i < repeat_min; i++)
2775 {
2776 memcpy(code, previous, len);
2777 code += len;
2778 }
2779 }
2780 if (repeat_max > 0) repeat_max -= repeat_min;
2781 }
2782
2783 /* This code is common to both the zero and non-zero minimum cases. If
2784 the maximum is limited, it replicates the group in a nested fashion,
2785 remembering the bracket starts on a stack. In the case of a zero minimum,
2786 the first one was set up above. In all cases the repeat_max now specifies
2787 the number of additional copies needed. */
2788
2789 if (repeat_max >= 0)
2790 {
2791 for (i = repeat_max - 1; i >= 0; i--)
2792 {
2793 *code++ = OP_BRAZERO + repeat_type;
2794
2795 /* All but the final copy start a new nesting, maintaining the
2796 chain of brackets outstanding. */
2797
2798 if (i != 0)
2799 {
2800 int offset;
2801 *code++ = OP_BRA;
2802 offset = (bralink == NULL)? 0 : code - bralink;
2803 bralink = code;
2804 PUTINC(code, 0, offset);
2805 }
2806
2807 memcpy(code, previous, len);
2808 code += len;
2809 }
2810
2811 /* Now chain through the pending brackets, and fill in their length
2812 fields (which are holding the chain links pro tem). */
2813
2814 while (bralink != NULL)
2815 {
2816 int oldlinkoffset;
2817 int offset = code - bralink + 1;
2818 uschar *bra = code - offset;
2819 oldlinkoffset = GET(bra, 1);
2820 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2821 *code++ = OP_KET;
2822 PUTINC(code, 0, offset);
2823 PUT(bra, 1, offset);
2824 }
2825 }
2826
2827 /* If the maximum is unlimited, set a repeater in the final copy. We
2828 can't just offset backwards from the current code point, because we
2829 don't know if there's been an options resetting after the ket. The
2830 correct offset was computed above. */
2831
2832 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2833 }
2834
2835 /* Else there's some kind of shambles */
2836
2837 else
2838 {
2839 *errorcodeptr = ERR11;
2840 goto FAILED;
2841 }
2842
2843 /* If the character following a repeat is '+', we wrap the entire repeated
2844 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2845 Sun's Java package. The repeated item starts at tempcode, not at previous,
2846 which might be the first part of a string whose (former) last char we
2847 repeated. However, we don't support '+' after a greediness '?'. */
2848
2849 if (possessive_quantifier)
2850 {
2851 int len = code - tempcode;
2852 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2853 code += 1 + LINK_SIZE;
2854 len += 1 + LINK_SIZE;
2855 tempcode[0] = OP_ONCE;
2856 *code++ = OP_KET;
2857 PUTINC(code, 0, len);
2858 PUT(tempcode, 1, len);
2859 }
2860
2861 /* In all case we no longer have a previous item. We also set the
2862 "follows varying string" flag for subsequently encountered reqbytes if
2863 it isn't already set and we have just passed a varying length item. */
2864
2865 END_REPEAT:
2866 previous = NULL;
2867 cd->req_varyopt |= reqvary;
2868 break;
2869
2870
2871 /* Start of nested bracket sub-expression, or comment or lookahead or
2872 lookbehind or option setting or condition. First deal with special things
2873 that can come after a bracket; all are introduced by ?, and the appearance
2874 of any of them means that this is not a referencing group. They were
2875 checked for validity in the first pass over the string, so we don't have to
2876 check for syntax errors here. */
2877
2878 case '(':
2879 newoptions = options;
2880 skipbytes = 0;
2881
2882 if (*(++ptr) == '?')
2883 {
2884 int set, unset;
2885 int *optset;
2886
2887 switch (*(++ptr))
2888 {
2889 case '#': /* Comment; skip to ket */
2890 ptr++;
2891 while (*ptr != ')') ptr++;
2892 continue;
2893
2894 case ':': /* Non-extracting bracket */
2895 bravalue = OP_BRA;
2896 ptr++;
2897 break;
2898
2899 case '(':
2900 bravalue = OP_COND; /* Conditional group */
2901
2902 /* A condition can be a number, referring to a numbered group, a name,
2903 referring to a named group, 'R', referring to recursion, or an
2904 assertion. There are two unfortunate ambiguities, caused by history.
2905 (a) 'R' can be the recursive thing or the name 'R', and (b) a number
2906 could be a name that consists of digits. In both cases, we look for a
2907 name first; if not found, we try the other cases. If the first
2908 character after (?( is a word character, we know the rest up to ) will
2909 also be word characters because the syntax was checked in the first
2910 pass. */
2911
2912 if ((cd->ctypes[ptr[1]] & ctype_word) != 0)
2913 {
2914 int i, namelen;
2915 int condref = 0;
2916 const uschar *name;
2917 uschar *slot = cd->name_table;
2918
2919 /* This is needed for all successful cases. */
2920
2921 skipbytes = 3;
2922
2923 /* Read the name, but also get it as a number if it's all digits */
2924
2925 name = ++ptr;
2926 while (*ptr != ')')
2927 {
2928 if (condref >= 0)
2929 condref = ((digitab[*ptr] & ctype_digit) != 0)?
2930 condref * 10 + *ptr - '0' : -1;
2931 ptr++;
2932 }
2933 namelen = ptr - name;
2934 ptr++;
2935
2936 for (i = 0; i < cd->names_found; i++)
2937 {
2938 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2939 slot += cd->name_entry_size;
2940 }
2941
2942 /* Found a previous named subpattern */
2943
2944 if (i < cd->names_found)
2945 {
2946 condref = GET2(slot, 0);
2947 code[1+LINK_SIZE] = OP_CREF;
2948 PUT2(code, 2+LINK_SIZE, condref);
2949 }
2950
2951 /* Search the pattern for a forward reference */
2952
2953 else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)
2954 {
2955 code[1+LINK_SIZE] = OP_CREF;
2956 PUT2(code, 2+LINK_SIZE, i);
2957 }
2958
2959 /* Check for 'R' for recursion */
2960
2961 else if (namelen == 1 && *name == 'R')
2962 {
2963 code[1+LINK_SIZE] = OP_CREF;
2964 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2965 }
2966
2967 /* Check for a subpattern number */
2968
2969 else if (condref > 0)
2970 {
2971 code[1+LINK_SIZE] = OP_CREF;
2972 PUT2(code, 2+LINK_SIZE, condref);
2973 }
2974
2975 /* Either an unidentified subpattern, or a reference to (?(0) */
2976
2977 else
2978 {
2979 *errorcodeptr = (condref == 0)? ERR35: ERR15;
2980 goto FAILED;
2981 }
2982 }
2983
2984 /* For conditions that are assertions, we just fall through, having
2985 set bravalue above. */
2986
2987 break;
2988
2989 case '=': /* Positive lookahead */
2990 bravalue = OP_ASSERT;
2991 ptr++;
2992 break;
2993
2994 case '!': /* Negative lookahead */
2995 bravalue = OP_ASSERT_NOT;
2996 ptr++;
2997 break;
2998
2999 case '<': /* Lookbehinds */
3000 switch (*(++ptr))
3001 {
3002 case '=': /* Positive lookbehind */
3003 bravalue = OP_ASSERTBACK;
3004 ptr++;
3005 break;
3006
3007 case '!': /* Negative lookbehind */
3008 bravalue = OP_ASSERTBACK_NOT;
3009 ptr++;
3010 break;
3011 }
3012 break;
3013
3014 case '>': /* One-time brackets */
3015 bravalue = OP_ONCE;
3016 ptr++;
3017 break;
3018
3019 case 'C': /* Callout - may be followed by digits; */
3020 previous_callout = code; /* Save for later completion */
3021 after_manual_callout = 1; /* Skip one item before completing */
3022 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3023 { /* closing parenthesis is present. */
3024 int n = 0;
3025 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3026 n = n * 10 + *ptr - '0';
3027 if (n > 255)
3028 {
3029 *errorcodeptr = ERR38;
3030 goto FAILED;
3031 }
3032 *code++ = n;
3033 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3034 PUT(code, LINK_SIZE, 0); /* Default length */
3035 code += 2 * LINK_SIZE;
3036 }
3037 previous = NULL;
3038 continue;
3039
3040 case 'P': /* Named subpattern handling */
3041 if (*(++ptr) == '<') /* Definition */
3042 {
3043 int i, namelen;
3044 uschar *slot = cd->name_table;
3045 const uschar *name; /* Don't amalgamate; some compilers */
3046 name = ++ptr; /* grumble at autoincrement in declaration */
3047
3048 while (*ptr++ != '>');
3049 namelen = ptr - name - 1;
3050
3051 for (i = 0; i < cd->names_found; i++)
3052 {
3053 int crc = memcmp(name, slot+2, namelen);
3054 if (crc == 0)
3055 {
3056 if (slot[2+namelen] == 0)
3057 {
3058 if ((options & PCRE_DUPNAMES) == 0)
3059 {
3060 *errorcodeptr = ERR43;
3061 goto FAILED;
3062 }
3063 }
3064 else crc = -1; /* Current name is substring */
3065 }
3066 if (crc < 0)
3067 {
3068 memmove(slot + cd->name_entry_size, slot,
3069 (cd->names_found - i) * cd->name_entry_size);
3070 break;
3071 }
3072 slot += cd->name_entry_size;
3073 }
3074
3075 PUT2(slot, 0, *brackets + 1);
3076 memcpy(slot + 2, name, namelen);
3077 slot[2+namelen] = 0;
3078 cd->names_found++;
3079 goto NUMBERED_GROUP;
3080 }
3081
3082 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3083 {
3084 int i, namelen;
3085 int type = *ptr++;
3086 const uschar *name = ptr;
3087 uschar *slot = cd->name_table;
3088
3089 while (*ptr != ')') ptr++;
3090 namelen = ptr - name;
3091
3092 for (i = 0; i < cd->names_found; i++)
3093 {
3094 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3095 slot += cd->name_entry_size;
3096 }
3097
3098 if (i < cd->names_found) /* Back reference */
3099 {
3100 recno = GET2(slot, 0);
3101 }
3102 else if ((recno = /* Forward back reference */
3103 find_named_parens(ptr, *brackets, name, namelen)) <= 0)
3104 {
3105 *errorcodeptr = ERR15;
3106 goto FAILED;
3107 }
3108
3109 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3110
3111 /* Back reference */
3112
3113 previous = code;
3114 *code++ = OP_REF;
3115 PUT2INC(code, 0, recno);
3116 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3117 if (recno > cd->top_backref) cd->top_backref = recno;
3118 continue;
3119 }
3120
3121 /* Should never happen */
3122 break;
3123
3124 case 'R': /* Pattern recursion */
3125 ptr++; /* Same as (?0) */
3126 /* Fall through */
3127
3128 /* Recursion or "subroutine" call */
3129
3130 case '0': case '1': case '2': case '3': case '4':
3131 case '5': case '6': case '7': case '8': case '9':
3132 {
3133 const uschar *called;
3134 recno = 0;
3135 while((digitab[*ptr] & ctype_digit) != 0)
3136 recno = recno * 10 + *ptr++ - '0';
3137
3138 /* Come here from code above that handles a named recursion */
3139
3140 HANDLE_RECURSION:
3141
3142 previous = code;
3143
3144 /* Find the bracket that is being referenced. Temporarily end the
3145 regex in case it doesn't exist. */
3146
3147 *code = OP_END;
3148 called = (recno == 0)? cd->start_code :
3149 find_bracket(cd->start_code, utf8, recno);
3150 if (called == NULL)
3151 {
3152 *errorcodeptr = ERR15;
3153 goto FAILED;
3154 }
3155
3156 /* If the subpattern is still open, this is a recursive call. We
3157 check to see if this is a left recursion that could loop for ever,
3158 and diagnose that case. */
3159
3160 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3161 {
3162 *errorcodeptr = ERR40;
3163 goto FAILED;
3164 }
3165
3166 /* Insert the recursion/subroutine item, automatically wrapped inside
3167 "once" brackets. */
3168
3169 *code = OP_ONCE;
3170 PUT(code, 1, 2 + 2*LINK_SIZE);
3171 code += 1 + LINK_SIZE;
3172
3173 *code = OP_RECURSE;
3174 PUT(code, 1, called - cd->start_code);
3175 code += 1 + LINK_SIZE;
3176
3177 *code = OP_KET;
3178 PUT(code, 1, 2 + 2*LINK_SIZE);
3179 code += 1 + LINK_SIZE;
3180 }
3181 continue;
3182
3183 /* Character after (? not specially recognized */
3184
3185 default: /* Option setting */
3186 set = unset = 0;
3187 optset = &set;
3188
3189 while (*ptr != ')' && *ptr != ':')
3190 {
3191 switch (*ptr++)
3192 {
3193 case '-': optset = &unset; break;
3194
3195 case 'i': *optset |= PCRE_CASELESS; break;
3196 case 'J': *optset |= PCRE_DUPNAMES; break;
3197 case 'm': *optset |= PCRE_MULTILINE; break;
3198 case 's': *optset |= PCRE_DOTALL; break;
3199 case 'x': *optset |= PCRE_EXTENDED; break;
3200 case 'U': *optset |= PCRE_UNGREEDY; break;
3201 case 'X': *optset |= PCRE_EXTRA; break;
3202 }
3203 }
3204
3205 /* Set up the changed option bits, but don't change anything yet. */
3206
3207 newoptions = (options | set) & (~unset);
3208
3209 /* If the options ended with ')' this is not the start of a nested
3210 group with option changes, so the options change at this level. Compile
3211 code to change the ims options if this setting actually changes any of
3212 them. We also pass the new setting back so that it can be put at the
3213 start of any following branches, and when this group ends (if we are in
3214 a group), a resetting item can be compiled.
3215
3216 Note that if this item is right at the start of the pattern, the
3217 options will have been abstracted and made global, so there will be no
3218 change to compile. */
3219
3220 if (*ptr == ')')
3221 {
3222 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3223 {
3224 *code++ = OP_OPT;
3225 *code++ = newoptions & PCRE_IMS;
3226 }
3227
3228 /* Change options at this level, and pass them back for use
3229 in subsequent branches. Reset the greedy defaults and the case
3230 value for firstbyte and reqbyte. */
3231
3232 *optionsptr = options = newoptions;
3233 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3234 greedy_non_default = greedy_default ^ 1;
3235 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3236
3237 previous = NULL; /* This item can't be repeated */
3238 continue; /* It is complete */
3239 }
3240
3241 /* If the options ended with ':' we are heading into a nested group
3242 with possible change of options. Such groups are non-capturing and are
3243 not assertions of any kind. All we need to do is skip over the ':';
3244 the newoptions value is handled below. */
3245
3246 bravalue = OP_BRA;
3247 ptr++;
3248 }
3249 }
3250
3251 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3252 non-capturing and behave like (?:...) brackets */
3253
3254 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3255 {
3256 bravalue = OP_BRA;
3257 }
3258
3259 /* Else we have a referencing group; adjust the opcode. If the bracket
3260 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3261 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3262
3263 else
3264 {
3265 NUMBERED_GROUP:
3266 if (++(*brackets) > EXTRACT_BASIC_MAX)
3267 {
3268 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3269 code[1+LINK_SIZE] = OP_BRANUMBER;
3270 PUT2(code, 2+LINK_SIZE, *brackets);
3271 skipbytes = 3;
3272 }
3273 else bravalue = OP_BRA + *brackets;
3274 }
3275
3276 /* Process nested bracketed re. Assertions may not be repeated, but other
3277 kinds can be. We copy code into a non-register variable in order to be able
3278 to pass its address because some compilers complain otherwise. Pass in a
3279 new setting for the ims options if they have changed. */
3280
3281 previous = (bravalue >= OP_ONCE)? code : NULL;
3282 *code = bravalue;
3283 tempcode = code;
3284 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3285
3286 if (!compile_regex(
3287 newoptions, /* The complete new option state */
3288 options & PCRE_IMS, /* The previous ims option state */
3289 brackets, /* Extracting bracket count */
3290 &tempcode, /* Where to put code (updated) */
3291 &ptr, /* Input pointer (updated) */
3292 errorcodeptr, /* Where to put an error message */
3293 (bravalue == OP_ASSERTBACK ||
3294 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3295 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3296 &subfirstbyte, /* For possible first char */
3297 &subreqbyte, /* For possible last char */
3298 bcptr, /* Current branch chain */
3299 cd)) /* Tables block */
3300 goto FAILED;
3301
3302 /* At the end of compiling, code is still pointing to the start of the
3303 group, while tempcode has been updated to point past the end of the group
3304 and any option resetting that may follow it. The pattern pointer (ptr)
3305 is on the bracket. */
3306
3307 /* If this is a conditional bracket, check that there are no more than
3308 two branches in the group. */
3309
3310 else if (bravalue == OP_COND)
3311 {
3312 uschar *tc = code;
3313 int condcount = 0;
3314
3315 do {
3316 condcount++;
3317 tc += GET(tc,1);
3318 }
3319 while (*tc != OP_KET);
3320
3321 if (condcount > 2)
3322 {
3323 *errorcodeptr = ERR27;
3324 goto FAILED;
3325 }
3326
3327 /* If there is just one branch, we must not make use of its firstbyte or
3328 reqbyte, because this is equivalent to an empty second branch. */
3329
3330 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3331 }
3332
3333 /* Handle updating of the required and first characters. Update for normal
3334 brackets of all kinds, and conditions with two branches (see code above).
3335 If the bracket is followed by a quantifier with zero repeat, we have to
3336 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3337 main loop so that they can be accessed for the back off. */
3338
3339 zeroreqbyte = reqbyte;
3340 zerofirstbyte = firstbyte;
3341 groupsetfirstbyte = FALSE;
3342
3343 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3344 {
3345 /* If we have not yet set a firstbyte in this branch, take it from the
3346 subpattern, remembering that it was set here so that a repeat of more
3347 than one can replicate it as reqbyte if necessary. If the subpattern has
3348 no firstbyte, set "none" for the whole branch. In both cases, a zero
3349 repeat forces firstbyte to "none". */
3350
3351 if (firstbyte == REQ_UNSET)
3352 {
3353 if (subfirstbyte >= 0)
3354 {
3355 firstbyte = subfirstbyte;
3356 groupsetfirstbyte = TRUE;
3357 }
3358 else firstbyte = REQ_NONE;
3359 zerofirstbyte = REQ_NONE;
3360 }
3361
3362 /* If firstbyte was previously set, convert the subpattern's firstbyte
3363 into reqbyte if there wasn't one, using the vary flag that was in
3364 existence beforehand. */
3365
3366 else if (subfirstbyte >= 0 && subreqbyte < 0)
3367 subreqbyte = subfirstbyte | tempreqvary;
3368
3369 /* If the subpattern set a required byte (or set a first byte that isn't
3370 really the first byte - see above), set it. */
3371
3372 if (subreqbyte >= 0) reqbyte = subreqbyte;
3373 }
3374
3375 /* For a forward assertion, we take the reqbyte, if set. This can be
3376 helpful if the pattern that follows the assertion doesn't set a different
3377 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3378 for an assertion, however because it leads to incorrect effect for patterns
3379 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3380 of a firstbyte. This is overcome by a scan at the end if there's no
3381 firstbyte, looking for an asserted first char. */
3382
3383 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3384
3385 /* Now update the main code pointer to the end of the group. */
3386
3387 code = tempcode;
3388
3389 /* Error if hit end of pattern */
3390
3391 if (*ptr != ')')
3392 {
3393 *errorcodeptr = ERR14;
3394 goto FAILED;
3395 }
3396 break;
3397
3398 /* Check \ for being a real metacharacter; if not, fall through and handle
3399 it as a data character at the start of a string. Escape items are checked
3400 for validity in the pre-compiling pass. */
3401
3402 case '\\':
3403 tempptr = ptr;
3404 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3405
3406 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3407 are arranged to be the negation of the corresponding OP_values. For the
3408 back references, the values are ESC_REF plus the reference number. Only
3409 back references and those types that consume a character may be repeated.
3410 We can test for values between ESC_b and ESC_Z for the latter; this may
3411 have to change if any new ones are ever created. */
3412
3413 if (c < 0)
3414 {
3415 if (-c == ESC_Q) /* Handle start of quoted string */
3416 {
3417 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3418 else inescq = TRUE;
3419 continue;
3420 }
3421
3422 /* For metasequences that actually match a character, we disable the
3423 setting of a first character if it hasn't already been set. */
3424
3425 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3426 firstbyte = REQ_NONE;
3427
3428 /* Set values to reset to if this is followed by a zero repeat. */
3429
3430 zerofirstbyte = firstbyte;
3431 zeroreqbyte = reqbyte;
3432
3433 /* Back references are handled specially */
3434
3435 if (-c >= ESC_REF)
3436 {
3437 int number = -c - ESC_REF;
3438 previous = code;
3439 *code++ = OP_REF;
3440 PUT2INC(code, 0, number);
3441 }
3442
3443 /* So are Unicode property matches, if supported. We know that get_ucp
3444 won't fail because it was tested in the pre-pass. */
3445
3446 #ifdef SUPPORT_UCP
3447 else if (-c == ESC_P || -c == ESC_p)
3448 {
3449 BOOL negated;
3450 int pdata;
3451 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3452 previous = code;
3453 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3454 *code++ = ptype;
3455 *code++ = pdata;
3456 }
3457 #endif
3458
3459 /* For the rest, we can obtain the OP value by negating the escape
3460 value */
3461
3462 else
3463 {
3464 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3465 *code++ = -c;
3466 }
3467 continue;
3468 }
3469
3470 /* We have a data character whose value is in c. In UTF-8 mode it may have
3471 a value > 127. We set its representation in the length/buffer, and then
3472 handle it as a data character. */
3473
3474 #ifdef SUPPORT_UTF8
3475 if (utf8 && c > 127)
3476 mclength = _pcre_ord2utf8(c, mcbuffer);
3477 else
3478 #endif
3479
3480 {
3481 mcbuffer[0] = c;
3482 mclength = 1;
3483 }
3484
3485 goto ONE_CHAR;
3486
3487 /* Handle a literal character. It is guaranteed not to be whitespace or #
3488 when the extended flag is set. If we are in UTF-8 mode, it may be a
3489 multi-byte literal character. */
3490
3491 default:
3492 NORMAL_CHAR:
3493 mclength = 1;
3494 mcbuffer[0] = c;
3495
3496 #ifdef SUPPORT_UTF8
3497 if (utf8 && (c & 0xc0) == 0xc0)
3498 {
3499 while ((ptr[1] & 0xc0) == 0x80)
3500 mcbuffer[mclength++] = *(++ptr);
3501 }
3502 #endif
3503
3504 /* At this point we have the character's bytes in mcbuffer, and the length
3505 in mclength. When not in UTF-8 mode, the length is always 1. */
3506
3507 ONE_CHAR:
3508 previous = code;
3509 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3510 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3511
3512 /* Set the first and required bytes appropriately. If no previous first
3513 byte, set it from this character, but revert to none on a zero repeat.
3514 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3515 repeat. */
3516
3517 if (firstbyte == REQ_UNSET)
3518 {
3519 zerofirstbyte = REQ_NONE;
3520 zeroreqbyte = reqbyte;
3521
3522 /* If the character is more than one byte long, we can set firstbyte
3523 only if it is not to be matched caselessly. */
3524
3525 if (mclength == 1 || req_caseopt == 0)
3526 {
3527 firstbyte = mcbuffer[0] | req_caseopt;
3528 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3529 }
3530 else firstbyte = reqbyte = REQ_NONE;
3531 }
3532
3533 /* firstbyte was previously set; we can set reqbyte only the length is
3534 1 or the matching is caseful. */
3535
3536 else
3537 {
3538 zerofirstbyte = firstbyte;
3539 zeroreqbyte = reqbyte;
3540 if (mclength == 1 || req_caseopt == 0)
3541 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3542 }
3543
3544 break; /* End of literal character handling */
3545 }
3546 } /* end of big loop */
3547
3548 /* Control never reaches here by falling through, only by a goto for all the
3549 error states. Pass back the position in the pattern so that it can be displayed
3550 to the user for diagnosing the error. */
3551
3552 FAILED:
3553 *ptrptr = ptr;
3554 return FALSE;
3555 }
3556
3557
3558
3559
3560 /*************************************************
3561 * Compile sequence of alternatives *
3562 *************************************************/
3563
3564 /* On entry, ptr is pointing past the bracket character, but on return
3565 it points to the closing bracket, or vertical bar, or end of string.
3566 The code variable is pointing at the byte into which the BRA operator has been
3567 stored. If the ims options are changed at the start (for a (?ims: group) or
3568 during any branch, we need to insert an OP_OPT item at the start of every
3569 following branch to ensure they get set correctly at run time, and also pass
3570 the new options into every subsequent branch compile.
3571
3572 Argument:
3573 options option bits, including any changes for this subpattern
3574 oldims previous settings of ims option bits
3575 brackets -> int containing the number of extracting brackets used
3576 codeptr -> the address of the current code pointer
3577 ptrptr -> the address of the current pattern pointer
3578 errorcodeptr -> pointer to error code variable
3579 lookbehind TRUE if this is a lookbehind assertion
3580 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3581 firstbyteptr place to put the first required character, or a negative number
3582 reqbyteptr place to put the last required character, or a negative number
3583 bcptr pointer to the chain of currently open branches
3584 cd points to the data block with tables pointers etc.
3585
3586 Returns: TRUE on success
3587 */
3588
3589 static BOOL
3590 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3591 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3592 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3593 {
3594 const uschar *ptr = *ptrptr;
3595 uschar *code = *codeptr;
3596 uschar *last_branch = code;
3597 uschar *start_bracket = code;
3598 uschar *reverse_count = NULL;
3599 int firstbyte, reqbyte;
3600 int branchfirstbyte, branchreqbyte;
3601 branch_chain bc;
3602
3603 bc.outer = bcptr;
3604 bc.current = code;
3605
3606 firstbyte = reqbyte = REQ_UNSET;
3607
3608 /* Offset is set zero to mark that this bracket is still open */
3609
3610 PUT(code, 1, 0);
3611 code += 1 + LINK_SIZE + skipbytes;
3612
3613 /* Loop for each alternative branch */
3614
3615 for (;;)
3616 {
3617 /* Handle a change of ims options at the start of the branch */
3618
3619 if ((options & PCRE_IMS) != oldims)
3620 {
3621 *code++ = OP_OPT;
3622 *code++ = options & PCRE_IMS;
3623 }
3624
3625 /* Set up dummy OP_REVERSE if lookbehind assertion */
3626
3627 if (lookbehind)
3628 {
3629 *code++ = OP_REVERSE;
3630 reverse_count = code;
3631 PUTINC(code, 0, 0);
3632 }
3633
3634 /* Now compile the branch */
3635
3636 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3637 &branchfirstbyte, &branchreqbyte, &bc, cd))
3638 {
3639 *ptrptr = ptr;
3640 return FALSE;
3641 }
3642
3643 /* If this is the first branch, the firstbyte and reqbyte values for the
3644 branch become the values for the regex. */
3645
3646 if (*last_branch != OP_ALT)
3647 {
3648 firstbyte = branchfirstbyte;
3649 reqbyte = branchreqbyte;
3650 }
3651
3652 /* If this is not the first branch, the first char and reqbyte have to
3653 match the values from all the previous branches, except that if the previous
3654 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3655 REQ_VARY for the regex. */
3656
3657 else
3658 {
3659 /* If we previously had a firstbyte, but it doesn't match the new branch,
3660 we have to abandon the firstbyte for the regex, but if there was previously
3661 no reqbyte, it takes on the value of the old firstbyte. */
3662
3663 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3664 {
3665 if (reqbyte < 0) reqbyte = firstbyte;
3666 firstbyte = REQ_NONE;
3667 }
3668
3669 /* If we (now or from before) have no firstbyte, a firstbyte from the
3670 branch becomes a reqbyte if there isn't a branch reqbyte. */
3671
3672 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3673 branchreqbyte = branchfirstbyte;
3674
3675 /* Now ensure that the reqbytes match */
3676
3677 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3678 reqbyte = REQ_NONE;
3679 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3680 }
3681
3682 /* If lookbehind, check that this branch matches a fixed-length string,
3683 and put the length into the OP_REVERSE item. Temporarily mark the end of
3684 the branch with OP_END. */
3685
3686 if (lookbehind)
3687 {
3688 int length;
3689 *code = OP_END;
3690 length = find_fixedlength(last_branch, options);
3691 DPRINTF(("fixed length = %d\n", length));
3692 if (length < 0)
3693 {
3694 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3695 *ptrptr = ptr;
3696 return FALSE;
3697 }
3698 PUT(reverse_count, 0, length);
3699 }
3700
3701 /* Reached end of expression, either ')' or end of pattern. Go back through
3702 the alternative branches and reverse the chain of offsets, with the field in
3703 the BRA item now becoming an offset to the first alternative. If there are
3704 no alternatives, it points to the end of the group. The length in the
3705 terminating ket is always the length of the whole bracketed item. If any of
3706 the ims options were changed inside the group, compile a resetting op-code
3707 following, except at the very end of the pattern. Return leaving the pointer
3708 at the terminating char. */
3709
3710 if (*ptr != '|')
3711 {
3712 int length = code - last_branch;
3713 do
3714 {
3715 int prev_length = GET(last_branch, 1);
3716 PUT(last_branch, 1, length);
3717 length = prev_length;
3718 last_branch -= length;
3719 }
3720 while (length > 0);
3721
3722 /* Fill in the ket */
3723
3724 *code = OP_KET;
3725 PUT(code, 1, code - start_bracket);
3726 code += 1 + LINK_SIZE;
3727
3728 /* Resetting option if needed */
3729
3730 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3731 {
3732 *code++ = OP_OPT;
3733 *code++ = oldims;
3734 }
3735
3736 /* Set values to pass back */
3737
3738 *codeptr = code;
3739 *ptrptr = ptr;
3740 *firstbyteptr = firstbyte;
3741 *reqbyteptr = reqbyte;
3742 return TRUE;
3743 }
3744
3745 /* Another branch follows; insert an "or" node. Its length field points back
3746 to the previous branch while the bracket remains open. At the end the chain
3747 is reversed. It's done like this so that the start of the bracket has a
3748 zero offset until it is closed, making it possible to detect recursion. */
3749
3750 *code = OP_ALT;
3751 PUT(code, 1, code - last_branch);
3752 bc.current = last_branch = code;
3753 code += 1 + LINK_SIZE;
3754 ptr++;
3755 }
3756 /* Control never reaches here */
3757 }
3758
3759
3760
3761
3762 /*************************************************
3763 * Check for anchored expression *
3764 *************************************************/
3765
3766 /* Try to find out if this is an anchored regular expression. Consider each
3767 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3768 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3769 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3770 counts, since OP_CIRC can match in the middle.
3771
3772 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3773 This is the code for \G, which means "match at start of match position, taking
3774 into account the match offset".
3775
3776 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3777 because that will try the rest of the pattern at all possible matching points,
3778 so there is no point trying again.... er ....
3779
3780 .... except when the .* appears inside capturing parentheses, and there is a
3781 subsequent back reference to those parentheses. We haven't enough information
3782 to catch that case precisely.
3783
3784 At first, the best we could do was to detect when .* was in capturing brackets
3785 and the highest back reference was greater than or equal to that level.
3786 However, by keeping a bitmap of the first 31 back references, we can catch some
3787 of the more common cases more precisely.
3788
3789 Arguments:
3790 code points to start of expression (the bracket)
3791 options points to the options setting
3792 bracket_map a bitmap of which brackets we are inside while testing; this
3793 handles up to substring 31; after that we just have to take
3794 the less precise approach
3795 backref_map the back reference bitmap
3796
3797 Returns: TRUE or FALSE
3798 */
3799
3800 static BOOL
3801 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3802 unsigned int backref_map)
3803 {
3804 do {
3805 const uschar *scode =
3806 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3807 register int op = *scode;
3808
3809 /* Capturing brackets */
3810
3811 if (op > OP_BRA)
3812 {
3813 int new_map;
3814 op -= OP_BRA;
3815 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3816 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3817 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3818 }
3819
3820 /* Other brackets */
3821
3822 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3823 {
3824 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3825 }
3826
3827 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3828 are or may be referenced. */
3829
3830 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3831 (*options & PCRE_DOTALL) != 0)
3832 {
3833 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3834 }
3835
3836 /* Check for explicit anchoring */
3837
3838 else if (op != OP_SOD && op != OP_SOM &&
3839 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3840 return FALSE;
3841 code += GET(code, 1);
3842 }
3843 while (*code == OP_ALT); /* Loop for each alternative */
3844 return TRUE;
3845 }
3846
3847
3848
3849 /*************************************************
3850 * Check for starting with ^ or .* *
3851 *************************************************/
3852
3853 /* This is called to find out if every branch starts with ^ or .* so that
3854 "first char" processing can be done to speed things up in multiline
3855 matching and for non-DOTALL patterns that start with .* (which must start at
3856 the beginning or after \n). As in the case of is_anchored() (see above), we
3857 have to take account of back references to capturing brackets that contain .*
3858 because in that case we can't make the assumption.
3859
3860 Arguments:
3861 code points to start of expression (the bracket)
3862 bracket_map a bitmap of which brackets we are inside while testing; this
3863 handles up to substring 31; after that we just have to take
3864 the less precise approach
3865 backref_map the back reference bitmap
3866
3867 Returns: TRUE or FALSE
3868 */
3869
3870 static BOOL
3871 is_startline(const uschar *code, unsigned int bracket_map,
3872 unsigned int backref_map)
3873 {
3874 do {
3875 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3876 FALSE);
3877 register int op = *scode;
3878
3879 /* Capturing brackets */
3880
3881 if (op > OP_BRA)
3882 {
3883 int new_map;
3884 op -= OP_BRA;
3885 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3886 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3887 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3888 }
3889
3890 /* Other brackets */
3891
3892 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3893 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3894
3895 /* .* means "start at start or after \n" if it isn't in brackets that
3896 may be referenced. */
3897
3898 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3899 {
3900 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3901 }
3902
3903 /* Check for explicit circumflex */
3904
3905 else if (op != OP_CIRC) return FALSE;
3906
3907 /* Move on to the next alternative */
3908
3909 code += GET(code, 1);
3910 }
3911 while (*code == OP_ALT); /* Loop for each alternative */
3912 return TRUE;
3913 }
3914
3915
3916
3917 /*************************************************
3918 * Check for asserted fixed first char *
3919 *************************************************/
3920
3921 /* During compilation, the "first char" settings from forward assertions are
3922 discarded, because they can cause conflicts with actual literals that follow.
3923 However, if we end up without a first char setting for an unanchored pattern,
3924 it is worth scanning the regex to see if there is an initial asserted first
3925 char. If all branches start with the same asserted char, or with a bracket all
3926 of whose alternatives start with the same asserted char (recurse ad lib), then
3927 we return that char, otherwise -1.
3928
3929 Arguments:
3930 code points to start of expression (the bracket)
3931 options pointer to the options (used to check casing changes)
3932 inassert TRUE if in an assertion
3933
3934 Returns: -1 or the fixed first char
3935 */
3936
3937 static int
3938 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3939 {
3940 register int c = -1;
3941 do {
3942 int d;
3943 const uschar *scode =
3944 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3945 register int op = *scode;
3946
3947 if (op >= OP_BRA) op = OP_BRA;
3948
3949 switch(op)
3950 {
3951 default:
3952 return -1;
3953
3954 case OP_BRA:
3955 case OP_ASSERT:
3956 case OP_ONCE:
3957 case OP_COND:
3958 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3959 return -1;
3960 if (c < 0) c = d; else if (c != d) return -1;
3961 break;
3962
3963 case OP_EXACT: /* Fall through */
3964 scode += 2;
3965
3966 case OP_CHAR:
3967 case OP_CHARNC:
3968 case OP_PLUS:
3969 case OP_MINPLUS:
3970 if (!inassert) return -1;
3971 if (c < 0)
3972 {
3973 c = scode[1];
3974 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3975 }
3976 else if (c != scode[1]) return -1;
3977 break;
3978 }
3979
3980 code += GET(code, 1);
3981 }
3982 while (*code == OP_ALT);
3983 return c;
3984 }
3985
3986
3987
3988 /*************************************************
3989 * Compile a Regular Expression *
3990 *************************************************/
3991
3992 /* This function takes a string and returns a pointer to a block of store
3993 holding a compiled version of the expression. The original API for this
3994 function had no error code return variable; it is retained for backwards
3995 compatibility. The new function is given a new name.
3996
3997 Arguments:
3998 pattern the regular expression
3999 options various option bits
4000 errorcodeptr pointer to error code variable (pcre_compile2() only)
4001 can be NULL if you don't want a code value
4002 errorptr pointer to pointer to error text
4003 erroroffset ptr offset in pattern where error was detected
4004 tables pointer to character tables or NULL
4005
4006 Returns: pointer to compiled data block, or NULL on error,
4007 with errorptr and erroroffset set
4008 */
4009
4010 PCRE_DATA_SCOPE pcre *
4011 pcre_compile(const char *pattern, int options, const char **errorptr,
4012 int *erroroffset, const unsigned char *tables)
4013 {
4014 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
4015 }
4016
4017
4018
4019 PCRE_DATA_SCOPE pcre *
4020 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
4021 const char **errorptr, int *erroroffset, const unsigned char *tables)
4022 {
4023 real_pcre *re;
4024 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4025 int c, firstbyte, reqbyte, newline;
4026 int bracount = 0;
4027 int branch_extra = 0;
4028 int branch_newextra;
4029 int item_count = -1;
4030 int name_count = 0;
4031 int max_name_size = 0;
4032 int lastitemlength = 0;
4033 int errorcode = 0;
4034 #ifdef SUPPORT_UTF8
4035 BOOL utf8;
4036 BOOL class_utf8;
4037 #endif
4038 BOOL inescq = FALSE;
4039 BOOL capturing;
4040 unsigned int brastackptr = 0;
4041 size_t size;
4042 uschar *code;
4043 const uschar *codestart;
4044 const uschar *ptr;
4045 compile_data compile_block;
4046 compile_data *cd = &compile_block;
4047 int brastack[BRASTACK_SIZE];
4048 uschar bralenstack[BRASTACK_SIZE];
4049
4050 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4051 can do is just return NULL, but we can set a code value if there is a code
4052 pointer. */
4053
4054 if (errorptr == NULL)
4055 {
4056 if (errorcodeptr != NULL) *errorcodeptr = 99;
4057 return NULL;
4058 }
4059
4060 *errorptr = NULL;
4061 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
4062
4063 /* However, we can give a message for this error */
4064
4065 if (erroroffset == NULL)
4066 {
4067 errorcode = ERR16;
4068 goto PCRE_EARLY_ERROR_RETURN;
4069 }
4070
4071 *erroroffset = 0;
4072
4073 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4074
4075 #ifdef SUPPORT_UTF8
4076 utf8 = (options & PCRE_UTF8) != 0;
4077 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4078 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
4079 {
4080 errorcode = ERR44;
4081 goto PCRE_EARLY_ERROR_RETURN;
4082 }
4083 #else
4084 if ((options & PCRE_UTF8) != 0)
4085 {
4086 errorcode = ERR32;
4087 goto PCRE_EARLY_ERROR_RETURN;
4088 }
4089 #endif
4090
4091 if ((options & ~PUBLIC_OPTIONS) != 0)
4092 {
4093 errorcode = ERR17;
4094 goto PCRE_EARLY_ERROR_RETURN;
4095 }
4096
4097 /* Set up pointers to the individual character tables */
4098
4099 if (tables == NULL) tables = _pcre_default_tables;
4100 cd->lcc = tables + lcc_offset;
4101 cd->fcc = tables + fcc_offset;
4102 cd->cbits = tables + cbits_offset;
4103 cd->ctypes = tables + ctypes_offset;
4104
4105 /* Handle different types of newline. The two bits give four cases. The current
4106 code allows for one- or two-byte sequences. */
4107
4108 switch (options & PCRE_NEWLINE_CRLF)
4109 {
4110 default: newline = NEWLINE; break; /* Compile-time default */
4111 case PCRE_NEWLINE_CR: newline = '\r'; break;
4112 case PCRE_NEWLINE_LF: newline = '\n'; break;
4113 case PCRE_NEWLINE_CR+
4114 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4115 }
4116
4117 if (newline > 255)
4118 {
4119 cd->nllen = 2;
4120 cd->nl[0] = (newline >> 8) & 255;
4121 cd->nl[1] = newline & 255;
4122 }
4123 else
4124 {
4125 cd->nllen = 1;
4126 cd->nl[0] = newline;
4127 }
4128
4129 /* Maximum back reference and backref bitmap. This is updated for numeric
4130 references during the first pass, but for named references during the actual
4131 compile pass. The bitmap records up to 31 back references to help in deciding
4132 whether (.*) can be treated as anchored or not. */
4133
4134 cd->top_backref = 0;
4135 cd->backref_map = 0;
4136
4137 /* Reflect pattern for debugging output */
4138
4139 DPRINTF(("------------------------------------------------------------------\n"));
4140 DPRINTF(("%s\n", pattern));
4141
4142 /* The first thing to do is to make a pass over the pattern to compute the
4143 amount of store required to hold the compiled code. This does not have to be
4144 perfect as long as errors are overestimates. At the same time we can detect any
4145 flag settings right at the start, and extract them. Make an attempt to correct
4146 for any counted white space if an "extended" flag setting appears late in the
4147 pattern. We can't be so clever for #-comments. */
4148
4149 ptr = (const uschar *)(pattern - 1);
4150 while ((c = *(++ptr)) != 0)
4151 {
4152 int min, max;
4153 int class_optcount;
4154 int bracket_length;
4155 int duplength;
4156
4157 /* If we are inside a \Q...\E sequence, all chars are literal */
4158
4159 if (inescq)
4160 {
4161 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4162 goto NORMAL_CHAR;
4163 }
4164
4165 /* Otherwise, first check for ignored whitespace and comments */
4166
4167 if ((options & PCRE_EXTENDED) != 0)
4168 {
4169 if ((cd->ctypes[c] & ctype_space) != 0) continue;
4170 if (c == '#')
4171 {
4172 while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
4173 if (*ptr != 0)
4174 {
4175 ptr += cd->nllen - 1;
4176 continue;
4177 }
4178 break; /* End loop at end of pattern */
4179 }
4180 }
4181
4182 item_count++; /* Is zero for the first non-comment item */
4183
4184 /* Allow space for auto callout before every item except quantifiers. */
4185
4186 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4187 c != '*' && c != '+' && c != '?' &&
4188 (c != '{' || !is_counted_repeat(ptr + 1)))
4189 length += 2 + 2*LINK_SIZE;
4190
4191 switch(c)
4192 {
4193 /* A backslashed item may be an escaped data character or it may be a
4194 character type. */
4195
4196 case '\\':
4197 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4198 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4199
4200 lastitemlength = 1; /* Default length of last item for repeats */
4201
4202 if (c >= 0) /* Data character */
4203 {
4204 length += 2; /* For a one-byte character */
4205
4206 #ifdef SUPPORT_UTF8
4207 if (utf8 && c > 127)
4208 {
4209 int i;
4210 for (i = 0; i < _pcre_utf8_table1_size; i++)
4211 if (c <= _pcre_utf8_table1[i]) break;
4212 length += i;
4213 lastitemlength += i;
4214 }
4215 #endif
4216
4217 continue;
4218 }
4219
4220 /* If \Q, enter "literal" mode */
4221
4222 if (-c == ESC_Q)
4223 {
4224 inescq = TRUE;
4225 continue;
4226 }
4227
4228 /* \X is supported only if Unicode property support is compiled */
4229
4230 #ifndef SUPPORT_UCP
4231 if (-c == ESC_X)
4232 {
4233 errorcode = ERR45;
4234 goto PCRE_ERROR_RETURN;
4235 }
4236 #endif
4237
4238 /* \P and \p are for Unicode properties, but only when the support has
4239 been compiled. Each item needs 3 bytes. */
4240
4241 else if (-c == ESC_P || -c == ESC_p)
4242 {
4243 #ifdef SUPPORT_UCP
4244 BOOL negated;
4245 BOOL pdata;
4246 length += 3;
4247 lastitemlength = 3;
4248 if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)
4249 goto PCRE_ERROR_RETURN;
4250 continue;
4251 #else
4252 errorcode = ERR45;
4253 goto PCRE_ERROR_RETURN;
4254 #endif
4255 }
4256
4257 /* Other escapes need one byte */
4258
4259 length++;
4260
4261 /* A back reference needs an additional 2 bytes, plus either one or 5
4262 bytes for a repeat. We also need to keep the value of the highest
4263 back reference. */
4264
4265 if (c <= -ESC_REF)
4266 {
4267 int refnum = -c - ESC_REF;
4268 cd->backref_map |= (refnum < 32)? (1 << refnum) : 1;
4269 if (refnum > cd->top_backref)
4270 cd->top_backref = refnum;
4271 length += 2; /* For single back reference */
4272 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4273 {
4274 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4275 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4276 if ((min == 0 && (max == 1 || max == -1)) ||
4277 (min == 1 && max == -1))
4278 length++;
4279 else length += 5;
4280 if (ptr[1] == '?') ptr++;
4281 }
4282 }
4283 continue;
4284
4285 case '^': /* Single-byte metacharacters */
4286 case '.':
4287 case '$':
4288 length++;
4289 lastitemlength = 1;
4290 continue;
4291
4292 case '*': /* These repeats won't be after brackets; */
4293 case '+': /* those are handled separately */
4294 case '?':
4295 length++;
4296 goto POSESSIVE; /* A few lines below */
4297
4298 /* This covers the cases of braced repeats after a single char, metachar,
4299 class, or back reference. */
4300
4301 case '{':
4302 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4303 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4304 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4305
4306 /* These special cases just insert one extra opcode */
4307
4308 if ((min == 0 && (max == 1 || max == -1)) ||
4309 (min == 1 && max == -1))
4310 length++;
4311
4312 /* These cases might insert additional copies of a preceding character. */
4313
4314 else
4315 {
4316 if (min != 1)
4317 {
4318 length -= lastitemlength; /* Uncount the original char or metachar */
4319 if (min > 0) length += 3 + lastitemlength;
4320 }
4321 length += lastitemlength + ((max > 0)? 3 : 1);
4322 }
4323
4324 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4325
4326 POSESSIVE: /* Test for possessive quantifier */
4327 if (ptr[1] == '+')
4328 {
4329 ptr++;
4330 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4331 }
4332 continue;
4333
4334 /* An alternation contains an offset to the next branch or ket. If any ims
4335 options changed in the previous branch(es), and/or if we are in a
4336 lookbehind assertion, extra space will be needed at the start of the
4337 branch. This is handled by branch_extra. */
4338
4339 case '|':
4340 length += 1 + LINK_SIZE + branch_extra;
4341 continue;
4342
4343 /* A character class uses 33 characters provided that all the character
4344 values are less than 256. Otherwise, it uses a bit map for low valued
4345 characters, and individual items for others. Don't worry about character
4346 types that aren't allowed in classes - they'll get picked up during the
4347 compile. A character class that contains only one single-byte character
4348 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4349 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4350
4351 case '[':
4352 if (*(++ptr) == '^')
4353 {
4354 class_optcount = 10; /* Greater than one */
4355 ptr++;
4356 }
4357 else class_optcount = 0;
4358
4359 #ifdef SUPPORT_UTF8
4360 class_utf8 = FALSE;
4361 #endif
4362
4363 /* Written as a "do" so that an initial ']' is taken as data */
4364
4365 if (*ptr != 0) do
4366 {
4367 /* Inside \Q...\E everything is literal except \E */
4368
4369 if (inescq)
4370 {
4371 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4372 inescq = FALSE;
4373 ptr += 1;
4374 continue;
4375 }
4376
4377 /* Outside \Q...\E, check for escapes */
4378
4379 if (*ptr == '\\')
4380 {
4381 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4382 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4383
4384 /* \b is backspace inside a class; \X is literal */
4385
4386 if (-c == ESC_b) c = '\b';
4387 else if (-c == ESC_X) c = 'X';
4388
4389 /* \Q enters quoting mode */
4390
4391 else if (-c == ESC_Q)
4392 {
4393 inescq = TRUE;
4394 continue;
4395 }
4396
4397 /* Handle escapes that turn into characters */
4398
4399 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4400
4401 /* Escapes that are meta-things. The normal ones just affect the
4402 bit map, but Unicode properties require an XCLASS extended item. */
4403
4404 else
4405 {
4406 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4407 #ifdef SUPPORT_UTF8
4408 if (-c == ESC_p || -c == ESC_P)
4409 {
4410 if (!class_utf8)
4411 {
4412 class_utf8 = TRUE;
4413 length += LINK_SIZE + 2;
4414 }
4415 length += 3;
4416 }
4417 #endif
4418 }
4419 }
4420
4421 /* Check the syntax for POSIX stuff. The bits we actually handle are
4422 checked during the real compile phase. */
4423
4424 else if (*ptr == '[' &&
4425 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
4426 check_posix_syntax(ptr, &ptr, cd))
4427 {
4428 ptr++;
4429 class_optcount = 10; /* Make sure > 1 */
4430 }
4431
4432 /* Anything else increments the possible optimization count. We have to
4433 detect ranges here so that we can compute the number of extra ranges for
4434 caseless wide characters when UCP support is available. If there are wide
4435 characters, we are going to have to use an XCLASS, even for single
4436 characters. */
4437
4438 else
4439 {
4440 int d;
4441
4442 GET_ONE_CHARACTER:
4443
4444 #ifdef SUPPORT_UTF8
4445 if (utf8)
4446 {
4447 int extra = 0;
4448 GETCHARLEN(c, ptr, extra);
4449 ptr += extra;
4450 }
4451 else c = *ptr;
4452 #else
4453 c = *ptr;
4454 #endif
4455
4456 /* Come here from handling \ above when it escapes to a char value */
4457
4458 NON_SPECIAL_CHARACTER:
4459 class_optcount++;
4460
4461 d = -1;
4462 if (ptr[1] == '-')
4463 {
4464 uschar const *hyptr = ptr++;
4465 if (ptr[1] == '\\')
4466 {
4467 ptr++;
4468 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4469 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4470 if (-d == ESC_b) d = '\b'; /* backspace */
4471 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4472 }
4473 else if (ptr[1] != 0 && ptr[1] != ']')
4474 {
4475 ptr++;
4476 #ifdef SUPPORT_UTF8
4477 if (utf8)
4478 {
4479 int extra = 0;
4480 GETCHARLEN(d, ptr, extra);
4481 ptr += extra;
4482 }
4483 else
4484 #endif
4485 d = *ptr;
4486 }
4487 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4488 }
4489
4490 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4491 127 for caseless matching, we will need to use an XCLASS. */
4492
4493 if (d >= 0)
4494 {
4495 class_optcount = 10; /* Ensure > 1 */
4496 if (d < c)
4497 {
4498 errorcode = ERR8;
4499 goto PCRE_ERROR_RETURN;
4500 }
4501
4502 #ifdef SUPPORT_UTF8
4503 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4504 {
4505 uschar buffer[6];
4506 if (!class_utf8) /* Allow for XCLASS overhead */
4507 {
4508 class_utf8 = TRUE;
4509 length += LINK_SIZE + 2;
4510 }
4511
4512 #ifdef SUPPORT_UCP
4513 /* If we have UCP support, find out how many extra ranges are
4514 needed to map the other case of characters within this range. We
4515 have to mimic the range optimization here, because extending the
4516 range upwards might push d over a boundary that makes is use
4517 another byte in the UTF-8 representation. */
4518
4519 if ((options & PCRE_CASELESS) != 0)
4520 {
4521 int occ, ocd;
4522 int cc = c;
4523 int origd = d;
4524 while (get_othercase_range(&cc, origd, &occ, &ocd))
4525 {
4526 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4527
4528 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4529 { /* if there is overlap, */
4530 c = occ; /* noting that if occ < c */
4531 continue; /* we can't have ocd > d */
4532 } /* because a subrange is */
4533 if (ocd > d && occ <= d + 1) /* always shorter than */
4534 { /* the basic range. */
4535 d = ocd;
4536 continue;
4537 }
4538
4539 /* An extra item is needed */
4540
4541 length += 1 + _pcre_ord2utf8(occ, buffer) +
4542 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4543 }
4544 }
4545 #endif /* SUPPORT_UCP */
4546
4547 /* The length of the (possibly extended) range */
4548
4549 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4550 }
4551 #endif /* SUPPORT_UTF8 */
4552
4553 }
4554
4555 /* We have a single character. There is nothing to be done unless we
4556 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4557 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4558 support. */
4559
4560 else
4561 {
4562 #ifdef SUPPORT_UTF8
4563 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4564 {
4565 uschar buffer[6];
4566 class_optcount = 10; /* Ensure > 1 */
4567 if (!class_utf8) /* Allow for XCLASS overhead */
4568 {
4569 class_utf8 = TRUE;
4570 length += LINK_SIZE + 2;
4571 }
4572 #ifdef SUPPORT_UCP
4573 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4574 (1 + _pcre_ord2utf8(c, buffer));
4575 #else /* SUPPORT_UCP */
4576 length += 1 + _pcre_ord2utf8(c, buffer);
4577 #endif /* SUPPORT_UCP */
4578 }
4579 #endif /* SUPPORT_UTF8 */
4580 }
4581 }
4582 }
4583 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4584
4585 if (*ptr == 0) /* Missing terminating ']' */
4586 {
4587 errorcode = ERR6;
4588 goto PCRE_ERROR_RETURN;
4589 }
4590
4591 /* We can optimize when there was only one optimizable character. Repeats
4592 for positive and negated single one-byte chars are handled by the general
4593 code. Here, we handle repeats for the class opcodes. */
4594
4595 if (class_optcount == 1) length += 3; else
4596 {
4597 length += 33;
4598
4599 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4600 we also need extra for wrapping the whole thing in a sub-pattern. */
4601
4602 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4603 {
4604 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4605 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4606 if ((min == 0 && (max == 1 || max == -1)) ||
4607 (min == 1 && max == -1))
4608 length++;
4609 else length += 5;
4610 if (ptr[1] == '+')
4611 {
4612 ptr++;
4613 length += 2 + 2*LINK_SIZE;
4614 }
4615 else if (ptr[1] == '?') ptr++;
4616 }
4617 }
4618 continue;
4619
4620 /* Brackets may be genuine groups or special things */
4621
4622 case '(':
4623 branch_newextra = 0;
4624 bracket_length = 1 + LINK_SIZE;
4625 capturing = FALSE;
4626
4627 /* Handle special forms of bracket, which all start (? */
4628
4629 if (ptr[1] == '?')
4630 {
4631 int set, unset;
4632 int *optset;
4633
4634 switch (c = ptr[2])
4635 {
4636 /* Skip over comments entirely */
4637 case '#':
4638 ptr += 3;
4639 while (*ptr != 0 && *ptr != ')') ptr++;
4640 if (*ptr == 0)
4641 {
4642 errorcode = ERR18;
4643 goto PCRE_ERROR_RETURN;
4644 }
4645 continue;
4646
4647 /* Non-referencing groups and lookaheads just move the pointer on, and
4648 then behave like a non-special bracket, except that they don't increment
4649 the count of extracting brackets. Ditto for the "once only" bracket,
4650 which is in Perl from version 5.005. */
4651
4652 case ':':
4653 case '=':
4654 case '!':
4655 case '>':
4656 ptr += 2;
4657 break;
4658
4659 /* Named subpatterns are an extension copied from Python */
4660
4661 case 'P':
4662 ptr += 3;
4663
4664 /* Handle the definition of a named subpattern */
4665
4666 if (*ptr == '<')
4667 {
4668 const uschar *p; /* Don't amalgamate; some compilers */
4669 p = ++ptr; /* grumble at autoincrement in declaration */
4670 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4671 if (*ptr != '>')
4672 {
4673 errorcode = ERR42;
4674 goto PCRE_ERROR_RETURN;
4675 }
4676 name_count++;
4677 if (name_count > MAX_NAME_COUNT)
4678 {
4679 errorcode = ERR49;
4680 goto PCRE_ERROR_RETURN;
4681 }
4682 if (ptr - p > max_name_size)
4683 {
4684 max_name_size = (ptr - p);
4685 if (max_name_size > MAX_NAME_SIZE)
4686 {
4687 errorcode = ERR48;
4688 goto PCRE_ERROR_RETURN;
4689 }
4690 }
4691 capturing = TRUE; /* Named parentheses are always capturing */
4692 break; /* Go handle capturing parentheses */
4693 }
4694
4695 /* Handle back references and recursive calls to named subpatterns */
4696
4697 if (*ptr == '=' || *ptr == '>')
4698 {
4699 length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */
4700 while ((cd->ctypes[*(++ptr)] & ctype_word) != 0);
4701 if (*ptr != ')')
4702 {
4703 errorcode = ERR42;
4704 goto PCRE_ERROR_RETURN;
4705 }
4706 goto RECURSE_CHECK_QUANTIFIED;
4707 }
4708
4709 /* Unknown character after (?P */
4710
4711 errorcode = ERR41;
4712 goto PCRE_ERROR_RETURN;
4713
4714 /* (?R) specifies a recursive call to the regex, which is an extension
4715 to provide the facility which can be obtained by (?p{perl-code}) in
4716 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4717
4718 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4719 the appropriate numbered brackets. This includes both recursive and
4720 non-recursive calls. (?R) is now synonymous with (?0). */
4721
4722 case 'R':
4723 ptr++;
4724
4725 case '0': case '1': case '2': case '3': case '4':
4726 case '5': case '6': case '7': case '8': case '9':
4727 ptr += 2;
4728 if (c != 'R')
4729 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4730 if (*ptr != ')')
4731 {
4732 errorcode = ERR29;
4733 goto PCRE_ERROR_RETURN;
4734 }
4735 length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */
4736
4737 /* If this item is quantified, it will get wrapped inside brackets so
4738 as to use the code for quantified brackets. We jump down and use the
4739 code that handles this for real brackets. Come here from code for
4740 named recursions/subroutines. */
4741
4742 RECURSE_CHECK_QUANTIFIED:
4743 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4744 {
4745 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4746 duplength = 5 + 3 * LINK_SIZE;
4747 goto HANDLE_QUANTIFIED_BRACKETS;
4748 }
4749 continue;
4750
4751 /* (?C) is an extension which provides "callout" - to provide a bit of
4752 the functionality of the Perl (?{...}) feature. An optional number may
4753 follow (default is zero). */
4754
4755 case 'C':
4756 ptr += 2;
4757 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4758 if (*ptr != ')')
4759 {
4760 errorcode = ERR39;
4761 goto PCRE_ERROR_RETURN;
4762 }
4763 length += 2 + 2*LINK_SIZE;
4764 continue;
4765
4766 /* Lookbehinds are in Perl from version 5.005 */
4767
4768 case '<':
4769 ptr += 3;
4770 if (*ptr == '=' || *ptr == '!')
4771 {
4772 branch_newextra = 1 + LINK_SIZE;
4773 length += 1 + LINK_SIZE; /* For the first branch */
4774 break;
4775 }
4776 errorcode = ERR24;
4777 goto PCRE_ERROR_RETURN;
4778
4779 /* Conditionals are in Perl from version 5.005. The bracket must either
4780 be followed by a number (for bracket reference) or by an assertion
4781 group. PCRE extends this by allowing a name to reference a named group;
4782 unfortunately, previously 'R' was implemented for a recursion test.
4783 When this is compiled, we look for the named group 'R' first. At this
4784 point we just do a basic syntax check. */
4785
4786 case '(':
4787 if ((cd->ctypes[ptr[3]] & ctype_word) != 0)
4788 {
4789 ptr += 4;
4790 length += 3;
4791 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4792 if (*ptr != ')')
4793 {
4794 errorcode = ERR26;
4795 goto PCRE_ERROR_RETURN;
4796 }
4797 }
4798 else /* An assertion must follow */
4799 {
4800 ptr++; /* Can treat like ':' as far as spacing is concerned */
4801 if (ptr[2] != '?' ||
4802 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4803 {
4804 ptr += 2; /* To get right offset in message */
4805 errorcode = ERR28;
4806 goto PCRE_ERROR_RETURN;
4807 }
4808 }
4809 break;
4810
4811 /* Else loop checking valid options until ) is met. Anything else is an
4812 error. If we are without any brackets, i.e. at top level, the settings
4813 act as if specified in the options, so massage the options immediately.
4814 This is for backward compatibility with Perl 5.004. */
4815
4816 default:
4817 set = unset = 0;
4818 optset = &set;
4819 ptr += 2;
4820
4821 for (;; ptr++)
4822 {
4823 c = *ptr;
4824 switch (c)
4825 {
4826 case 'i':
4827 *optset |= PCRE_CASELESS;
4828 continue;
4829
4830 case 'J':
4831 *optset |= PCRE_DUPNAMES;
4832 options |= PCRE_JCHANGED; /* Record that it changed */
4833 continue;
4834
4835 case 'm':
4836 *optset |= PCRE_MULTILINE;
4837 continue;
4838
4839 case 's':
4840 *optset |= PCRE_DOTALL;
4841 continue;
4842
4843 case 'x':
4844 *optset |= PCRE_EXTENDED;
4845 continue;
4846
4847 case 'X':
4848 *optset |= PCRE_EXTRA;
4849 continue;
4850
4851 case 'U':
4852 *optset |= PCRE_UNGREEDY;
4853 continue;
4854
4855 case '-':
4856 optset = &unset;
4857 continue;
4858
4859 /* A termination by ')' indicates an options-setting-only item; if
4860 this is at the very start of the pattern (indicated by item_count
4861 being zero), we use it to set the global options. This is helpful
4862 when analyzing the pattern for first characters, etc. Otherwise
4863 nothing is done here and it is handled during the compiling
4864 process.
4865
4866 We allow for more than one options setting at the start. If such
4867 settings do not change the existing options, nothing is compiled.
4868 However, we must leave space just in case something is compiled.
4869 This can happen for pathological sequences such as (?i)(?-i)
4870 because the global options will end up with -i set. The space is
4871 small and not significant. (Before I did this there was a reported
4872 bug with (?i)(?-i) in a machine-generated pattern.)
4873
4874 [Historical note: Up to Perl 5.8, options settings at top level
4875 were always global settings, wherever they appeared in the pattern.
4876 That is, they were equivalent to an external setting. From 5.8
4877 onwards, they apply only to what follows (which is what you might
4878 expect).] */
4879
4880 case ')':
4881 if (item_count == 0)
4882 {
4883 options = (options | set) & (~unset);
4884 set = unset = 0; /* To save length */
4885 item_count--; /* To allow for several */
4886 length += 2;
4887 }
4888
4889 /* Fall through */
4890
4891 /* A termination by ':' indicates the start of a nested group with
4892 the given options set. This is again handled at compile time, but
4893 we must allow for compiled space if any of the ims options are
4894 set. We also have to allow for resetting space at the end of
4895 the group, which is why 4 is added to the length and not just 2.
4896 If there are several changes of options within the same group, this
4897 will lead to an over-estimate on the length, but this shouldn't
4898 matter very much. We also have to allow for resetting options at
4899 the start of any alternations, which we do by setting
4900 branch_newextra to 2. */
4901
4902 case ':':
4903 if (((set|unset) & PCRE_IMS) != 0)
4904 {
4905 length += 4;
4906 branch_newextra = 2;
4907 }
4908 goto END_OPTIONS;
4909
4910 /* Unrecognized option character */
4911
4912 default:
4913 errorcode = ERR12;
4914 goto PCRE_ERROR_RETURN;
4915 }
4916 }
4917
4918 /* If we hit a closing bracket, that's it - this is a freestanding
4919 option-setting. We need to ensure that branch_extra is updated if
4920 necessary. The only values branch_newextra can have here are 0 or 2.
4921 If the value is 2, then branch_extra must either be 2 or 5, depending
4922 on whether this is a lookbehind group or not. */
4923
4924 END_OPTIONS:
4925 if (c == ')')
4926 {
4927 if (branch_newextra == 2 &&
4928 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4929 branch_extra += branch_newextra;
4930 continue;
4931 }
4932
4933 /* If options were terminated by ':' control comes here. This is a
4934 non-capturing group with an options change. There is nothing more that
4935 needs to be done because "capturing" is already set FALSE by default;
4936 we can just fall through. */
4937
4938 }
4939 }
4940
4941 /* Ordinary parentheses, not followed by '?', are capturing unless
4942 PCRE_NO_AUTO_CAPTURE is set. */
4943
4944 else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
4945
4946 /* Capturing brackets must be counted so we can process escapes in a
4947 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
4948 an additional 3 bytes of memory per capturing bracket. */
4949
4950 if (capturing)
4951 {
4952 bracount++;
4953 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4954 }
4955
4956 /* Save length for computing whole length at end if there's a repeat that
4957 requires duplication of the group. Also save the current value of
4958 branch_extra, and start the new group with the new value. If non-zero, this
4959 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4960
4961 if (brastackptr >= sizeof(brastack)/sizeof(int))
4962 {
4963 errorcode = ERR19;
4964 goto PCRE_ERROR_RETURN;
4965 }
4966
4967 bralenstack[brastackptr] = branch_extra;
4968 branch_extra = branch_newextra;
4969
4970 brastack[brastackptr++] = length;
4971 length += bracket_length;
4972 continue;
4973
4974 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4975 have to replicate this bracket up to that many times. If brastackptr is
4976 0 this is an unmatched bracket which will generate an error, but take care
4977 not to try to access brastack[-1] when computing the length and restoring
4978 the branch_extra value. */
4979
4980 case ')':
4981 length += 1 + LINK_SIZE;
4982 if (brastackptr > 0)
4983 {
4984 duplength = length - brastack[--brastackptr];
4985 branch_extra = bralenstack[brastackptr];
4986 /* This is a paranoid check to stop integer overflow later on */
4987 if (duplength > MAX_DUPLENGTH)
4988 {
4989 errorcode = ERR50;
4990 goto PCRE_ERROR_RETURN;
4991 }
4992 }
4993 else duplength = 0;
4994
4995 /* The following code is also used when a recursion such as (?3) is
4996 followed by a quantifier, because in that case, it has to be wrapped inside
4997 brackets so that the quantifier works. The value of duplength must be
4998 set before arrival. */
4999
5000 HANDLE_QUANTIFIED_BRACKETS:
5001
5002 /* Leave ptr at the final char; for read_repeat_counts this happens
5003 automatically; for the others we need an increment. */
5004
5005 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5006 {
5007 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
5008 if (errorcode != 0) goto PCRE_ERROR_RETURN;
5009 }
5010 else if (c == '*') { min = 0; max = -1; ptr++; }
5011 else if (c == '+') { min = 1; max = -1; ptr++; }
5012 else if (c == '?') { min = 0; max = 1; ptr++; }
5013 else { min = 1; max = 1; }
5014
5015 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5016 group, and if the maximum is greater than zero, we have to replicate
5017 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5018 bracket set. */
5019
5020 if (min == 0)
5021 {
5022 length++;
5023 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5024 }
5025
5026 /* When the minimum is greater than zero, we have to replicate up to
5027 minval-1 times, with no additions required in the copies. Then, if there
5028 is a limited maximum we have to replicate up to maxval-1 times allowing
5029 for a BRAZERO item before each optional copy and nesting brackets for all
5030 but one of the optional copies. */
5031
5032 else
5033 {
5034 length += (min - 1) * duplength;
5035 if (max > min) /* Need this test as max=-1 means no limit */
5036 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5037 - (2 + 2*LINK_SIZE);
5038 }
5039
5040 /* Allow space for once brackets for "possessive quantifier" */
5041
5042 if (ptr[1] == '+')
5043 {
5044 ptr++;
5045 length += 2 + 2*LINK_SIZE;
5046 }
5047 continue;
5048
5049 /* Non-special character. It won't be space or # in extended mode, so it is
5050 always a genuine character. If we are in a \Q...\E sequence, check for the
5051 end; if not, we have a literal. */
5052
5053 default:
5054 NORMAL_CHAR:
5055
5056 if (inescq && c == '\\' && ptr[1] == 'E')
5057 {
5058 inescq = FALSE;
5059 ptr++;
5060 continue;
5061 }
5062
5063 length += 2; /* For a one-byte character */
5064 lastitemlength = 1; /* Default length of last item for repeats */
5065
5066 /* In UTF-8 mode, check for additional bytes. */
5067
5068 #ifdef SUPPORT_UTF8
5069 if (utf8 && (c & 0xc0) == 0xc0)
5070 {
5071 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5072 { /* because the end is marked */
5073 lastitemlength++; /* by a zero byte. */
5074 length++;
5075 ptr++;
5076 }
5077 }
5078 #endif
5079
5080 continue;
5081 }
5082 }
5083
5084 length += 2 + LINK_SIZE; /* For final KET and END */
5085
5086 if ((options & PCRE_AUTO_CALLOUT) != 0)
5087 length += 2 + 2*LINK_SIZE; /* For final callout */
5088
5089 if (length > MAX_PATTERN_SIZE)
5090 {
5091 errorcode = ERR20;
5092 goto PCRE_EARLY_ERROR_RETURN;
5093 }
5094
5095 /* Compute the size of data block needed and get it, either from malloc or
5096 externally provided function. Integer overflow should no longer be possible
5097 because nowadays we limit the maximum value of name_count and max_name size. */
5098
5099 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5100 re = (real_pcre *)(pcre_malloc)(size);
5101
5102 if (re == NULL)
5103 {
5104 errorcode = ERR21;
5105 goto PCRE_EARLY_ERROR_RETURN;
5106 }
5107
5108 /* Put in the magic number, and save the sizes, options, and character table
5109 pointer. NULL is used for the default character tables. The nullpad field is at
5110 the end; it's there to help in the case when a regex compiled on a system with
5111 4-byte pointers is run on another with 8-byte pointers. */
5112
5113 re->magic_number = MAGIC_NUMBER;
5114 re->size = size;
5115 re->options = options;
5116 re->dummy1 = 0;
5117 re->name_table_offset = sizeof(real_pcre);
5118 re->name_entry_size = max_name_size + 3;
5119 re->name_count = name_count;
5120 re->ref_count = 0;
5121 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5122 re->nullpad = NULL;
5123
5124 /* The starting points of the name/number translation table and of the code are
5125 passed around in the compile data block. */
5126
5127 cd->names_found = 0;
5128 cd->name_entry_size = max_name_size + 3;
5129 cd->name_table = (uschar *)re + re->name_table_offset;
5130 codestart = cd->name_table + re->name_entry_size * re->name_count;
5131 cd->start_code = codestart;
5132 cd->start_pattern = (const uschar *)pattern;
5133 cd->req_varyopt = 0;
5134 cd->nopartial = FALSE;
5135
5136 /* Set up a starting, non-extracting bracket, then compile the expression. On
5137 error, errorcode will be set non-zero, so we don't need to look at the result
5138 of the function here. */
5139
5140 ptr = (const uschar *)pattern;
5141 code = (uschar *)codestart;
5142 *code = OP_BRA;
5143 bracount = 0;
5144 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5145 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd);
5146 re->top_bracket = bracount;
5147 re->top_backref = cd->top_backref;
5148
5149 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5150
5151 /* If not reached end of pattern on success, there's an excess bracket. */
5152
5153 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5154
5155 /* Fill in the terminating state and check for disastrous overflow, but
5156 if debugging, leave the test till after things are printed out. */
5157
5158 *code++ = OP_END;
5159
5160 #ifndef DEBUG
5161 if (code - codestart > length) errorcode = ERR23;
5162 #endif
5163
5164 /* Give an error if there's back reference to a non-existent capturing
5165 subpattern. */
5166
5167 if (re->top_backref > re->top_bracket) errorcode = ERR15;
5168
5169 /* Failed to compile, or error while post-processing */
5170
5171 if (errorcode != 0)
5172 {
5173 (pcre_free)(re);
5174 PCRE_ERROR_RETURN:
5175 *erroroffset = ptr - (const uschar *)pattern;
5176 PCRE_EARLY_ERROR_RETURN:
5177 *errorptr = error_texts[errorcode];
5178 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5179 return NULL;
5180 }
5181
5182 /* If the anchored option was not passed, set the flag if we can determine that
5183 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5184 as starting with .* when DOTALL is set).
5185
5186 Otherwise, if we know what the first character has to be, save it, because that
5187 speeds up unanchored matches no end. If not, see if we can set the
5188 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5189 start with ^. and also when all branches start with .* for non-DOTALL matches.
5190 */
5191
5192 if ((options & PCRE_ANCHORED) == 0)
5193 {
5194 int temp_options = options;
5195 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5196 re->options |= PCRE_ANCHORED;
5197 else
5198 {
5199 if (firstbyte < 0)
5200 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5201 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5202 {
5203 int ch = firstbyte & 255;
5204 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5205 cd->fcc[ch] == ch)? ch : firstbyte;
5206 re->options |= PCRE_FIRSTSET;
5207 }
5208 else if (is_startline(codestart, 0, cd->backref_map))
5209 re->options |= PCRE_STARTLINE;
5210 }
5211 }
5212
5213 /* For an anchored pattern, we use the "required byte" only if it follows a
5214 variable length item in the regex. Remove the caseless flag for non-caseable
5215 bytes. */
5216
5217 if (reqbyte >= 0 &&
5218 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5219 {
5220 int ch = reqbyte & 255;
5221 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5222 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5223 re->options |= PCRE_REQCHSET;
5224 }
5225
5226 /* Print out the compiled data if debugging is enabled. This is never the
5227 case when building a production library. */
5228
5229 #ifdef DEBUG
5230
5231 printf("Length = %d top_bracket = %d top_backref = %d\n",
5232 length, re->top_bracket, re->top_backref);
5233
5234 if (re->options != 0)
5235 {
5236 printf("%s%s%s%s%s%s%s%s%s\n",
5237 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5238 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5239 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5240 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5241 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5242 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5243 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5244 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5245 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5246 }
5247
5248 if ((re->options & PCRE_FIRSTSET) != 0)
5249 {
5250 int ch = re->first_byte & 255;
5251 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5252 "" : " (caseless)";
5253 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5254 else printf("First char = \\x%02x%s\n", ch, caseless);
5255 }
5256
5257 if ((re->options & PCRE_REQCHSET) != 0)
5258 {
5259 int ch = re->req_byte & 255;
5260 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5261 "" : " (caseless)";
5262 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5263 else printf("Req char = \\x%02x%s\n", ch, caseless);
5264 }
5265
5266 pcre_printint(re, stdout);
5267
5268 /* This check is done here in the debugging case so that the code that
5269 was compiled can be seen. */
5270
5271 if (code - codestart > length)
5272 {
5273 (pcre_free)(re);
5274 *errorptr = error_texts[ERR23];
5275 *erroroffset = ptr - (uschar *)pattern;
5276 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5277 return NULL;
5278 }
5279 #endif
5280
5281 return (pcre *)re;
5282 }
5283
5284 /* End of pcre_compile.c */