Installed PCRE release 7.0.
[exim.git] / src / src / pcre / pcre_compile.c
1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
11 Copyright (c) 1997-2006 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
45
46
/* Names used by this module for its newline block and for the start and end
of the pattern being processed. NOTE(review): these are defined immediately
before pcre_internal.h is included, presumably because macros in that header
expand them - confirm against the header. */

#define NLBLOCK cd             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing processed string start */
#define PSEND   end_pattern    /* Field containing processed string end */
50
51
52 #include "pcre_internal.h"
53
54
55 /* When DEBUG is defined, we need the pcre_printint() function, which is also
56 used by pcretest. DEBUG is not defined when building a production library. */
57
58 #ifdef DEBUG
59 #include "pcre_printint.src"
60 #endif
61
62
63 /*************************************************
64 * Code parameters and static tables *
65 *************************************************/
66
/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218
(presumably in the same units as this value - TODO confirm), so this number is
very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)
80
81
82 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
83 are simple data values; negative values are for special things like \d and so
84 on. Zero means further processing is needed (for things like \x), or the escape
85 is invalid. */
86
87 #if !EBCDIC /* This is the "normal" table for ASCII systems */
88 static const short int escapes[] = {
89 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
90 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
91 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
92 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
93 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
94 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
95 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
96 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
97 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
98 0, 0, -ESC_z /* x - z */
99 };
100
101 #else /* This is the "abnormal" table for EBCDIC systems */
102 static const short int escapes[] = {
103 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
104 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
105 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
106 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
107 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
108 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
109 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
110 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
111 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
112 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
113 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
114 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
115 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
116 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
117 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
118 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
119 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
120 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
121 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
122 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
123 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
124 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
125 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
126 };
127 #endif
128
129
130 /* Tables of names of POSIX character classes and their lengths. The list is
131 terminated by a zero length entry. The first three must be alpha, lower, upper,
132 as this is assumed for handling case independence. */
133
134 static const char *const posix_names[] = {
135 "alpha", "lower", "upper",
136 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
137 "print", "punct", "space", "word", "xdigit" };
138
139 static const uschar posix_name_lengths[] = {
140 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
141
142 /* Table of class bit maps for each POSIX class. Each class is formed from a
143 base map, with an optional addition or removal of another map. Then, for some
144 classes, there is some additional tweaking: for [:blank:] the vertical space
145 characters are removed, and for [:alpha:] and [:alnum:] the underscore
146 character is removed. The triples in the table consist of the base map offset,
147 second map offset or -1 if no second map, and a non-negative value for map
148 addition or a negative value for map subtraction (if there are two maps). The
149 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
150 remove vertical space characters, 2 => remove underscore. */
151
152 static const int posix_class_maps[] = {
153 cbit_word, cbit_digit, -2, /* alpha */
154 cbit_lower, -1, 0, /* lower */
155 cbit_upper, -1, 0, /* upper */
156 cbit_word, -1, 2, /* alnum - word without underscore */
157 cbit_print, cbit_cntrl, 0, /* ascii */
158 cbit_space, -1, 1, /* blank - a GNU extension */
159 cbit_cntrl, -1, 0, /* cntrl */
160 cbit_digit, -1, 0, /* digit */
161 cbit_graph, -1, 0, /* graph */
162 cbit_print, -1, 0, /* print */
163 cbit_punct, -1, 0, /* punct */
164 cbit_space, -1, 0, /* space */
165 cbit_word, -1, 0, /* word - a Perl extension */
166 cbit_xdigit,-1, 0 /* xdigit */
167 };
168
169
/* Two-level stringification, so that a macro argument is expanded before it is
turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. The
strings are "char *" because they are passed to the outside world. Do not ever
re-use any error number, because they are documented. Always add a new error
instead. Messages marked DEAD below are no longer used.

The array itself is const as well as the strings (as for posix_names), so that
the table cannot be modified at run time. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by an (optionally braced) non-zero number"
};
249
250
/* Private table that identifies decimal and hexadecimal digits while a pattern
is being compiled. The tables in chartables depend on the locale and may mark
arbitrary characters as digits, but the compiling code must recognize only 0-9
as decimal digits and only 0-9, a-f, and A-F as hexadecimal digits. Hence this
private, locale-independent table. It costs 256 bytes, but it is a lot faster
than doing character value tests (at least in some simple cases that were
timed), and some applications want PCRE to compile efficiently as well as match
efficiently.

The bit values are the same as those used in chartables, so that ctype_digit
and ctype_xdigit can be used to test the entries:

  0x04  decimal digit
  0x08  hexadecimal digit

Each row below holds the entries for 16 consecutive character codes, starting
at the code given in the comment. */

#if !EBCDIC    /* The "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,  /* 0 - 9 */
  /* 0x40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* A - F */
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* a - f */
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else          /* The "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* a - f */
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* A - F */
  /* 0xD0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00   /* 0 - 9 */
  };

/* Partial duplicate of the chartable for EBCDIC systems. check_escape() tests
entries against 0x0E to decide whether a character is alphameric. */

static const unsigned char ebcdic_chartab[] =
  {
  /* 0x00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };
#endif
373
374
/* Forward declaration of compile_regex(), to allow mutual recursion: it lets
functions that appear before its definition call it. The definition is not in
this part of the file, so the meaning of each parameter is documented there. */

static BOOL
  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
    int *, branch_chain *, compile_data *, int *);
380
381
382
383 /*************************************************
384 * Handle escapes *
385 *************************************************/
386
387 /* This function is called when a \ has been encountered. It either returns a
388 positive value for a simple escape such as \n, or a negative value which
389 encodes one of the more complicated things such as \d. A backreference to group
390 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
391 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
392 ptr is pointing at the \. On exit, it is on the final character of the escape
393 sequence.
394
395 Arguments:
396 ptrptr points to the pattern position pointer
397 errorcodeptr points to the errorcode variable
398 bracount number of previous extracting brackets
399 options the options bits
400 isclass TRUE if inside a character class
401
402 Returns: zero or positive => a data character
403 negative => a special escape sequence
404 on error, errorptr is set
405 */
406
407 static int
408 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
409 int options, BOOL isclass)
410 {
411 BOOL utf8 = (options & PCRE_UTF8) != 0;
412 const uschar *ptr = *ptrptr + 1;
413 int c, i;
414
415 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
416 ptr--; /* Set pointer back to the last byte */
417
418 /* If backslash is at the end of the pattern, it's an error. */
419
420 if (c == 0) *errorcodeptr = ERR1;
421
422 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
423 a table. A non-zero result is something that can be returned immediately.
424 Otherwise further processing may be required. */
425
426 #if !EBCDIC /* ASCII coding */
427 else if (c < '0' || c > 'z') {} /* Not alphameric */
428 else if ((i = escapes[c - '0']) != 0) c = i;
429
430 #else /* EBCDIC coding */
431 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
432 else if ((i = escapes[c - 0x48]) != 0) c = i;
433 #endif
434
435 /* Escapes that need further processing, or are illegal. */
436
437 else
438 {
439 const uschar *oldptr;
440 BOOL braced, negated;
441
442 switch (c)
443 {
444 /* A number of Perl escapes are not handled by PCRE. We give an explicit
445 error. */
446
447 case 'l':
448 case 'L':
449 case 'N':
450 case 'u':
451 case 'U':
452 *errorcodeptr = ERR37;
453 break;
454
455 /* \g must be followed by a number, either plain or braced. If positive, it
456 is an absolute backreference. If negative, it is a relative backreference.
457 This is a Perl 5.10 feature. */
458
459 case 'g':
460 if (ptr[1] == '{')
461 {
462 braced = TRUE;
463 ptr++;
464 }
465 else braced = FALSE;
466
467 if (ptr[1] == '-')
468 {
469 negated = TRUE;
470 ptr++;
471 }
472 else negated = FALSE;
473
474 c = 0;
475 while ((digitab[ptr[1]] & ctype_digit) != 0)
476 c = c * 10 + *(++ptr) - '0';
477
478 if (c == 0 || (braced && *(++ptr) != '}'))
479 {
480 *errorcodeptr = ERR57;
481 return 0;
482 }
483
484 if (negated)
485 {
486 if (c > bracount)
487 {
488 *errorcodeptr = ERR15;
489 return 0;
490 }
491 c = bracount - (c - 1);
492 }
493
494 c = -(ESC_REF + c);
495 break;
496
497 /* The handling of escape sequences consisting of a string of digits
498 starting with one that is not zero is not straightforward. By experiment,
499 the way Perl works seems to be as follows:
500
501 Outside a character class, the digits are read as a decimal number. If the
502 number is less than 10, or if there are that many previous extracting
503 left brackets, then it is a back reference. Otherwise, up to three octal
504 digits are read to form an escaped byte. Thus \123 is likely to be octal
505 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
506 value is greater than 377, the least significant 8 bits are taken. Inside a
507 character class, \ followed by a digit is always an octal number. */
508
509 case '1': case '2': case '3': case '4': case '5':
510 case '6': case '7': case '8': case '9':
511
512 if (!isclass)
513 {
514 oldptr = ptr;
515 c -= '0';
516 while ((digitab[ptr[1]] & ctype_digit) != 0)
517 c = c * 10 + *(++ptr) - '0';
518 if (c < 10 || c <= bracount)
519 {
520 c = -(ESC_REF + c);
521 break;
522 }
523 ptr = oldptr; /* Put the pointer back and fall through */
524 }
525
526 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
527 generates a binary zero byte and treats the digit as a following literal.
528 Thus we have to pull back the pointer by one. */
529
530 if ((c = *ptr) >= '8')
531 {
532 ptr--;
533 c = 0;
534 break;
535 }
536
537 /* \0 always starts an octal number, but we may drop through to here with a
538 larger first octal digit. The original code used just to take the least
539 significant 8 bits of octal numbers (I think this is what early Perls used
540 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
541 than 3 octal digits. */
542
543 case '0':
544 c -= '0';
545 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
546 c = c * 8 + *(++ptr) - '0';
547 if (!utf8 && c > 255) *errorcodeptr = ERR51;
548 break;
549
550 /* \x is complicated. \x{ddd} is a character number which can be greater
551 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
552 treated as a data character. */
553
554 case 'x':
555 if (ptr[1] == '{')
556 {
557 const uschar *pt = ptr + 2;
558 int count = 0;
559
560 c = 0;
561 while ((digitab[*pt] & ctype_xdigit) != 0)
562 {
563 register int cc = *pt++;
564 if (c == 0 && cc == '0') continue; /* Leading zeroes */
565 count++;
566
567 #if !EBCDIC /* ASCII coding */
568 if (cc >= 'a') cc -= 32; /* Convert to upper case */
569 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
570 #else /* EBCDIC coding */
571 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
572 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
573 #endif
574 }
575
576 if (*pt == '}')
577 {
578 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
579 ptr = pt;
580 break;
581 }
582
583 /* If the sequence of hex digits does not end with '}', then we don't
584 recognize this construct; fall through to the normal \x handling. */
585 }
586
587 /* Read just a single-byte hex-defined char */
588
589 c = 0;
590 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
591 {
592 int cc; /* Some compilers don't like ++ */
593 cc = *(++ptr); /* in initializers */
594 #if !EBCDIC /* ASCII coding */
595 if (cc >= 'a') cc -= 32; /* Convert to upper case */
596 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
597 #else /* EBCDIC coding */
598 if (cc <= 'z') cc += 64; /* Convert to upper case */
599 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
600 #endif
601 }
602 break;
603
604 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
605 This coding is ASCII-specific, but then the whole concept of \cx is
606 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
607
608 case 'c':
609 c = *(++ptr);
610 if (c == 0)
611 {
612 *errorcodeptr = ERR2;
613 return 0;
614 }
615
616 #if !EBCDIC /* ASCII coding */
617 if (c >= 'a' && c <= 'z') c -= 32;
618 c ^= 0x40;
619 #else /* EBCDIC coding */
620 if (c >= 'a' && c <= 'z') c += 64;
621 c ^= 0xC0;
622 #endif
623 break;
624
625 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
626 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
627 for Perl compatibility, it is a literal. This code looks a bit odd, but
628 there used to be some cases other than the default, and there may be again
629 in future, so I haven't "optimized" it. */
630
631 default:
632 if ((options & PCRE_EXTRA) != 0) switch(c)
633 {
634 default:
635 *errorcodeptr = ERR3;
636 break;
637 }
638 break;
639 }
640 }
641
642 *ptrptr = ptr;
643 return c;
644 }
645
646
647
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. It extracts the property
name, which is either a single character or a name in {} that may be preceded
by ^ for negation, and looks it up in the _pcre_utt table. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (it is not set if the pattern ends directly after the P
                   or p)
  dptr           points to an int that is set to the detailed property value
                   when the name is recognized
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence or ERR47 for an unknown name

Returns:         the type field of the matching _pcre_utt entry, or -1 if the
                   sequence is malformed or the name is not recognized
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The simple form: just one following character */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form: copy the name, which must fit in the buffer and must be
terminated by } before the end of the pattern. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0 || ch == '}') break;
    name[len++] = ch;
    }

  if (ch != '}') goto MALFORMED;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop search of the property table for the name */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt[mid].name);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

*errorcodeptr = ERR47;
return -1;

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
737
738
739
740
741 /*************************************************
742 * Check for counted repeat *
743 *************************************************/
744
745 /* This function is called when a '{' is encountered in a place where it might
746 start a quantifier. It looks ahead to see if it really is a quantifier or not.
747 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
748 where the ddds are digits.
749
750 Arguments:
751 p pointer to the first char after '{'
752
753 Returns: TRUE or FALSE
754 */
755
756 static BOOL
757 is_counted_repeat(const uschar *p)
758 {
759 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
760 while ((digitab[*p] & ctype_digit) != 0) p++;
761 if (*p == '}') return TRUE;
762
763 if (*p++ != ',') return FALSE;
764 if (*p == '}') return TRUE;
765
766 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
767 while ((digitab[*p] & ctype_digit) != 0) p++;
768
769 return (*p == '}');
770 }
771
772
773
774 /*************************************************
775 * Read repeat counts *
776 *************************************************/
777
778 /* Read an item of the form {n,m} and return the values. This is called only
779 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
780 so the syntax is guaranteed to be correct, but we need to check the values.
781
782 Arguments:
783 p pointer to first char after '{'
784 minp pointer to int for min
785 maxp pointer to int for max
786 returned as -1 if no max
787 errorcodeptr points to error code variable
788
789 Returns: pointer to '}' on success;
790 current ptr on error, with errorcodeptr set non-zero
791 */
792
793 static const uschar *
794 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
795 {
796 int min = 0;
797 int max = -1;
798
799 /* Read the minimum value and do a paranoid check: a negative value indicates
800 an integer overflow. */
801
802 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
803 if (min < 0 || min > 65535)
804 {
805 *errorcodeptr = ERR5;
806 return p;
807 }
808
809 /* Read the maximum value if there is one, and again do a paranoid on its size.
810 Also, max must not be less than min. */
811
812 if (*p == '}') max = min; else
813 {
814 if (*(++p) != '}')
815 {
816 max = 0;
817 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
818 if (max < 0 || max > 65535)
819 {
820 *errorcodeptr = ERR5;
821 return p;
822 }
823 if (max < min)
824 {
825 *errorcodeptr = ERR4;
826 return p;
827 }
828 }
829 }
830
831 /* Fill in the required variables, and pass back the pointer to the terminating
832 '}'. */
833
834 *minp = min;
835 *maxp = max;
836 return p;
837 }
838
839
840
841 /*************************************************
842 * Find forward referenced subpattern *
843 *************************************************/
844
845 /* This function scans along a pattern's text looking for capturing
846 subpatterns, and counting them. If it finds a named pattern that matches the
847 name it is given, it returns its number. Alternatively, if the name is NULL, it
848 returns when it reaches a given numbered subpattern. This is used for forward
849 references to subpatterns. We know that if (?P< is encountered, the name will
850 be terminated by '>' because that is checked in the first pass.
851
852 Arguments:
853 ptr current position in the pattern
854 count current count of capturing parens so far encountered
855 name name to seek, or NULL if seeking a numbered subpattern
856 lorn name length, or subpattern number if name is NULL
857 xmode TRUE if we are in /x mode
858
859 Returns: the number of the named subpattern, or -1 if not found
860 */
861
862 static int
863 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
864 BOOL xmode)
865 {
866 const uschar *thisname;
867
868 for (; *ptr != 0; ptr++)
869 {
870 int term;
871
872 /* Skip over backslashed characters and also entire \Q...\E */
873
874 if (*ptr == '\\')
875 {
876 if (*(++ptr) == 0) return -1;
877 if (*ptr == 'Q') for (;;)
878 {
879 while (*(++ptr) != 0 && *ptr != '\\');
880 if (*ptr == 0) return -1;
881 if (*(++ptr) == 'E') break;
882 }
883 continue;
884 }
885
886 /* Skip over character classes */
887
888 if (*ptr == '[')
889 {
890 while (*(++ptr) != ']')
891 {
892 if (*ptr == '\\')
893 {
894 if (*(++ptr) == 0) return -1;
895 if (*ptr == 'Q') for (;;)
896 {
897 while (*(++ptr) != 0 && *ptr != '\\');
898 if (*ptr == 0) return -1;
899 if (*(++ptr) == 'E') break;
900 }
901 continue;
902 }
903 }
904 continue;
905 }
906
907 /* Skip comments in /x mode */
908
909 if (xmode && *ptr == '#')
910 {
911 while (*(++ptr) != 0 && *ptr != '\n');
912 if (*ptr == 0) return -1;
913 continue;
914 }
915
916 /* An opening parens must now be a real metacharacter */
917
918 if (*ptr != '(') continue;
919 if (ptr[1] != '?')
920 {
921 count++;
922 if (name == NULL && count == lorn) return count;
923 continue;
924 }
925
926 ptr += 2;
927 if (*ptr == 'P') ptr++; /* Allow optional P */
928
929 /* We have to disambiguate (?<! and (?<= from (?<name> */
930
931 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
932 *ptr != '\'')
933 continue;
934
935 count++;
936
937 if (name == NULL && count == lorn) return count;
938 term = *ptr++;
939 if (term == '<') term = '>';
940 thisname = ptr;
941 while (*ptr != term) ptr++;
942 if (name != NULL && lorn == ptr - thisname &&
943 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
944 return count;
945 }
946
947 return -1;
948 }
949
950
951
952 /*************************************************
953 * Find first significant op code *
954 *************************************************/
955
956 /* This is called by several functions that scan a compiled expression looking
957 for a fixed first character, or an anchoring op code etc. It skips over things
958 that do not influence this. For some calls, a change of option is important.
959 For some calls, it makes sense to skip negative forward and all backward
960 assertions, and also the \b assertion; for others it does not.
961
962 Arguments:
963 code pointer to the start of the group
964 options pointer to external options
965 optbit the option bit whose changing is significant, or
966 zero if none are
967 skipassert TRUE if certain assertions are to be skipped
968
969 Returns: pointer to the first significant opcode
970 */
971
972 static const uschar*
973 first_significant_code(const uschar *code, int *options, int optbit,
974 BOOL skipassert)
975 {
976 for (;;)
977 {
978 switch ((int)*code)
979 {
980 case OP_OPT:
981 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
982 *options = (int)code[1];
983 code += 2;
984 break;
985
986 case OP_ASSERT_NOT:
987 case OP_ASSERTBACK:
988 case OP_ASSERTBACK_NOT:
989 if (!skipassert) return code;
990 do code += GET(code, 1); while (*code == OP_ALT);
991 code += _pcre_OP_lengths[*code];
992 break;
993
994 case OP_WORD_BOUNDARY:
995 case OP_NOT_WORD_BOUNDARY:
996 if (!skipassert) return code;
997 /* Fall through */
998
999 case OP_CALLOUT:
1000 case OP_CREF:
1001 case OP_RREF:
1002 case OP_DEF:
1003 code += _pcre_OP_lengths[*code];
1004 break;
1005
1006 default:
1007 return code;
1008 }
1009 }
1010 /* Control never reaches here */
1011 }
1012
1013
1014
1015
1016 /*************************************************
1017 * Find the fixed length of a pattern *
1018 *************************************************/
1019
1020 /* Scan a pattern and compute the fixed length of subject that will match it,
1021 if the length is fixed. This is needed for dealing with backward assertions.
1022 In UTF8 mode, the result is in characters rather than bytes.
1023
1024 Arguments:
1025 code points to the start of the pattern (the bracket)
1026 options the compiling options
1027
1028 Returns: the fixed length, or -1 if there is no fixed length,
1029 or -2 if \C was encountered
1030 */
1031
1032 static int
1033 find_fixedlength(uschar *code, int options)
1034 {
1035 int length = -1;
1036
1037 register int branchlength = 0;
1038 register uschar *cc = code + 1 + LINK_SIZE;
1039
1040 /* Scan along the opcodes for this branch. If we get to the end of the
1041 branch, check the length against that of the other branches. */
1042
1043 for (;;)
1044 {
1045 int d;
1046 register int op = *cc;
1047
1048 switch (op)
1049 {
1050 case OP_CBRA:
1051 case OP_BRA:
1052 case OP_ONCE:
1053 case OP_COND:
1054 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1055 if (d < 0) return d;
1056 branchlength += d;
1057 do cc += GET(cc, 1); while (*cc == OP_ALT);
1058 cc += 1 + LINK_SIZE;
1059 break;
1060
1061 /* Reached end of a branch; if it's a ket it is the end of a nested
1062 call. If it's ALT it is an alternation in a nested call. If it is
1063 END it's the end of the outer call. All can be handled by the same code. */
1064
1065 case OP_ALT:
1066 case OP_KET:
1067 case OP_KETRMAX:
1068 case OP_KETRMIN:
1069 case OP_END:
1070 if (length < 0) length = branchlength;
1071 else if (length != branchlength) return -1;
1072 if (*cc != OP_ALT) return length;
1073 cc += 1 + LINK_SIZE;
1074 branchlength = 0;
1075 break;
1076
1077 /* Skip over assertive subpatterns */
1078
1079 case OP_ASSERT:
1080 case OP_ASSERT_NOT:
1081 case OP_ASSERTBACK:
1082 case OP_ASSERTBACK_NOT:
1083 do cc += GET(cc, 1); while (*cc == OP_ALT);
1084 /* Fall through */
1085
1086 /* Skip over things that don't match chars */
1087
1088 case OP_REVERSE:
1089 case OP_CREF:
1090 case OP_RREF:
1091 case OP_DEF:
1092 case OP_OPT:
1093 case OP_CALLOUT:
1094 case OP_SOD:
1095 case OP_SOM:
1096 case OP_EOD:
1097 case OP_EODN:
1098 case OP_CIRC:
1099 case OP_DOLL:
1100 case OP_NOT_WORD_BOUNDARY:
1101 case OP_WORD_BOUNDARY:
1102 cc += _pcre_OP_lengths[*cc];
1103 break;
1104
1105 /* Handle literal characters */
1106
1107 case OP_CHAR:
1108 case OP_CHARNC:
1109 case OP_NOT:
1110 branchlength++;
1111 cc += 2;
1112 #ifdef SUPPORT_UTF8
1113 if ((options & PCRE_UTF8) != 0)
1114 {
1115 while ((*cc & 0xc0) == 0x80) cc++;
1116 }
1117 #endif
1118 break;
1119
1120 /* Handle exact repetitions. The count is already in characters, but we
1121 need to skip over a multibyte character in UTF8 mode. */
1122
1123 case OP_EXACT:
1124 branchlength += GET2(cc,1);
1125 cc += 4;
1126 #ifdef SUPPORT_UTF8
1127 if ((options & PCRE_UTF8) != 0)
1128 {
1129 while((*cc & 0x80) == 0x80) cc++;
1130 }
1131 #endif
1132 break;
1133
1134 case OP_TYPEEXACT:
1135 branchlength += GET2(cc,1);
1136 cc += 4;
1137 break;
1138
1139 /* Handle single-char matchers */
1140
1141 case OP_PROP:
1142 case OP_NOTPROP:
1143 cc += 2;
1144 /* Fall through */
1145
1146 case OP_NOT_DIGIT:
1147 case OP_DIGIT:
1148 case OP_NOT_WHITESPACE:
1149 case OP_WHITESPACE:
1150 case OP_NOT_WORDCHAR:
1151 case OP_WORDCHAR:
1152 case OP_ANY:
1153 branchlength++;
1154 cc++;
1155 break;
1156
1157 /* The single-byte matcher isn't allowed */
1158
1159 case OP_ANYBYTE:
1160 return -2;
1161
1162 /* Check a class for variable quantification */
1163
1164 #ifdef SUPPORT_UTF8
1165 case OP_XCLASS:
1166 cc += GET(cc, 1) - 33;
1167 /* Fall through */
1168 #endif
1169
1170 case OP_CLASS:
1171 case OP_NCLASS:
1172 cc += 33;
1173
1174 switch (*cc)
1175 {
1176 case OP_CRSTAR:
1177 case OP_CRMINSTAR:
1178 case OP_CRQUERY:
1179 case OP_CRMINQUERY:
1180 return -1;
1181
1182 case OP_CRRANGE:
1183 case OP_CRMINRANGE:
1184 if (GET2(cc,1) != GET2(cc,3)) return -1;
1185 branchlength += GET2(cc,1);
1186 cc += 5;
1187 break;
1188
1189 default:
1190 branchlength++;
1191 }
1192 break;
1193
1194 /* Anything else is variable length */
1195
1196 default:
1197 return -1;
1198 }
1199 }
1200 /* Control never gets here */
1201 }
1202
1203
1204
1205
1206 /*************************************************
1207 * Scan compiled regex for numbered bracket *
1208 *************************************************/
1209
1210 /* This little function scans through a compiled pattern until it finds a
1211 capturing bracket with the given number.
1212
1213 Arguments:
1214 code points to start of expression
1215 utf8 TRUE in UTF-8 mode
1216 number the required bracket number
1217
1218 Returns: pointer to the opcode for the bracket, or NULL if not found
1219 */
1220
1221 static const uschar *
1222 find_bracket(const uschar *code, BOOL utf8, int number)
1223 {
1224 for (;;)
1225 {
1226 register int c = *code;
1227 if (c == OP_END) return NULL;
1228
1229 /* XCLASS is used for classes that cannot be represented just by a bit
1230 map. This includes negated single high-valued characters. The length in
1231 the table is zero; the actual length is stored in the compiled code. */
1232
1233 if (c == OP_XCLASS) code += GET(code, 1);
1234
1235 /* Handle capturing bracket */
1236
1237 else if (c == OP_CBRA)
1238 {
1239 int n = GET2(code, 1+LINK_SIZE);
1240 if (n == number) return (uschar *)code;
1241 code += _pcre_OP_lengths[c];
1242 }
1243
1244 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1245 a multi-byte character. The length in the table is a minimum, so we have to
1246 arrange to skip the extra bytes. */
1247
1248 else
1249 {
1250 code += _pcre_OP_lengths[c];
1251 if (utf8) switch(c)
1252 {
1253 case OP_CHAR:
1254 case OP_CHARNC:
1255 case OP_EXACT:
1256 case OP_UPTO:
1257 case OP_MINUPTO:
1258 case OP_POSUPTO:
1259 case OP_STAR:
1260 case OP_MINSTAR:
1261 case OP_POSSTAR:
1262 case OP_PLUS:
1263 case OP_MINPLUS:
1264 case OP_POSPLUS:
1265 case OP_QUERY:
1266 case OP_MINQUERY:
1267 case OP_POSQUERY:
1268 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1269 break;
1270 }
1271 }
1272 }
1273 }
1274
1275
1276
1277 /*************************************************
1278 * Scan compiled regex for recursion reference *
1279 *************************************************/
1280
1281 /* This little function scans through a compiled pattern until it finds an
1282 instance of OP_RECURSE.
1283
1284 Arguments:
1285 code points to start of expression
1286 utf8 TRUE in UTF-8 mode
1287
1288 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1289 */
1290
1291 static const uschar *
1292 find_recurse(const uschar *code, BOOL utf8)
1293 {
1294 for (;;)
1295 {
1296 register int c = *code;
1297 if (c == OP_END) return NULL;
1298 if (c == OP_RECURSE) return code;
1299
1300 /* XCLASS is used for classes that cannot be represented just by a bit
1301 map. This includes negated single high-valued characters. The length in
1302 the table is zero; the actual length is stored in the compiled code. */
1303
1304 if (c == OP_XCLASS) code += GET(code, 1);
1305
1306 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1307 that are followed by a character may be followed by a multi-byte character.
1308 The length in the table is a minimum, so we have to arrange to skip the extra
1309 bytes. */
1310
1311 else
1312 {
1313 code += _pcre_OP_lengths[c];
1314 if (utf8) switch(c)
1315 {
1316 case OP_CHAR:
1317 case OP_CHARNC:
1318 case OP_EXACT:
1319 case OP_UPTO:
1320 case OP_MINUPTO:
1321 case OP_POSUPTO:
1322 case OP_STAR:
1323 case OP_MINSTAR:
1324 case OP_POSSTAR:
1325 case OP_PLUS:
1326 case OP_MINPLUS:
1327 case OP_POSPLUS:
1328 case OP_QUERY:
1329 case OP_MINQUERY:
1330 case OP_POSQUERY:
1331 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1332 break;
1333 }
1334 }
1335 }
1336 }
1337
1338
1339
1340 /*************************************************
1341 * Scan compiled branch for non-emptiness *
1342 *************************************************/
1343
1344 /* This function scans through a branch of a compiled pattern to see whether it
1345 can match the empty string or not. It is called from could_be_empty()
1346 below and from compile_branch() when checking for an unlimited repeat of a
1347 group that can match nothing. Note that first_significant_code() skips over
1348 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1349 struck an inner bracket whose current branch will already have been scanned.
1350
1351 Arguments:
1352 code points to start of search
1353 endcode points to where to stop
1354 utf8 TRUE if in UTF8 mode
1355
1356 Returns: TRUE if what is matched could be empty
1357 */
1358
1359 static BOOL
1360 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1361 {
1362 register int c;
1363 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1364 code < endcode;
1365 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1366 {
1367 const uschar *ccode;
1368
1369 c = *code;
1370
1371 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1372 {
1373 BOOL empty_branch;
1374 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1375
1376 /* Scan a closed bracket */
1377
1378 empty_branch = FALSE;
1379 do
1380 {
1381 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1382 empty_branch = TRUE;
1383 code += GET(code, 1);
1384 }
1385 while (*code == OP_ALT);
1386 if (!empty_branch) return FALSE; /* All branches are non-empty */
1387
1388 /* Move past the KET and fudge things so that the increment in the "for"
1389 above has no effect. */
1390
1391 c = OP_END;
1392 code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1393 continue;
1394 }
1395
1396 /* Handle the other opcodes */
1397
1398 switch (c)
1399 {
1400 /* Check for quantifiers after a class */
1401
1402 #ifdef SUPPORT_UTF8
1403 case OP_XCLASS:
1404 ccode = code + GET(code, 1);
1405 goto CHECK_CLASS_REPEAT;
1406 #endif
1407
1408 case OP_CLASS:
1409 case OP_NCLASS:
1410 ccode = code + 33;
1411
1412 #ifdef SUPPORT_UTF8
1413 CHECK_CLASS_REPEAT:
1414 #endif
1415
1416 switch (*ccode)
1417 {
1418 case OP_CRSTAR: /* These could be empty; continue */
1419 case OP_CRMINSTAR:
1420 case OP_CRQUERY:
1421 case OP_CRMINQUERY:
1422 break;
1423
1424 default: /* Non-repeat => class must match */
1425 case OP_CRPLUS: /* These repeats aren't empty */
1426 case OP_CRMINPLUS:
1427 return FALSE;
1428
1429 case OP_CRRANGE:
1430 case OP_CRMINRANGE:
1431 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1432 break;
1433 }
1434 break;
1435
1436 /* Opcodes that must match a character */
1437
1438 case OP_PROP:
1439 case OP_NOTPROP:
1440 case OP_EXTUNI:
1441 case OP_NOT_DIGIT:
1442 case OP_DIGIT:
1443 case OP_NOT_WHITESPACE:
1444 case OP_WHITESPACE:
1445 case OP_NOT_WORDCHAR:
1446 case OP_WORDCHAR:
1447 case OP_ANY:
1448 case OP_ANYBYTE:
1449 case OP_CHAR:
1450 case OP_CHARNC:
1451 case OP_NOT:
1452 case OP_PLUS:
1453 case OP_MINPLUS:
1454 case OP_POSPLUS:
1455 case OP_EXACT:
1456 case OP_NOTPLUS:
1457 case OP_NOTMINPLUS:
1458 case OP_NOTPOSPLUS:
1459 case OP_NOTEXACT:
1460 case OP_TYPEPLUS:
1461 case OP_TYPEMINPLUS:
1462 case OP_TYPEPOSPLUS:
1463 case OP_TYPEEXACT:
1464 return FALSE;
1465
1466 /* End of branch */
1467
1468 case OP_KET:
1469 case OP_KETRMAX:
1470 case OP_KETRMIN:
1471 case OP_ALT:
1472 return TRUE;
1473
1474 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1475 MINUPTO, and POSUPTO may be followed by a multibyte character */
1476
1477 #ifdef SUPPORT_UTF8
1478 case OP_STAR:
1479 case OP_MINSTAR:
1480 case OP_POSSTAR:
1481 case OP_QUERY:
1482 case OP_MINQUERY:
1483 case OP_POSQUERY:
1484 case OP_UPTO:
1485 case OP_MINUPTO:
1486 case OP_POSUPTO:
1487 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1488 break;
1489 #endif
1490 }
1491 }
1492
1493 return TRUE;
1494 }
1495
1496
1497
1498 /*************************************************
1499 * Scan compiled regex for non-emptiness *
1500 *************************************************/
1501
1502 /* This function is called to check for left recursive calls. We want to check
1503 the current branch of the current pattern to see if it could match the empty
1504 string. If it could, we must look outwards for branches at other levels,
1505 stopping when we pass beyond the bracket which is the subject of the recursion.
1506
1507 Arguments:
1508 code points to start of the recursion
1509 endcode points to where to stop (current RECURSE item)
1510 bcptr points to the chain of current (unclosed) branch starts
1511 utf8 TRUE if in UTF-8 mode
1512
1513 Returns: TRUE if what is matched could be empty
1514 */
1515
1516 static BOOL
1517 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1518 BOOL utf8)
1519 {
1520 while (bcptr != NULL && bcptr->current >= code)
1521 {
1522 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1523 bcptr = bcptr->outer;
1524 }
1525 return TRUE;
1526 }
1527
1528
1529
1530 /*************************************************
1531 * Check for POSIX class syntax *
1532 *************************************************/
1533
1534 /* This function is called when the sequence "[:" or "[." or "[=" is
1535 encountered in a character class. It checks whether this is followed by an
1536 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1537 ".]" or "=]".
1538
1539 Argument:
1540 ptr pointer to the initial [
1541 endptr where to return the end pointer
1542 cd pointer to compile data
1543
1544 Returns: TRUE or FALSE
1545 */
1546
1547 static BOOL
1548 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1549 {
1550 int terminator; /* Don't combine these lines; the Solaris cc */
1551 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1552 if (*(++ptr) == '^') ptr++;
1553 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1554 if (*ptr == terminator && ptr[1] == ']')
1555 {
1556 *endptr = ptr;
1557 return TRUE;
1558 }
1559 return FALSE;
1560 }
1561
1562
1563
1564
1565 /*************************************************
1566 * Check POSIX class name *
1567 *************************************************/
1568
1569 /* This function is called to check the name given in a POSIX-style class entry
1570 such as [:alnum:].
1571
1572 Arguments:
1573 ptr points to the first letter
1574 len the length of the name
1575
1576 Returns: a value representing the name, or -1 if unknown
1577 */
1578
1579 static int
1580 check_posix_name(const uschar *ptr, int len)
1581 {
1582 register int yield = 0;
1583 while (posix_name_lengths[yield] != 0)
1584 {
1585 if (len == posix_name_lengths[yield] &&
1586 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1587 yield++;
1588 }
1589 return -1;
1590 }
1591
1592
1593 /*************************************************
1594 * Adjust OP_RECURSE items in repeated group *
1595 *************************************************/
1596
1597 /* OP_RECURSE items contain an offset from the start of the regex to the group
1598 that is referenced. This means that groups can be replicated for fixed
1599 repetition simply by copying (because the recursion is allowed to refer to
1600 earlier groups that are outside the current group). However, when a group is
1601 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1602 it, after it has been compiled. This means that any OP_RECURSE items within it
1603 that refer to the group itself or any contained groups have to have their
1604 offsets adjusted. That one of the jobs of this function. Before it is called,
1605 the partially compiled regex must be temporarily terminated with OP_END.
1606
1607 This function has been extended with the possibility of forward references for
1608 recursions and subroutine calls. It must also check the list of such references
1609 for the group we are dealing with. If it finds that one of the recursions in
1610 the current group is on this list, it adjusts the offset in the list, not the
1611 value in the reference (which is a group number).
1612
1613 Arguments:
1614 group points to the start of the group
1615 adjust the amount by which the group is to be moved
1616 utf8 TRUE in UTF-8 mode
1617 cd contains pointers to tables etc.
1618 save_hwm the hwm forward reference pointer at the start of the group
1619
1620 Returns: nothing
1621 */
1622
1623 static void
1624 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1625 uschar *save_hwm)
1626 {
1627 uschar *ptr = group;
1628 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1629 {
1630 int offset;
1631 uschar *hc;
1632
1633 /* See if this recursion is on the forward reference list. If so, adjust the
1634 reference. */
1635
1636 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1637 {
1638 offset = GET(hc, 0);
1639 if (cd->start_code + offset == ptr + 1)
1640 {
1641 PUT(hc, 0, offset + adjust);
1642 break;
1643 }
1644 }
1645
1646 /* Otherwise, adjust the recursion offset if it's after the start of this
1647 group. */
1648
1649 if (hc >= cd->hwm)
1650 {
1651 offset = GET(ptr, 1);
1652 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1653 }
1654
1655 ptr += 1 + LINK_SIZE;
1656 }
1657 }
1658
1659
1660
1661 /*************************************************
1662 * Insert an automatic callout point *
1663 *************************************************/
1664
1665 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1666 callout points before each pattern item.
1667
1668 Arguments:
1669 code current code pointer
1670 ptr current pattern pointer
1671 cd pointers to tables etc
1672
1673 Returns: new code pointer
1674 */
1675
1676 static uschar *
1677 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1678 {
1679 *code++ = OP_CALLOUT;
1680 *code++ = 255;
1681 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1682 PUT(code, LINK_SIZE, 0); /* Default length */
1683 return code + 2*LINK_SIZE;
1684 }
1685
1686
1687
1688 /*************************************************
1689 * Complete a callout item *
1690 *************************************************/
1691
1692 /* A callout item contains the length of the next item in the pattern, which
1693 we can't fill in till after we have reached the relevant point. This is used
1694 for both automatic and manual callouts.
1695
1696 Arguments:
1697 previous_callout points to previous callout item
1698 ptr current pattern pointer
1699 cd pointers to tables etc
1700
1701 Returns: nothing
1702 */
1703
1704 static void
1705 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1706 {
1707 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1708 PUT(previous_callout, 2 + LINK_SIZE, length);
1709 }
1710
1711
1712
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support),
this function walks up through the characters and picks out the next run
whose other-case values are consecutive. Characters that have no other case
are passed over first. The start value is updated so that a further call
carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has an other case */

while (c <= d && (first_other = _pcre_ucp_othercase(c)) == NOTACHAR) c++;
if (c > d) return FALSE;

*ocptr = first_other;
expected = first_other + 1;

/* Extend the run for as long as the other-case values stay consecutive */

c++;
while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1758
1759
1760
1761 /*************************************************
1762 * Check if auto-possessifying is possible *
1763 *************************************************/
1764
1765 /* This function is called for unlimited repeats of certain items, to see
1766 whether the next thing could possibly match the repeated item. If not, it makes
1767 sense to automatically possessify the repeated item.
1768
1769 Arguments:
1770 op_code the repeated op code
1771 this data for this item, depends on the opcode
1772 utf8 TRUE in UTF-8 mode
1773 utf8_char used for utf8 character bytes, NULL if not relevant
1774 ptr next character in pattern
1775 options options bits
1776 cd contains pointers to tables etc.
1777
1778 Returns: TRUE if possessifying is wanted
1779 */
1780
1781 static BOOL
1782 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1783 const uschar *ptr, int options, compile_data *cd)
1784 {
1785 int next;
1786
1787 /* Skip whitespace and comments in extended mode */
1788
1789 if ((options & PCRE_EXTENDED) != 0)
1790 {
1791 for (;;)
1792 {
1793 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1794 if (*ptr == '#')
1795 {
1796 while (*(++ptr) != 0)
1797 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1798 }
1799 else break;
1800 }
1801 }
1802
1803 /* If the next item is one that we can handle, get its value. A non-negative
1804 value is a character, a negative value is an escape value. */
1805
1806 if (*ptr == '\\')
1807 {
1808 int temperrorcode = 0;
1809 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1810 if (temperrorcode != 0) return FALSE;
1811 ptr++; /* Point after the escape sequence */
1812 }
1813
1814 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1815 {
1816 #ifdef SUPPORT_UTF8
1817 if (utf8) { GETCHARINC(next, ptr); } else
1818 #endif
1819 next = *ptr++;
1820 }
1821
1822 else return FALSE;
1823
1824 /* Skip whitespace and comments in extended mode */
1825
1826 if ((options & PCRE_EXTENDED) != 0)
1827 {
1828 for (;;)
1829 {
1830 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1831 if (*ptr == '#')
1832 {
1833 while (*(++ptr) != 0)
1834 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1835 }
1836 else break;
1837 }
1838 }
1839
1840 /* If the next thing is itself optional, we have to give up. */
1841
1842 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1843 return FALSE;
1844
1845 /* Now compare the next item with the previous opcode. If the previous is a
1846 positive single character match, "item" either contains the character or, if
1847 "item" is greater than 127 in utf8 mode, the character's bytes are in
1848 utf8_char. */
1849
1850
1851 /* Handle cases when the next item is a character. */
1852
1853 if (next >= 0) switch(op_code)
1854 {
1855 case OP_CHAR:
1856 #ifdef SUPPORT_UTF8
1857 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1858 #endif
1859 return item != next;
1860
1861 /* For CHARNC (caseless character) we must check the other case. If we have
1862 Unicode property support, we can use it to test the other case of
1863 high-valued characters. */
1864
1865 case OP_CHARNC:
1866 #ifdef SUPPORT_UTF8
1867 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1868 #endif
1869 if (item == next) return FALSE;
1870 #ifdef SUPPORT_UTF8
1871 if (utf8)
1872 {
1873 unsigned int othercase;
1874 if (next < 128) othercase = cd->fcc[next]; else
1875 #ifdef SUPPORT_UCP
1876 othercase = _pcre_ucp_othercase((unsigned int)next);
1877 #else
1878 othercase = NOTACHAR;
1879 #endif
1880 return (unsigned int)item != othercase;
1881 }
1882 else
1883 #endif /* SUPPORT_UTF8 */
1884 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1885
1886 /* For OP_NOT, "item" must be a single-byte character. */
1887
1888 case OP_NOT:
1889 if (next < 0) return FALSE; /* Not a character */
1890 if (item == next) return TRUE;
1891 if ((options & PCRE_CASELESS) == 0) return FALSE;
1892 #ifdef SUPPORT_UTF8
1893 if (utf8)
1894 {
1895 unsigned int othercase;
1896 if (next < 128) othercase = cd->fcc[next]; else
1897 #ifdef SUPPORT_UCP
1898 othercase = _pcre_ucp_othercase(next);
1899 #else
1900 othercase = NOTACHAR;
1901 #endif
1902 return (unsigned int)item == othercase;
1903 }
1904 else
1905 #endif /* SUPPORT_UTF8 */
1906 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1907
1908 case OP_DIGIT:
1909 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1910
1911 case OP_NOT_DIGIT:
1912 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1913
1914 case OP_WHITESPACE:
1915 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1916
1917 case OP_NOT_WHITESPACE:
1918 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1919
1920 case OP_WORDCHAR:
1921 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1922
1923 case OP_NOT_WORDCHAR:
1924 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1925
1926 default:
1927 return FALSE;
1928 }
1929
1930
1931 /* Handle the case when the next item is \d, \s, etc. */
1932
1933 switch(op_code)
1934 {
1935 case OP_CHAR:
1936 case OP_CHARNC:
1937 #ifdef SUPPORT_UTF8
1938 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1939 #endif
1940 switch(-next)
1941 {
1942 case ESC_d:
1943 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1944
1945 case ESC_D:
1946 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1947
1948 case ESC_s:
1949 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1950
1951 case ESC_S:
1952 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1953
1954 case ESC_w:
1955 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1956
1957 case ESC_W:
1958 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1959
1960 default:
1961 return FALSE;
1962 }
1963
1964 case OP_DIGIT:
1965 return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1966
1967 case OP_NOT_DIGIT:
1968 return next == -ESC_d;
1969
1970 case OP_WHITESPACE:
1971 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1972
1973 case OP_NOT_WHITESPACE:
1974 return next == -ESC_s;
1975
1976 case OP_WORDCHAR:
1977 return next == -ESC_W || next == -ESC_s;
1978
1979 case OP_NOT_WORDCHAR:
1980 return next == -ESC_w || next == -ESC_d;
1981
1982 default:
1983 return FALSE;
1984 }
1985
1986 /* Control does not reach here */
1987 }
1988
1989
1990
1991 /*************************************************
1992 * Compile one branch *
1993 *************************************************/
1994
1995 /* Scan the pattern, compiling it into a vector. If the options are
1996 changed during the branch, the pointer is used to change the external options
1997 bits. This function is used during the pre-compile phase when we are trying
1998 to find out the amount of memory needed, as well as during the real compile
1999 phase. The value of lengthptr distinguishes the two phases.
2000
2001 Arguments:
2002 optionsptr pointer to the option bits
2003 codeptr points to the pointer to the current code point
2004 ptrptr points to the current pattern pointer
2005 errorcodeptr points to error code variable
2006 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2007 reqbyteptr set to the last literal character required, else < 0
2008 bcptr points to current branch chain
2009 cd contains pointers to tables etc.
2010 lengthptr NULL during the real compile phase
2011 points to length accumulator during pre-compile phase
2012
2013 Returns: TRUE on success
2014 FALSE, with *errorcodeptr set non-zero on error
2015 */
2016
2017 static BOOL
2018 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2019 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2020 compile_data *cd, int *lengthptr)
2021 {
2022 int repeat_type, op_type;
2023 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2024 int bravalue = 0;
2025 int greedy_default, greedy_non_default;
2026 int firstbyte, reqbyte;
2027 int zeroreqbyte, zerofirstbyte;
2028 int req_caseopt, reqvary, tempreqvary;
2029 int options = *optionsptr;
2030 int after_manual_callout = 0;
2031 int length_prevgroup = 0;
2032 register int c;
2033 register uschar *code = *codeptr;
2034 uschar *last_code = code;
2035 uschar *orig_code = code;
2036 uschar *tempcode;
2037 BOOL inescq = FALSE;
2038 BOOL groupsetfirstbyte = FALSE;
2039 const uschar *ptr = *ptrptr;
2040 const uschar *tempptr;
2041 uschar *previous = NULL;
2042 uschar *previous_callout = NULL;
2043 uschar *save_hwm = NULL;
2044 uschar classbits[32];
2045
2046 #ifdef SUPPORT_UTF8
2047 BOOL class_utf8;
2048 BOOL utf8 = (options & PCRE_UTF8) != 0;
2049 uschar *class_utf8data;
2050 uschar utf8_char[6];
2051 #else
2052 BOOL utf8 = FALSE;
2053 uschar *utf8_char = NULL;
2054 #endif
2055
2056 #ifdef DEBUG
2057 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2058 #endif
2059
2060 /* Set up the default and non-default settings for greediness */
2061
2062 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2063 greedy_non_default = greedy_default ^ 1;
2064
2065 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2066 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2067 matches a non-fixed char as the first char; reqbyte just remains unset if we never
2068 find one.
2069
2070 When we hit a repeat whose minimum is zero, we may have to adjust these values
2071 to take the zero repeat into account. This is implemented by setting them to
2072 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2073 item types that can be repeated set these backoff variables appropriately. */
2074
2075 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2076
2077 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2078 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2079 value > 255. It is added into the firstbyte or reqbyte variables to record the
2080 case status of the value. This is used only for ASCII characters. */
2081
2082 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2083
2084 /* Switch on next character until the end of the branch */
2085
2086 for (;; ptr++)
2087 {
2088 BOOL negate_class;
2089 BOOL possessive_quantifier;
2090 BOOL is_quantifier;
2091 BOOL is_recurse;
2092 int class_charcount;
2093 int class_lastchar;
2094 int newoptions;
2095 int recno;
2096 int skipbytes;
2097 int subreqbyte;
2098 int subfirstbyte;
2099 int terminator;
2100 int mclength;
2101 uschar mcbuffer[8];
2102
2103 /* Get next byte in the pattern */
2104
2105 c = *ptr;
2106
2107 /* If we are in the pre-compile phase, accumulate the length used for the
2108 previous cycle of this loop. */
2109
2110 if (lengthptr != NULL)
2111 {
2112 #ifdef DEBUG
2113 if (code > cd->hwm) cd->hwm = code; /* High water info */
2114 #endif
2115 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2116 {
2117 *errorcodeptr = ERR52;
2118 goto FAILED;
2119 }
2120
2121 /* There is at least one situation where code goes backwards: this is the
2122 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2123 the class is simply eliminated. However, it is created first, so we have to
2124 allow memory for it. Therefore, don't ever reduce the length at this point.
2125 */
2126
2127 if (code < last_code) code = last_code;
2128 *lengthptr += code - last_code;
2129 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2130
2131 /* If "previous" is set and it is not at the start of the work space, move
2132 it back to there, in order to avoid filling up the work space. Otherwise,
2133 if "previous" is NULL, reset the current code pointer to the start. */
2134
2135 if (previous != NULL)
2136 {
2137 if (previous > orig_code)
2138 {
2139 memmove(orig_code, previous, code - previous);
2140 code -= previous - orig_code;
2141 previous = orig_code;
2142 }
2143 }
2144 else code = orig_code;
2145
2146 /* Remember where this code item starts so we can pick up the length
2147 next time round. */
2148
2149 last_code = code;
2150 }
2151
2152 /* In the real compile phase, just check the workspace used by the forward
2153 reference list. */
2154
2155 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2156 {
2157 *errorcodeptr = ERR52;
2158 goto FAILED;
2159 }
2160
2161 /* If in \Q...\E, check for the end; if not, we have a literal */
2162
2163 if (inescq && c != 0)
2164 {
2165 if (c == '\\' && ptr[1] == 'E')
2166 {
2167 inescq = FALSE;
2168 ptr++;
2169 continue;
2170 }
2171 else
2172 {
2173 if (previous_callout != NULL)
2174 {
2175 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2176 complete_callout(previous_callout, ptr, cd);
2177 previous_callout = NULL;
2178 }
2179 if ((options & PCRE_AUTO_CALLOUT) != 0)
2180 {
2181 previous_callout = code;
2182 code = auto_callout(code, ptr, cd);
2183 }
2184 goto NORMAL_CHAR;
2185 }
2186 }
2187
2188 /* Fill in length of a previous callout, except when the next thing is
2189 a quantifier. */
2190
2191 is_quantifier = c == '*' || c == '+' || c == '?' ||
2192 (c == '{' && is_counted_repeat(ptr+1));
2193
2194 if (!is_quantifier && previous_callout != NULL &&
2195 after_manual_callout-- <= 0)
2196 {
2197 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2198 complete_callout(previous_callout, ptr, cd);
2199 previous_callout = NULL;
2200 }
2201
2202 /* In extended mode, skip white space and comments */
2203
2204 if ((options & PCRE_EXTENDED) != 0)
2205 {
2206 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2207 if (c == '#')
2208 {
2209 while (*(++ptr) != 0)
2210 {
2211 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2212 }
2213 if (*ptr != 0) continue;
2214
2215 /* Else fall through to handle end of string */
2216 c = 0;
2217 }
2218 }
2219
2220 /* No auto callout for quantifiers. */
2221
2222 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2223 {
2224 previous_callout = code;
2225 code = auto_callout(code, ptr, cd);
2226 }
2227
2228 switch(c)
2229 {
2230 /* ===================================================================*/
2231 case 0: /* The branch terminates at string end */
2232 case '|': /* or | or ) */
2233 case ')':
2234 *firstbyteptr = firstbyte;
2235 *reqbyteptr = reqbyte;
2236 *codeptr = code;
2237 *ptrptr = ptr;
2238 if (lengthptr != NULL)
2239 {
2240 *lengthptr += code - last_code; /* To include callout length */
2241 DPRINTF((">> end branch\n"));
2242 }
2243 return TRUE;
2244
2245
2246 /* ===================================================================*/
2247 /* Handle single-character metacharacters. In multiline mode, ^ disables
2248 the setting of any following char as a first character. */
2249
2250 case '^':
2251 if ((options & PCRE_MULTILINE) != 0)
2252 {
2253 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2254 }
2255 previous = NULL;
2256 *code++ = OP_CIRC;
2257 break;
2258
2259 case '$':
2260 previous = NULL;
2261 *code++ = OP_DOLL;
2262 break;
2263
2264 /* There can never be a first char if '.' is first, whatever happens about
2265 repeats. The value of reqbyte doesn't change either. */
2266
2267 case '.':
2268 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2269 zerofirstbyte = firstbyte;
2270 zeroreqbyte = reqbyte;
2271 previous = code;
2272 *code++ = OP_ANY;
2273 break;
2274
2275
2276 /* ===================================================================*/
2277 /* Character classes. If the included characters are all < 256, we build a
2278 32-byte bitmap of the permitted characters, except in the special case
2279 where there is only one such character. For negated classes, we build the
2280 map as usual, then invert it at the end. However, we use a different opcode
2281 so that data characters > 255 can be handled correctly.
2282
2283 If the class contains characters outside the 0-255 range, a different
2284 opcode is compiled. It may optionally have a bit map for characters < 256,
2285 but those above are explicitly listed afterwards. A flag byte tells
2286 whether the bitmap is present, and whether this is a negated class or not.
2287 */
2288
2289 case '[':
2290 previous = code;
2291
2292 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2293 they are encountered at the top level, so we'll do that too. */
2294
2295 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2296 check_posix_syntax(ptr, &tempptr, cd))
2297 {
2298 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2299 goto FAILED;
2300 }
2301
2302 /* If the first character is '^', set the negation flag and skip it. */
2303
2304 if ((c = *(++ptr)) == '^')
2305 {
2306 negate_class = TRUE;
2307 c = *(++ptr);
2308 }
2309 else
2310 {
2311 negate_class = FALSE;
2312 }
2313
2314 /* Keep a count of chars with values < 256 so that we can optimize the case
2315 of just a single character (as long as it's < 256). However, for higher
2316 valued UTF-8 characters, we don't yet do any optimization. */
2317
2318 class_charcount = 0;
2319 class_lastchar = -1;
2320
2321 /* Initialize the 32-char bit map to all zeros. We build the map in a
2322 temporary bit of memory, in case the class contains only 1 character (less
2323 than 256), because in that case the compiled code doesn't use the bit map.
2324 */
2325
2326 memset(classbits, 0, 32 * sizeof(uschar));
2327
2328 #ifdef SUPPORT_UTF8
2329 class_utf8 = FALSE; /* No chars >= 256 */
2330 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2331 #endif
2332
2333 /* Process characters until ] is reached. By writing this as a "do" it
2334 means that an initial ] is taken as a data character. At the start of the
2335 loop, c contains the first byte of the character. */
2336
2337 if (c != 0) do
2338 {
2339 const uschar *oldptr;
2340
2341 #ifdef SUPPORT_UTF8
2342 if (utf8 && c > 127)
2343 { /* Braces are required because the */
2344 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2345 }
2346 #endif
2347
2348 /* Inside \Q...\E everything is literal except \E */
2349
2350 if (inescq)
2351 {
2352 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2353 {
2354 inescq = FALSE; /* Reset literal state */
2355 ptr++; /* Skip the 'E' */
2356 continue; /* Carry on with next */
2357 }
2358 goto CHECK_RANGE; /* Could be range if \E follows */
2359 }
2360
2361 /* Handle POSIX class names. Perl allows a negation extension of the
2362 form [:^name:]. A square bracket that doesn't match the syntax is
2363 treated as a literal. We also recognize the POSIX constructions
2364 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2365 5.6 and 5.8 do. */
2366
2367 if (c == '[' &&
2368 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2369 check_posix_syntax(ptr, &tempptr, cd))
2370 {
2371 BOOL local_negate = FALSE;
2372 int posix_class, taboffset, tabopt;
2373 register const uschar *cbits = cd->cbits;
2374 uschar pbits[32];
2375
2376 if (ptr[1] != ':')
2377 {
2378 *errorcodeptr = ERR31;
2379 goto FAILED;
2380 }
2381
2382 ptr += 2;
2383 if (*ptr == '^')
2384 {
2385 local_negate = TRUE;
2386 ptr++;
2387 }
2388
2389 posix_class = check_posix_name(ptr, tempptr - ptr);
2390 if (posix_class < 0)
2391 {
2392 *errorcodeptr = ERR30;
2393 goto FAILED;
2394 }
2395
2396 /* If matching is caseless, upper and lower are converted to
2397 alpha. This relies on the fact that the class table starts with
2398 alpha, lower, upper as the first 3 entries. */
2399
2400 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2401 posix_class = 0;
2402
2403 /* We build the bit map for the POSIX class in a chunk of local store
2404 because we may be adding and subtracting from it, and we don't want to
2405 subtract bits that may be in the main map already. At the end we or the
2406 result into the bit map that is being built. */
2407
2408 posix_class *= 3;
2409
2410 /* Copy in the first table (always present) */
2411
2412 memcpy(pbits, cbits + posix_class_maps[posix_class],
2413 32 * sizeof(uschar));
2414
2415 /* If there is a second table, add or remove it as required. */
2416
2417 taboffset = posix_class_maps[posix_class + 1];
2418 tabopt = posix_class_maps[posix_class + 2];
2419
2420 if (taboffset >= 0)
2421 {
2422 if (tabopt >= 0)
2423 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2424 else
2425 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2426 }
2427
2428 /* Now see if we need to remove any special characters. An option
2429 value of 1 removes vertical space and 2 removes underscore. */
2430
2431 if (tabopt < 0) tabopt = -tabopt;
2432 if (tabopt == 1) pbits[1] &= ~0x3c;
2433 else if (tabopt == 2) pbits[11] &= 0x7f;
2434
2435 /* Add the POSIX table or its complement into the main table that is
2436 being built and we are done. */
2437
2438 if (local_negate)
2439 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2440 else
2441 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2442
2443 ptr = tempptr + 1;
2444 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2445 continue; /* End of POSIX syntax handling */
2446 }
2447
2448 /* Backslash may introduce a single character, or it may introduce one
2449 of the specials, which just set a flag. The sequence \b is a special
2450 case. Inside a class (and only there) it is treated as backspace.
2451 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2452 to or into the one we are building. We assume they have more than one
2453 character in them, so set class_charcount bigger than one. */
2454
2455 if (c == '\\')
2456 {
2457 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2458 if (*errorcodeptr != 0) goto FAILED;
2459
2460 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2461 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2462 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2463 else if (-c == ESC_Q) /* Handle start of quoted string */
2464 {
2465 if (ptr[1] == '\\' && ptr[2] == 'E')
2466 {
2467 ptr += 2; /* avoid empty string */
2468 }
2469 else inescq = TRUE;
2470 continue;
2471 }
2472
2473 if (c < 0)
2474 {
2475 register const uschar *cbits = cd->cbits;
2476 class_charcount += 2; /* Greater than 1 is what matters */
2477
2478 /* Save time by not doing this in the pre-compile phase. */
2479
2480 if (lengthptr == NULL) switch (-c)
2481 {
2482 case ESC_d:
2483 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2484 continue;
2485
2486 case ESC_D:
2487 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2488 continue;
2489
2490 case ESC_w:
2491 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2492 continue;
2493
2494 case ESC_W:
2495 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2496 continue;
2497
2498 case ESC_s:
2499 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2500 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2501 continue;
2502
2503 case ESC_S:
2504 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2505 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2506 continue;
2507
2508 case ESC_E: /* Perl ignores an orphan \E */
2509 continue;
2510
2511 default: /* Not recognized; fall through */
2512 break; /* Need "default" setting to stop compiler warning. */
2513 }
2514
2515 /* In the pre-compile phase, just do the recognition. */
2516
2517 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2518 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2519
2520 /* We need to deal with \P and \p in both phases. */
2521
2522 #ifdef SUPPORT_UCP
2523 if (-c == ESC_p || -c == ESC_P)
2524 {
2525 BOOL negated;
2526 int pdata;
2527 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2528 if (ptype < 0) goto FAILED;
2529 class_utf8 = TRUE;
2530 *class_utf8data++ = ((-c == ESC_p) != negated)?
2531 XCL_PROP : XCL_NOTPROP;
2532 *class_utf8data++ = ptype;
2533 *class_utf8data++ = pdata;
2534 class_charcount -= 2; /* Not a < 256 character */
2535 continue;
2536 }
2537 #endif
2538 /* Unrecognized escapes are faulted if PCRE is running in its
2539 strict mode. By default, for compatibility with Perl, they are
2540 treated as literals. */
2541
2542 if ((options & PCRE_EXTRA) != 0)
2543 {
2544 *errorcodeptr = ERR7;
2545 goto FAILED;
2546 }
2547
2548 class_charcount -= 2; /* Undo the default count from above */
2549 c = *ptr; /* Get the final character and fall through */
2550 }
2551
2552 /* Fall through if we have a single character (c >= 0). This may be
2553 greater than 256 in UTF-8 mode. */
2554
2555 } /* End of backslash handling */
2556
2557 /* A single character may be followed by '-' to form a range. However,
2558 Perl does not permit ']' to be the end of the range. A '-' character
2559 at the end is treated as a literal. Perl ignores orphaned \E sequences
2560 entirely. The code for handling \Q and \E is messy. */
2561
2562 CHECK_RANGE:
2563 while (ptr[1] == '\\' && ptr[2] == 'E')
2564 {
2565 inescq = FALSE;
2566 ptr += 2;
2567 }
2568
2569 oldptr = ptr;
2570
2571 if (!inescq && ptr[1] == '-')
2572 {
2573 int d;
2574 ptr += 2;
2575 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2576
2577 /* If we hit \Q (not followed by \E) at this point, go into escaped
2578 mode. */
2579
2580 while (*ptr == '\\' && ptr[1] == 'Q')
2581 {
2582 ptr += 2;
2583 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2584 inescq = TRUE;
2585 break;
2586 }
2587
2588 if (*ptr == 0 || (!inescq && *ptr == ']'))
2589 {
2590 ptr = oldptr;
2591 goto LONE_SINGLE_CHARACTER;
2592 }
2593
2594 #ifdef SUPPORT_UTF8
2595 if (utf8)
2596 { /* Braces are required because the */
2597 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2598 }
2599 else
2600 #endif
2601 d = *ptr; /* Not UTF-8 mode */
2602
2603 /* The second part of a range can be a single-character escape, but
2604 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2605 in such circumstances. */
2606
2607 if (!inescq && d == '\\')
2608 {
2609 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2610 if (*errorcodeptr != 0) goto FAILED;
2611
2612 /* \b is backspace; \X is literal X; \R is literal R; any other
2613 special means the '-' was literal */
2614
2615 if (d < 0)
2616 {
2617 if (d == -ESC_b) d = '\b';
2618 else if (d == -ESC_X) d = 'X';
2619 else if (d == -ESC_R) d = 'R'; else
2620 {
2621 ptr = oldptr;
2622 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2623 }
2624 }
2625 }
2626
2627 /* Check that the two values are in the correct order. Optimize
2628 one-character ranges */
2629
2630 if (d < c)
2631 {
2632 *errorcodeptr = ERR8;
2633 goto FAILED;
2634 }
2635
2636 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2637
2638 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2639 matching, we have to use an XCLASS with extra data items. Caseless
2640 matching for characters > 127 is available only if UCP support is
2641 available. */
2642
2643 #ifdef SUPPORT_UTF8
2644 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2645 {
2646 class_utf8 = TRUE;
2647
2648 /* With UCP support, we can find the other case equivalents of
2649 the relevant characters. There may be several ranges. Optimize how
2650 they fit with the basic range. */
2651
2652 #ifdef SUPPORT_UCP
2653 if ((options & PCRE_CASELESS) != 0)
2654 {
2655 unsigned int occ, ocd;
2656 unsigned int cc = c;
2657 unsigned int origd = d;
2658 while (get_othercase_range(&cc, origd, &occ, &ocd))
2659 {
2660 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2661
2662 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2663 { /* if there is overlap, */
2664 c = occ; /* noting that if occ < c */
2665 continue; /* we can't have ocd > d */
2666 } /* because a subrange is */
2667 if (ocd > d && occ <= d + 1) /* always shorter than */
2668 { /* the basic range. */
2669 d = ocd;
2670 continue;
2671 }
2672
2673 if (occ == ocd)
2674 {
2675 *class_utf8data++ = XCL_SINGLE;
2676 }
2677 else
2678 {
2679 *class_utf8data++ = XCL_RANGE;
2680 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2681 }
2682 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2683 }
2684 }
2685 #endif /* SUPPORT_UCP */
2686
2687 /* Now record the original range, possibly modified for UCP caseless
2688 overlapping ranges. */
2689
2690 *class_utf8data++ = XCL_RANGE;
2691 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2692 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2693
2694 /* With UCP support, we are done. Without UCP support, there is no
2695 caseless matching for UTF-8 characters > 127; we can use the bit map
2696 for the smaller ones. */
2697
2698 #ifdef SUPPORT_UCP
2699 continue; /* With next character in the class */
2700 #else
2701 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2702
2703 /* Adjust upper limit and fall through to set up the map */
2704
2705 d = 127;
2706
2707 #endif /* SUPPORT_UCP */
2708 }
2709 #endif /* SUPPORT_UTF8 */
2710
2711 /* We use the bit map for all cases when not in UTF-8 mode; else
2712 ranges that lie entirely within 0-127 when there is UCP support; else
2713 for partial ranges without UCP support. */
2714
2715 class_charcount += d - c + 1;
2716 class_lastchar = d;
2717
2718 /* We can save a bit of time by skipping this in the pre-compile. */
2719
2720 if (lengthptr == NULL) for (; c <= d; c++)
2721 {
2722 classbits[c/8] |= (1 << (c&7));
2723 if ((options & PCRE_CASELESS) != 0)
2724 {
2725 int uc = cd->fcc[c]; /* flip case */
2726 classbits[uc/8] |= (1 << (uc&7));
2727 }
2728 }
2729
2730 continue; /* Go get the next char in the class */
2731 }
2732
2733 /* Handle a lone single character - we can get here for a normal
2734 non-escape char, or after \ that introduces a single character or for an
2735 apparent range that isn't. */
2736
2737 LONE_SINGLE_CHARACTER:
2738
2739 /* Handle a character that cannot go in the bit map */
2740
2741 #ifdef SUPPORT_UTF8
2742 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2743 {
2744 class_utf8 = TRUE;
2745 *class_utf8data++ = XCL_SINGLE;
2746 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2747
2748 #ifdef SUPPORT_UCP
2749 if ((options & PCRE_CASELESS) != 0)
2750 {
2751 unsigned int othercase;
2752 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2753 {
2754 *class_utf8data++ = XCL_SINGLE;
2755 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2756 }
2757 }
2758 #endif /* SUPPORT_UCP */
2759
2760 }
2761 else
2762 #endif /* SUPPORT_UTF8 */
2763
2764 /* Handle a single-byte character */
2765 {
2766 classbits[c/8] |= (1 << (c&7));
2767 if ((options & PCRE_CASELESS) != 0)
2768 {
2769 c = cd->fcc[c]; /* flip case */
2770 classbits[c/8] |= (1 << (c&7));
2771 }
2772 class_charcount++;
2773 class_lastchar = c;
2774 }
2775 }
2776
2777 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2778
2779 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2780
2781 if (c == 0) /* Missing terminating ']' */
2782 {
2783 *errorcodeptr = ERR6;
2784 goto FAILED;
2785 }
2786
2787 /* If class_charcount is 1, we saw precisely one character whose value is
2788 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2789 can optimize the negative case only if there were no characters >= 128
2790 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2791 single-bytes only. This is an historical hangover. Maybe one day we can
2792 tidy these opcodes to handle multi-byte characters.
2793
2794 The optimization throws away the bit map. We turn the item into a
2795 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2796 that OP_NOT does not support multibyte characters. In the positive case, it
2797 can cause firstbyte to be set. Otherwise, there can be no first char if
2798 this item is first, whatever repeat count may follow. In the case of
2799 reqbyte, save the previous value for reinstating. */
2800
2801 #ifdef SUPPORT_UTF8
2802 if (class_charcount == 1 &&
2803 (!utf8 ||
2804 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2805
2806 #else
2807 if (class_charcount == 1)
2808 #endif
2809 {
2810 zeroreqbyte = reqbyte;
2811
2812 /* The OP_NOT opcode works on one-byte characters only. */
2813
2814 if (negate_class)
2815 {
2816 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2817 zerofirstbyte = firstbyte;
2818 *code++ = OP_NOT;
2819 *code++ = class_lastchar;
2820 break;
2821 }
2822
2823 /* For a single, positive character, get the value into mcbuffer, and
2824 then we can handle this with the normal one-character code. */
2825
2826 #ifdef SUPPORT_UTF8
2827 if (utf8 && class_lastchar > 127)
2828 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2829 else
2830 #endif
2831 {
2832 mcbuffer[0] = class_lastchar;
2833 mclength = 1;
2834 }
2835 goto ONE_CHAR;
2836 } /* End of 1-char optimization */
2837
2838 /* The general case - not the one-char optimization. If this is the first
2839 thing in the branch, there can be no first char setting, whatever the
2840 repeat count. Any reqbyte setting must remain unchanged after any kind of
2841 repeat. */
2842
2843 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2844 zerofirstbyte = firstbyte;
2845 zeroreqbyte = reqbyte;
2846
2847 /* If there are characters with values > 255, we have to compile an
2848 extended class, with its own opcode. If there are no characters < 256,
2849 we can omit the bitmap in the actual compiled code. */
2850
2851 #ifdef SUPPORT_UTF8
2852 if (class_utf8)
2853 {
2854 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2855 *code++ = OP_XCLASS;
2856 code += LINK_SIZE;
2857 *code = negate_class? XCL_NOT : 0;
2858
2859 /* If the map is required, move up the extra data to make room for it;
2860 otherwise just move the code pointer to the end of the extra data. */
2861
2862 if (class_charcount > 0)
2863 {
2864 *code++ |= XCL_MAP;
2865 memmove(code + 32, code, class_utf8data - code);
2866 memcpy(code, classbits, 32);
2867 code = class_utf8data + 32;
2868 }
2869 else code = class_utf8data;
2870
2871 /* Now fill in the complete length of the item */
2872
2873 PUT(previous, 1, code - previous);
2874 break; /* End of class handling */
2875 }
2876 #endif
2877
2878 /* If there are no characters > 255, negate the 32-byte map if necessary,
2879 and copy it into the code vector. If this is the first thing in the branch,
2880 there can be no first char setting, whatever the repeat count. Any reqbyte
2881 setting must remain unchanged after any kind of repeat. */
2882
2883 if (negate_class)
2884 {
2885 *code++ = OP_NCLASS;
2886 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2887 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2888 }
2889 else
2890 {
2891 *code++ = OP_CLASS;
2892 memcpy(code, classbits, 32);
2893 }
2894 code += 32;
2895 break;
2896
2897
2898 /* ===================================================================*/
2899 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2900 has been tested above. */
2901
2902 case '{':
2903 if (!is_quantifier) goto NORMAL_CHAR;
2904 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2905 if (*errorcodeptr != 0) goto FAILED;
2906 goto REPEAT;
2907
2908 case '*':
2909 repeat_min = 0;
2910 repeat_max = -1;
2911 goto REPEAT;
2912
2913 case '+':
2914 repeat_min = 1;
2915 repeat_max = -1;
2916 goto REPEAT;
2917
2918 case '?':
2919 repeat_min = 0;
2920 repeat_max = 1;
2921
2922 REPEAT:
2923 if (previous == NULL)
2924 {
2925 *errorcodeptr = ERR9;
2926 goto FAILED;
2927 }
2928
2929 if (repeat_min == 0)
2930 {
2931 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2932 reqbyte = zeroreqbyte; /* Ditto */
2933 }
2934
2935 /* Remember whether this is a variable length repeat */
2936
2937 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2938
2939 op_type = 0; /* Default single-char op codes */
2940 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2941
2942 /* Save start of previous item, in case we have to move it up to make space
2943 for an inserted OP_ONCE for the additional '+' extension. */
2944
2945 tempcode = previous;
2946
2947 /* If the next character is '+', we have a possessive quantifier. This
2948 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2949 If the next character is '?' this is a minimizing repeat, by default,
2950 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2951 repeat type to the non-default. */
2952
2953 if (ptr[1] == '+')
2954 {
2955 repeat_type = 0; /* Force greedy */
2956 possessive_quantifier = TRUE;
2957 ptr++;
2958 }
2959 else if (ptr[1] == '?')
2960 {
2961 repeat_type = greedy_non_default;
2962 ptr++;
2963 }
2964 else repeat_type = greedy_default;
2965
2966 /* If previous was a character match, abolish the item and generate a
2967 repeat item instead. If a char item has a minimum of more than one, ensure
2968 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2969 the first thing in a branch because the x will have gone into firstbyte
2970 instead. */
2971
2972 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2973 {
2974 /* Deal with UTF-8 characters that take up more than one byte. It's
2975 easier to write this out separately than try to macrify it. Use c to
2976 hold the length of the character in bytes, plus 0x80 to flag that it's a
2977 length rather than a small character. */
2978
2979 #ifdef SUPPORT_UTF8
2980 if (utf8 && (code[-1] & 0x80) != 0)
2981 {
2982 uschar *lastchar = code - 1;
2983 while((*lastchar & 0xc0) == 0x80) lastchar--;
2984 c = code - lastchar; /* Length of UTF-8 character */
2985 memcpy(utf8_char, lastchar, c); /* Save the char */
2986 c |= 0x80; /* Flag c as a length */
2987 }
2988 else
2989 #endif
2990
2991 /* Handle the case of a single byte - either with no UTF8 support, or
2992 with UTF-8 disabled, or for a UTF-8 character < 128. */
2993
2994 {
2995 c = code[-1];
2996 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2997 }
2998
2999 /* If the repetition is unlimited, it pays to see if the next thing on
3000 the line is something that cannot possibly match this character. If so,
3001 automatically possessifying this item gains some performance in the case
3002 where the match fails. */
3003
3004 if (!possessive_quantifier &&
3005 repeat_max < 0 &&
3006 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3007 options, cd))
3008 {
3009 repeat_type = 0; /* Force greedy */
3010 possessive_quantifier = TRUE;
3011 }
3012
3013 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3014 }
3015
3016 /* If previous was a single negated character ([^a] or similar), we use
3017 one of the special opcodes, replacing it. The code is shared with single-
3018 character repeats by setting op_type to add a suitable offset into
3019 repeat_type. We can also test for auto-possessification. OP_NOT is
3020 currently used only for single-byte chars. */
3021
3022 else if (*previous == OP_NOT)
3023 {
3024 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3025 c = previous[1];
3026 if (!possessive_quantifier &&
3027 repeat_max < 0 &&
3028 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3029 {
3030 repeat_type = 0; /* Force greedy */
3031 possessive_quantifier = TRUE;
3032 }
3033 goto OUTPUT_SINGLE_REPEAT;
3034 }
3035
3036 /* If previous was a character type match (\d or similar), abolish it and
3037 create a suitable repeat item. The code is shared with single-character
3038 repeats by setting op_type to add a suitable offset into repeat_type. Note
3039 that the Unicode property types will be present only when SUPPORT_UCP is
3040 defined, but we don't wrap the little bits of code here because it just
3041 makes it horribly messy. */
3042
3043 else if (*previous < OP_EODN)
3044 {
3045 uschar *oldcode;
3046 int prop_type, prop_value;
3047 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3048 c = *previous;
3049
3050 if (!possessive_quantifier &&
3051 repeat_max < 0 &&
3052 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3053 {
3054 repeat_type = 0; /* Force greedy */
3055 possessive_quantifier = TRUE;
3056 }
3057
3058 OUTPUT_SINGLE_REPEAT:
3059 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3060 {
3061 prop_type = previous[1];
3062 prop_value = previous[2];
3063 }
3064 else prop_type = prop_value = -1;
3065
3066 oldcode = code;
3067 code = previous; /* Usually overwrite previous item */
3068
3069 /* If the maximum is zero then the minimum must also be zero; Perl allows
3070 this case, so we do too - by simply omitting the item altogether. */
3071
3072 if (repeat_max == 0) goto END_REPEAT;
3073
3074 /* All real repeats make it impossible to handle partial matching (maybe
3075 one day we will be able to remove this restriction). */
3076
3077 if (repeat_max != 1) cd->nopartial = TRUE;
3078
3079 /* Combine the op_type with the repeat_type */
3080
3081 repeat_type += op_type;
3082
3083 /* A minimum of zero is handled either as the special case * or ?, or as
3084 an UPTO, with the maximum given. */
3085
3086 if (repeat_min == 0)
3087 {
3088 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3089 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3090 else
3091 {
3092 *code++ = OP_UPTO + repeat_type;
3093 PUT2INC(code, 0, repeat_max);
3094 }
3095 }
3096
3097 /* A repeat minimum of 1 is optimized into some special cases. If the
3098 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3099 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3100 one less than the maximum. */
3101
3102 else if (repeat_min == 1)
3103 {
3104 if (repeat_max == -1)
3105 *code++ = OP_PLUS + repeat_type;
3106 else
3107 {
3108 code = oldcode; /* leave previous item in place */
3109 if (repeat_max == 1) goto END_REPEAT;
3110 *code++ = OP_UPTO + repeat_type;
3111 PUT2INC(code, 0, repeat_max - 1);
3112 }
3113 }
3114
3115 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3116 handled as an EXACT followed by an UPTO. */
3117
3118 else
3119 {
3120 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3121 PUT2INC(code, 0, repeat_min);
3122
3123 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3124 we have to insert the character for the previous code. For a repeated
3125 Unicode property match, there are two extra bytes that define the
3126 required property. In UTF-8 mode, long characters have their length in
3127 c, with the 0x80 bit as a flag. */
3128
3129 if (repeat_max < 0)
3130 {
3131 #ifdef SUPPORT_UTF8
3132 if (utf8 && c >= 128)
3133 {
3134 memcpy(code, utf8_char, c & 7);
3135 code += c & 7;
3136 }
3137 else
3138 #endif
3139 {
3140 *code++ = c;
3141 if (prop_type >= 0)
3142 {
3143 *code++ = prop_type;
3144 *code++ = prop_value;
3145 }
3146 }
3147 *code++ = OP_STAR + repeat_type;
3148 }
3149
3150 /* Else insert an UPTO if the max is greater than the min, again
3151 preceded by the character, for the previously inserted code. If the
3152 UPTO is just for 1 instance, we can use QUERY instead. */
3153
3154 else if (repeat_max != repeat_min)
3155 {
3156 #ifdef SUPPORT_UTF8
3157 if (utf8 && c >= 128)
3158 {
3159 memcpy(code, utf8_char, c & 7);
3160 code += c & 7;
3161 }
3162 else
3163 #endif
3164 *code++ = c;
3165 if (prop_type >= 0)
3166 {
3167 *code++ = prop_type;
3168 *code++ = prop_value;
3169 }
3170 repeat_max -= repeat_min;
3171
3172 if (repeat_max == 1)
3173 {
3174 *code++ = OP_QUERY + repeat_type;
3175 }
3176 else
3177 {
3178 *code++ = OP_UPTO + repeat_type;
3179 PUT2INC(code, 0, repeat_max);
3180 }
3181 }
3182 }
3183
3184 /* The character or character type itself comes last in all cases. */
3185
3186 #ifdef SUPPORT_UTF8
3187 if (utf8 && c >= 128)
3188 {
3189 memcpy(code, utf8_char, c & 7);
3190 code += c & 7;
3191 }
3192 else
3193 #endif
3194 *code++ = c;
3195
3196 /* For a repeated Unicode property match, there are two extra bytes that
3197 define the required property. */
3198
3199 #ifdef SUPPORT_UCP
3200 if (prop_type >= 0)
3201 {
3202 *code++ = prop_type;
3203 *code++ = prop_value;
3204 }
3205 #endif
3206 }
3207
3208 /* If previous was a character class or a back reference, we put the repeat
3209 stuff after it, but just skip the item if the repeat was {0,0}. */
3210
3211 else if (*previous == OP_CLASS ||
3212 *previous == OP_NCLASS ||
3213 #ifdef SUPPORT_UTF8
3214 *previous == OP_XCLASS ||
3215 #endif
3216 *previous == OP_REF)
3217 {
3218 if (repeat_max == 0)
3219 {
3220 code = previous;
3221 goto END_REPEAT;
3222 }
3223
3224 /* All real repeats make it impossible to handle partial matching (maybe
3225 one day we will be able to remove this restriction). */
3226
3227 if (repeat_max != 1) cd->nopartial = TRUE;
3228
3229 if (repeat_min == 0 && repeat_max == -1)
3230 *code++ = OP_CRSTAR + repeat_type;
3231 else if (repeat_min == 1 && repeat_max == -1)
3232 *code++ = OP_CRPLUS + repeat_type;
3233 else if (repeat_min == 0 && repeat_max == 1)
3234 *code++ = OP_CRQUERY + repeat_type;
3235 else
3236 {
3237 *code++ = OP_CRRANGE + repeat_type;
3238 PUT2INC(code, 0, repeat_min);
3239 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3240 PUT2INC(code, 0, repeat_max);
3241 }
3242 }
3243
3244 /* If previous was a bracket group, we may have to replicate it in certain
3245 cases. */
3246
3247 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3248 *previous == OP_ONCE || *previous == OP_COND)
3249 {
3250 register int i;
3251 int ketoffset = 0;
3252 int len = code - previous;
3253 uschar *bralink = NULL;
3254
3255 /* Repeating a DEFINE group is pointless */
3256
3257 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3258 {
3259 *errorcodeptr = ERR55;
3260 goto FAILED;
3261 }
3262
3263 /* This is a paranoid check to stop integer overflow later on */
3264
3265 if (len > MAX_DUPLENGTH)
3266 {
3267 *errorcodeptr = ERR50;
3268 goto FAILED;
3269 }
3270
3271 /* If the maximum repeat count is unlimited, find the end of the bracket
3272 by scanning through from the start, and compute the offset back to it
3273 from the current code pointer. There may be an OP_OPT setting following
3274 the final KET, so we can't find the end just by going back from the code
3275 pointer. */
3276
3277 if (repeat_max == -1)
3278 {
3279 register uschar *ket = previous;
3280 do ket += GET(ket, 1); while (*ket != OP_KET);
3281 ketoffset = code - ket;
3282 }
3283
3284 /* The case of a zero minimum is special because of the need to stick
3285 OP_BRAZERO in front of it, and because the group appears once in the
3286 data, whereas in other cases it appears the minimum number of times. For
3287 this reason, it is simplest to treat this case separately, as otherwise
3288 the code gets far too messy. There are several special subcases when the
3289 minimum is zero. */
3290
3291 if (repeat_min == 0)
3292 {
3293 /* If the maximum is also zero, we just omit the group from the output
3294 altogether. */
3295
3296 if (repeat_max == 0)
3297 {
3298 code = previous;
3299 goto END_REPEAT;
3300 }
3301
3302 /* If the maximum is 1 or unlimited, we just have to stick in the
3303 BRAZERO and do no more at this point. However, we do need to adjust
3304 any OP_RECURSE calls inside the group that refer to the group itself or
3305 any internal or forward referenced group, because the offset is from
3306 the start of the whole regex. Temporarily terminate the pattern while
3307 doing this. */
3308
3309 if (repeat_max <= 1)
3310 {
3311 *code = OP_END;
3312 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3313 memmove(previous+1, previous, len);
3314 code++;
3315 *previous++ = OP_BRAZERO + repeat_type;
3316 }
3317
3318 /* If the maximum is greater than 1 and limited, we have to replicate
3319 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3320 The first one has to be handled carefully because it's the original
3321 copy, which has to be moved up. The remainder can be handled by code
3322 that is common with the non-zero minimum case below. We have to
3323 adjust the value of repeat_max, since one less copy is required. Once
3324 again, we may have to adjust any OP_RECURSE calls inside the group. */
3325
3326 else
3327 {
3328 int offset;
3329 *code = OP_END;
3330 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3331 memmove(previous + 2 + LINK_SIZE, previous, len);
3332 code += 2 + LINK_SIZE;
3333 *previous++ = OP_BRAZERO + repeat_type;
3334 *previous++ = OP_BRA;
3335
3336 /* We chain together the bracket offset fields that have to be
3337 filled in later when the ends of the brackets are reached. */
3338
3339 offset = (bralink == NULL)? 0 : previous - bralink;
3340 bralink = previous;
3341 PUTINC(previous, 0, offset);
3342 }
3343
3344 repeat_max--;
3345 }
3346
3347 /* If the minimum is greater than zero, replicate the group as many
3348 times as necessary, and adjust the maximum to the number of subsequent
3349 copies that we need. If we set a first char from the group, and didn't
3350 set a required char, copy the latter from the former. If there are any
3351 forward reference subroutine calls in the group, there will be entries on
3352 the workspace list; replicate these with an appropriate increment. */
3353
3354 else
3355 {
3356 if (repeat_min > 1)
3357 {
3358 /* In the pre-compile phase, we don't actually do the replication. We
3359 just adjust the length as if we had. */
3360
3361 if (lengthptr != NULL)
3362 *lengthptr += (repeat_min - 1)*length_prevgroup;
3363
3364 /* This is compiling for real */
3365
3366 else
3367 {
3368 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3369 for (i = 1; i < repeat_min; i++)
3370 {
3371 uschar *hc;
3372 uschar *this_hwm = cd->hwm;
3373 memcpy(code, previous, len);
3374 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3375 {
3376 PUT(cd->hwm, 0, GET(hc, 0) + len);
3377 cd->hwm += LINK_SIZE;
3378 }
3379 save_hwm = this_hwm;
3380 code += len;
3381 }
3382 }
3383 }
3384
3385 if (repeat_max > 0) repeat_max -= repeat_min;
3386 }
3387
3388 /* This code is common to both the zero and non-zero minimum cases. If
3389 the maximum is limited, it replicates the group in a nested fashion,
3390 remembering the bracket starts on a stack. In the case of a zero minimum,
3391 the first one was set up above. In all cases the repeat_max now specifies
3392 the number of additional copies needed. Again, we must remember to
3393 replicate entries on the forward reference list. */
3394
3395 if (repeat_max >= 0)
3396 {
3397 /* In the pre-compile phase, we don't actually do the replication. We
3398 just adjust the length as if we had. For each repetition we must add 1
3399 to the length for BRAZERO and for all but the last repetition we must
3400 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3401
3402 if (lengthptr != NULL && repeat_max > 0)
3403 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3404 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3405
3406 /* This is compiling for real */
3407
3408 else for (i = repeat_max - 1; i >= 0; i--)
3409 {
3410 uschar *hc;
3411 uschar *this_hwm = cd->hwm;
3412
3413 *code++ = OP_BRAZERO + repeat_type;
3414
3415 /* All but the final copy start a new nesting, maintaining the
3416 chain of brackets outstanding. */
3417
3418 if (i != 0)
3419 {
3420 int offset;
3421 *code++ = OP_BRA;
3422 offset = (bralink == NULL)? 0 : code - bralink;
3423 bralink = code;
3424 PUTINC(code, 0, offset);
3425 }
3426
3427 memcpy(code, previous, len);
3428 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3429 {
3430 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3431 cd->hwm += LINK_SIZE;
3432 }
3433 save_hwm = this_hwm;
3434 code += len;
3435 }
3436
3437 /* Now chain through the pending brackets, and fill in their length
3438 fields (which are holding the chain links pro tem). */
3439
3440 while (bralink != NULL)
3441 {
3442 int oldlinkoffset;
3443 int offset = code - bralink + 1;
3444 uschar *bra = code - offset;
3445 oldlinkoffset = GET(bra, 1);
3446 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3447 *code++ = OP_KET;
3448 PUTINC(code, 0, offset);
3449 PUT(bra, 1, offset);
3450 }
3451 }
3452
3453 /* If the maximum is unlimited, set a repeater in the final copy. We
3454 can't just offset backwards from the current code point, because we
3455 don't know if there's been an options resetting after the ket. The
3456 correct offset was computed above.
3457
3458 Then, when we are doing the actual compile phase, check to see whether
3459 this group is a non-atomic one that could match an empty string. If so,
3460 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3461 that runtime checking can be done. [This check is also applied to
3462 atomic groups at runtime, but in a different way.] */
3463
3464 else
3465 {
3466 uschar *ketcode = code - ketoffset;
3467 uschar *bracode = ketcode - GET(ketcode, 1);
3468 *ketcode = OP_KETRMAX + repeat_type;
3469 if (lengthptr == NULL && *bracode != OP_ONCE)
3470 {
3471 uschar *scode = bracode;
3472 do
3473 {
3474 if (could_be_empty_branch(scode, ketcode, utf8))
3475 {
3476 *bracode += OP_SBRA - OP_BRA;
3477 break;
3478 }
3479 scode += GET(scode, 1);
3480 }
3481 while (*scode == OP_ALT);
3482 }
3483 }
3484 }
3485
3486 /* Else there's some kind of shambles */
3487
3488 else
3489 {
3490 *errorcodeptr = ERR11;
3491 goto FAILED;
3492 }
3493
3494 /* If the character following a repeat is '+', or if certain optimization
3495 tests above succeeded, possessive_quantifier is TRUE. For some of the
3496 simpler opcodes, there is a special alternative opcode for this. For
3497 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3498 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3499 but the special opcodes can optimize it a bit. The repeated item starts at
3500 tempcode, not at previous, which might be the first part of a string whose
3501 (former) last char we repeated.
3502
3503 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3504 an 'upto' may follow. We skip over an 'exact' item, and then test the
3505 length of what remains before proceeding. */
3506
3507 if (possessive_quantifier)
3508 {
3509 int len;
3510 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3511 *tempcode == OP_NOTEXACT)
3512 tempcode += _pcre_OP_lengths[*tempcode];
3513 len = code - tempcode;
3514 if (len > 0) switch (*tempcode)
3515 {
3516 case OP_STAR: *tempcode = OP_POSSTAR; break;
3517 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3518 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3519 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3520
3521 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3522 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3523 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3524 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3525
3526 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3527 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3528 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3529 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3530
3531 default:
3532 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3533 code += 1 + LINK_SIZE;
3534 len += 1 + LINK_SIZE;
3535 tempcode[0] = OP_ONCE;
3536 *code++ = OP_KET;
3537 PUTINC(code, 0, len);
3538 PUT(tempcode, 1, len);
3539 break;
3540 }
3541 }
3542
3543 /* In all cases we no longer have a previous item. We also set the
3544 "follows varying string" flag for subsequently encountered reqbytes if
3545 it isn't already set and we have just passed a varying length item. */
3546
3547 END_REPEAT:
3548 previous = NULL;
3549 cd->req_varyopt |= reqvary;
3550 break;
3551
3552
3553 /* ===================================================================*/
3554 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3555 lookbehind or option setting or condition or all the other extended
3556 parenthesis forms. First deal with the specials; all are introduced by ?,
3557 and the appearance of any of them means that this is not a capturing
3558 group. */
3559
3560 case '(':
3561 newoptions = options;
3562 skipbytes = 0;
3563 bravalue = OP_CBRA;
3564 save_hwm = cd->hwm;
3565
3566 if (*(++ptr) == '?')
3567 {
3568 int i, set, unset, namelen;
3569 int *optset;
3570 const uschar *name;
3571 uschar *slot;
3572
3573 switch (*(++ptr))
3574 {
3575 case '#': /* Comment; skip to ket */
3576 ptr++;
3577 while (*ptr != 0 && *ptr != ')') ptr++;
3578 if (*ptr == 0)
3579 {
3580 *errorcodeptr = ERR18;
3581 goto FAILED;
3582 }
3583 continue;
3584
3585
3586 /* ------------------------------------------------------------ */
3587 case ':': /* Non-capturing bracket */
3588 bravalue = OP_BRA;
3589 ptr++;
3590 break;
3591
3592
3593 /* ------------------------------------------------------------ */
3594 case '(':
3595 bravalue = OP_COND; /* Conditional group */
3596
3597 /* A condition can be an assertion, a number (referring to a numbered
3598 group), a name (referring to a named group), or 'R', referring to
3599 recursion. R<digits> and R&name are also permitted for recursion tests.
3600
3601 There are several syntaxes for testing a named group: (?(name)) is used
3602 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3603
3604 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3605 be the recursive thing or the name 'R' (and similarly for 'R' followed
3606 by digits), and (b) a number could be a name that consists of digits.
3607 In both cases, we look for a name first; if not found, we try the other
3608 cases. */
3609
3610 /* For conditions that are assertions, check the syntax, and then exit
3611 the switch. This will take control down to where bracketed groups,
3612 including assertions, are processed. */
3613
3614 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3615 break;
3616
3617 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3618 below), and all need to skip 3 bytes at the start of the group. */
3619
3620 code[1+LINK_SIZE] = OP_CREF;
3621 skipbytes = 3;
3622
3623 /* Check for a test for recursion in a named group. */
3624
3625 if (ptr[1] == 'R' && ptr[2] == '&')
3626 {
3627 terminator = -1;
3628 ptr += 2;
3629 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3630 }
3631
3632 /* Check for a test for a named group's having been set, using the Perl
3633 syntax (?(<name>) or (?('name') */
3634
3635 else if (ptr[1] == '<')
3636 {
3637 terminator = '>';
3638 ptr++;
3639 }
3640 else if (ptr[1] == '\'')
3641 {
3642 terminator = '\'';
3643 ptr++;
3644 }
3645 else terminator = 0;
3646
3647 /* We now expect to read a name; anything else is an error */
3648
3649 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3650 {
3651 ptr += 1; /* To get the right offset */
3652 *errorcodeptr = ERR28;
3653 goto FAILED;
3654 }
3655
3656 /* Read the name, but also get it as a number if it's all digits */
3657
3658 recno = 0;
3659 name = ++ptr;
3660 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3661 {
3662 if (recno >= 0)
3663 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3664 recno * 10 + *ptr - '0' : -1;
3665 ptr++;
3666 }
3667 namelen = ptr - name;
3668
3669 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3670 {
3671 ptr--; /* Error offset */
3672 *errorcodeptr = ERR26;
3673 goto FAILED;
3674 }
3675
3676 /* Do no further checking in the pre-compile phase. */
3677
3678 if (lengthptr != NULL) break;
3679
3680 /* In the real compile we do the work of looking for the actual
3681 reference. */
3682
3683 slot = cd->name_table;
3684 for (i = 0; i < cd->names_found; i++)
3685 {
3686 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3687 slot += cd->name_entry_size;
3688 }
3689
3690 /* Found a previous named subpattern */
3691
3692 if (i < cd->names_found)
3693 {
3694 recno = GET2(slot, 0);
3695 PUT2(code, 2+LINK_SIZE, recno);
3696 }
3697
3698 /* Search the pattern for a forward reference */
3699
3700 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3701 (options & PCRE_EXTENDED) != 0)) > 0)
3702 {
3703 PUT2(code, 2+LINK_SIZE, i);
3704 }
3705
3706 /* If terminator == 0 it means that the name followed directly after
3707 the opening parenthesis [e.g. (?(abc)...] and in this case there are
3708 some further alternatives to try. For the cases where terminator != 0
3709 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3710 now checked all the possibilities, so give an error. */
3711
3712 else if (terminator != 0)
3713 {
3714 *errorcodeptr = ERR15;
3715 goto FAILED;
3716 }
3717
3718 /* Check for (?(R) for recursion. Allow digits after R to specify a
3719 specific group number. */
3720
3721 else if (*name == 'R')
3722 {
3723 recno = 0;
3724 for (i = 1; i < namelen; i++)
3725 {
3726 if ((digitab[name[i]] & ctype_digit) == 0)
3727 {
3728 *errorcodeptr = ERR15;
3729 goto FAILED;
3730 }
3731 recno = recno * 10 + name[i] - '0';
3732 }
3733 if (recno == 0) recno = RREF_ANY;
3734 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3735 PUT2(code, 2+LINK_SIZE, recno);
3736 }
3737
3738 /* Similarly, check for the (?(DEFINE) "condition", which is always
3739 false. */
3740
3741 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3742 {
3743 code[1+LINK_SIZE] = OP_DEF;
3744 skipbytes = 1;
3745 }
3746
3747 /* Check for the "name" actually being a subpattern number. */
3748
3749 else if (recno > 0)
3750 {
3751 PUT2(code, 2+LINK_SIZE, recno);
3752 }
3753
3754 /* Either an unidentified subpattern, or a reference to (?(0) */
3755
3756 else
3757 {
3758 *errorcodeptr = (recno == 0)? ERR35: ERR15;
3759 goto FAILED;
3760 }
3761 break;
3762
3763
3764 /* ------------------------------------------------------------ */
3765 case '=': /* Positive lookahead */
3766 bravalue = OP_ASSERT;
3767 ptr++;
3768 break;
3769
3770
3771 /* ------------------------------------------------------------ */
3772 case '!': /* Negative lookahead */
3773 bravalue = OP_ASSERT_NOT;
3774 ptr++;
3775 break;
3776
3777
3778 /* ------------------------------------------------------------ */
3779 case '<': /* Lookbehind or named define */
3780 switch (ptr[1])
3781 {
3782 case '=': /* Positive lookbehind */
3783 bravalue = OP_ASSERTBACK;
3784 ptr += 2;
3785 break;
3786
3787 case '!': /* Negative lookbehind */
3788 bravalue = OP_ASSERTBACK_NOT;
3789 ptr += 2;
3790 break;
3791
3792 default: /* Could be name define, else bad */
3793 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3794 ptr++; /* Correct offset for error */
3795 *errorcodeptr = ERR24;
3796 goto FAILED;
3797 }
3798 break;
3799
3800
3801 /* ------------------------------------------------------------ */
3802 case '>': /* One-time brackets */
3803 bravalue = OP_ONCE;
3804 ptr++;
3805 break;
3806
3807
3808 /* ------------------------------------------------------------ */
3809 case 'C': /* Callout - may be followed by digits; */
3810 previous_callout = code; /* Save for later completion */
3811 after_manual_callout = 1; /* Skip one item before completing */
3812 *code++ = OP_CALLOUT;
3813 {
3814 int n = 0;
3815 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3816 n = n * 10 + *ptr - '0';
3817 if (*ptr != ')')
3818 {
3819 *errorcodeptr = ERR39;
3820 goto FAILED;
3821 }
3822 if (n > 255)
3823 {
3824 *errorcodeptr = ERR38;
3825 goto FAILED;
3826 }
3827 *code++ = n;
3828 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3829 PUT(code, LINK_SIZE, 0); /* Default length */
3830 code += 2 * LINK_SIZE;
3831 }
3832 previous = NULL;
3833 continue;
3834
3835
3836 /* ------------------------------------------------------------ */
3837 case 'P': /* Python-style named subpattern handling */
3838 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3839 {
3840 is_recurse = *ptr == '>';
3841 terminator = ')';
3842 goto NAMED_REF_OR_RECURSE;
3843 }
3844 else if (*ptr != '<') /* Test for Python-style definition */
3845 {
3846 *errorcodeptr = ERR41;
3847 goto FAILED;
3848 }
3849 /* Fall through to handle (?P< as (?< is handled */
3850
3851
3852 /* ------------------------------------------------------------ */
3853 DEFINE_NAME: /* Come here from (?< handling */
3854 case '\'':
3855 {
3856 terminator = (*ptr == '<')? '>' : '\'';
3857 name = ++ptr;
3858
3859 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3860 namelen = ptr - name;
3861
3862 /* In the pre-compile phase, just do a syntax check. */
3863
3864 if (lengthptr != NULL)
3865 {
3866 if (*ptr != terminator)
3867 {
3868 *errorcodeptr = ERR42;
3869 goto FAILED;
3870 }
3871 if (cd->names_found >= MAX_NAME_COUNT)
3872 {
3873 *errorcodeptr = ERR49;
3874 goto FAILED;
3875 }
3876 if (namelen + 3 > cd->name_entry_size)
3877 {
3878 cd->name_entry_size = namelen + 3;
3879 if (namelen > MAX_NAME_SIZE)
3880 {
3881 *errorcodeptr = ERR48;
3882 goto FAILED;
3883 }
3884 }
3885 }
3886
3887 /* In the real compile, create the entry in the table */
3888
3889 else
3890 {
3891 slot = cd->name_table;
3892 for (i = 0; i < cd->names_found; i++)
3893 {
3894 int crc = memcmp(name, slot+2, namelen);
3895 if (crc == 0)
3896 {
3897 if (slot[2+namelen] == 0)
3898 {
3899 if ((options & PCRE_DUPNAMES) == 0)
3900 {
3901 *errorcodeptr = ERR43;
3902 goto FAILED;
3903 }
3904 }
3905 else crc = -1; /* Current name is substring */
3906 }
3907 if (crc < 0)
3908 {
3909 memmove(slot + cd->name_entry_size, slot,
3910 (cd->names_found - i) * cd->name_entry_size);
3911 break;
3912 }
3913 slot += cd->name_entry_size;
3914 }
3915
3916 PUT2(slot, 0, cd->bracount + 1);
3917 memcpy(slot + 2, name, namelen);
3918 slot[2+namelen] = 0;
3919 }
3920 }
3921
3922 /* In both cases, count the number of names we've encountered. */
3923
3924 ptr++; /* Move past > or ' */
3925 cd->names_found++;
3926 goto NUMBERED_GROUP;
3927
3928
3929 /* ------------------------------------------------------------ */
3930 case '&': /* Perl recursion/subroutine syntax */
3931 terminator = ')';
3932 is_recurse = TRUE;
3933 /* Fall through */
3934
3935 /* We come here from the Python syntax above that handles both
3936 references (?P=name) and recursion (?P>name), as well as falling
3937 through from the Perl recursion syntax (?&name). */
3938
3939 NAMED_REF_OR_RECURSE:
3940 name = ++ptr;
3941 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3942 namelen = ptr - name;
3943
3944 /* In the pre-compile phase, do a syntax check and set a dummy
3945 reference number. */
3946
3947 if (lengthptr != NULL)
3948 {
3949 if (*ptr != terminator)
3950 {
3951 *errorcodeptr = ERR42;
3952 goto FAILED;
3953 }
3954 if (namelen > MAX_NAME_SIZE)
3955 {
3956 *errorcodeptr = ERR48;
3957 goto FAILED;
3958 }
3959 recno = 0;
3960 }
3961
3962 /* In the real compile, seek the name in the table */
3963
3964 else
3965 {
3966 slot = cd->name_table;
3967 for (i = 0; i < cd->names_found; i++)
3968 {
3969 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3970 slot += cd->name_entry_size;
3971 }
3972
3973 if (i < cd->names_found) /* Back reference */
3974 {
3975 recno = GET2(slot, 0);
3976 }
3977 else if ((recno = /* Forward back reference */
3978 find_parens(ptr, cd->bracount, name, namelen,
3979 (options & PCRE_EXTENDED) != 0)) <= 0)
3980 {
3981 *errorcodeptr = ERR15;
3982 goto FAILED;
3983 }
3984 }
3985
3986 /* In both phases, we can now go to the code that handles numerical
3987 recursion or backreferences. */
3988
3989 if (is_recurse) goto HANDLE_RECURSION;
3990 else goto HANDLE_REFERENCE;
3991
3992
3993 /* ------------------------------------------------------------ */
3994 case 'R': /* Recursion */
3995 ptr++; /* Same as (?0) */
3996 /* Fall through */
3997
3998
3999 /* ------------------------------------------------------------ */
4000 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4001 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4002 {
4003 const uschar *called;
4004 recno = 0;
4005 while((digitab[*ptr] & ctype_digit) != 0)
4006 recno = recno * 10 + *ptr++ - '0';
4007 if (*ptr != ')')
4008 {
4009 *errorcodeptr = ERR29;
4010 goto FAILED;
4011 }
4012
4013 /* Come here from code above that handles a named recursion */
4014
4015 HANDLE_RECURSION:
4016
4017 previous = code;
4018 called = cd->start_code;
4019
4020 /* When we are actually compiling, find the bracket that is being
4021 referenced. Temporarily end the regex in case it doesn't exist before
4022 this point. If we end up with a forward reference, first check that
4023 the bracket does occur later so we can give the error (and position)
4024 now. Then remember this forward reference in the workspace so it can
4025 be filled in at the end. */
4026
4027 if (lengthptr == NULL)
4028 {
4029 *code = OP_END;
4030 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4031
4032 /* Forward reference */
4033
4034 if (called == NULL)
4035 {
4036 if (find_parens(ptr, cd->bracount, NULL, recno,
4037 (options & PCRE_EXTENDED) != 0) < 0)
4038 {
4039 *errorcodeptr = ERR15;
4040 goto FAILED;
4041 }
4042 called = cd->start_code + recno;
4043 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4044 }
4045
4046 /* If not a forward reference, and the subpattern is still open,
4047 this is a recursive call. We check to see if this is a left
4048 recursion that could loop for ever, and diagnose that case. */
4049
4050 else if (GET(called, 1) == 0 &&
4051 could_be_empty(called, code, bcptr, utf8))
4052 {
4053 *errorcodeptr = ERR40;
4054 goto FAILED;
4055 }
4056 }
4057
4058 /* Insert the recursion/subroutine item, automatically wrapped inside
4059 "once" brackets. Set up a "previous group" length so that a
4060 subsequent quantifier will work. */
4061
4062 *code = OP_ONCE;
4063 PUT(code, 1, 2 + 2*LINK_SIZE);
4064 code += 1 + LINK_SIZE;
4065
4066 *code = OP_RECURSE;
4067 PUT(code, 1, called - cd->start_code);
4068 code += 1 + LINK_SIZE;
4069
4070 *code = OP_KET;
4071 PUT(code, 1, 2 + 2*LINK_SIZE);
4072 code += 1 + LINK_SIZE;
4073
4074 length_prevgroup = 3 + 3*LINK_SIZE;
4075 }
4076
4077 /* Can't determine a first byte now */
4078
4079 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4080 continue;
4081
4082
4083 /* ------------------------------------------------------------ */
4084 default: /* Other characters: check option setting */
4085 set = unset = 0;
4086 optset = &set;
4087
4088 while (*ptr != ')' && *ptr != ':')
4089 {
4090 switch (*ptr++)
4091 {
4092 case '-': optset = &unset; break;
4093
4094 case 'J': /* Record that it changed in the external options */
4095 *optset |= PCRE_DUPNAMES;
4096 cd->external_options |= PCRE_JCHANGED;
4097 break;
4098
4099 case 'i': *optset |= PCRE_CASELESS; break;
4100 case 'm': *optset |= PCRE_MULTILINE; break;
4101 case 's': *optset |= PCRE_DOTALL; break;
4102 case 'x': *optset |= PCRE_EXTENDED; break;
4103 case 'U': *optset |= PCRE_UNGREEDY; break;
4104 case 'X': *optset |= PCRE_EXTRA; break;
4105
4106 default: *errorcodeptr = ERR12;
4107 ptr--; /* Correct the offset */
4108 goto FAILED;
4109 }
4110 }
4111
4112 /* Set up the changed option bits, but don't change anything yet. */
4113
4114 newoptions = (options | set) & (~unset);
4115
4116 /* If the options ended with ')' this is not the start of a nested
4117 group with option changes, so the options change at this level. If this
4118 item is right at the start of the pattern, the options can be
4119 abstracted and made external in the pre-compile phase, and ignored in
4120 the compile phase. This can be helpful when matching -- for instance in
4121 caseless checking of required bytes.
4122
4123 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4124 definitely *not* at the start of the pattern because something has been
4125 compiled. In the pre-compile phase, however, the code pointer can have
4126 that value after the start, because it gets reset as code is discarded
4127 during the pre-compile. However, this can happen only at top level - if
4128 we are within parentheses, the starting BRA will still be present. At
4129 any parenthesis level, the length value can be used to test if anything
4130 has been compiled at that level. Thus, a test for both these conditions
4131 is necessary to ensure we correctly detect the start of the pattern in
4132 both phases.
4133
4134 If we are not at the pattern start, compile code to change the ims
4135 options if this setting actually changes any of them. We also pass the
4136 new setting back so that it can be put at the start of any following
4137 branches, and when this group ends (if we are in a group), a resetting
4138 item can be compiled. */
4139
4140 if (*ptr == ')')
4141 {
4142 if (code == cd->start_code + 1 + LINK_SIZE &&
4143 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4144 {
4145 cd->external_options = newoptions;
4146 options = newoptions;
4147 }
4148 else
4149 {
4150 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4151 {
4152 *code++ = OP_OPT;
4153 *code++ = newoptions & PCRE_IMS;
4154 }
4155
4156 /* Change options at this level, and pass them back for use
4157 in subsequent branches. Reset the greedy defaults and the case
4158 value for firstbyte and reqbyte. */
4159
4160 *optionsptr = options = newoptions;
4161 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4162 greedy_non_default = greedy_default ^ 1;
4163 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4164 }
4165
4166 previous = NULL; /* This item can't be repeated */
4167 continue; /* It is complete */
4168 }
4169
4170 /* If the options ended with ':' we are heading into a nested group
4171 with possible change of options. Such groups are non-capturing and are
4172 not assertions of any kind. All we need to do is skip over the ':';
4173 the newoptions value is handled below. */
4174
4175 bravalue = OP_BRA;
4176 ptr++;
4177 } /* End of switch for character following (? */
4178 } /* End of (? handling */
4179
4180 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4181 all unadorned brackets become non-capturing and behave like (?:...)
4182 brackets. */
4183
4184 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4185 {
4186 bravalue = OP_BRA;
4187 }
4188
4189 /* Else we have a capturing group. */
4190
4191 else
4192 {
4193 NUMBERED_GROUP:
4194 cd->bracount += 1;
4195 PUT2(code, 1+LINK_SIZE, cd->bracount);
4196 skipbytes = 2;
4197 }
4198
4199 /* Process nested bracketed regex. Assertions may not be repeated, but
4200 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4201 non-register variable in order to be able to pass its address because some
4202 compilers complain otherwise. Pass in a new setting for the ims options if
4203 they have changed. */
4204
4205 previous = (bravalue >= OP_ONCE)? code : NULL;
4206 *code = bravalue;
4207 tempcode = code;
4208 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4209 length_prevgroup = 0; /* Initialize for pre-compile phase */
4210
4211 if (!compile_regex(
4212 newoptions, /* The complete new option state */
4213 options & PCRE_IMS, /* The previous ims option state */
4214 &tempcode, /* Where to put code (updated) */
4215 &ptr, /* Input pointer (updated) */
4216 errorcodeptr, /* Where to put an error message */
4217 (bravalue == OP_ASSERTBACK ||
4218 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4219 skipbytes, /* Skip over bracket number */
4220 &subfirstbyte, /* For possible first char */
4221 &subreqbyte, /* For possible last char */
4222 bcptr, /* Current branch chain */
4223 cd, /* Tables block */
4224 (lengthptr == NULL)? NULL : /* Actual compile phase */
4225 &length_prevgroup /* Pre-compile phase */
4226 ))
4227 goto FAILED;
4228
4229 /* At the end of compiling, code is still pointing to the start of the
4230 group, while tempcode has been updated to point past the end of the group
4231 and any option resetting that may follow it. The pattern pointer (ptr)
4232 is on the bracket. */
4233
4234 /* If this is a conditional bracket, check that there are no more than
4235 two branches in the group, or just one if it's a DEFINE group. */
4236
4237 if (bravalue == OP_COND)
4238 {
4239 uschar *tc = code;
4240 int condcount = 0;
4241
4242 do {
4243 condcount++;
4244 tc += GET(tc,1);
4245 }
4246 while (*tc != OP_KET);
4247
4248 /* A DEFINE group is never obeyed inline (the "condition" is always
4249 false). It must have only one branch. */
4250
4251 if (code[LINK_SIZE+1] == OP_DEF)
4252 {
4253 if (condcount > 1)
4254 {
4255 *errorcodeptr = ERR54;
4256 goto FAILED;
4257 }
4258 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4259 }
4260
4261 /* A "normal" conditional group. If there is just one branch, we must not
4262 make use of its firstbyte or reqbyte, because this is equivalent to an
4263 empty second branch. */
4264
4265 else
4266 {
4267 if (condcount > 2)
4268 {
4269 *errorcodeptr = ERR27;
4270 goto FAILED;
4271 }
4272 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4273 }
4274 }
4275
4276 /* Error if hit end of pattern */
4277
4278 if (*ptr != ')')
4279 {
4280 *errorcodeptr = ERR14;
4281 goto FAILED;
4282 }
4283
4284 /* In the pre-compile phase, update the length by the length of the nested
4285 group, less the brackets at either end. Then reduce the compiled code to
4286 just the brackets so that it doesn't use much memory if it is duplicated by
4287 a quantifier. */
4288
4289 if (lengthptr != NULL)
4290 {
4291 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4292 code++;
4293 PUTINC(code, 0, 1 + LINK_SIZE);
4294 *code++ = OP_KET;
4295 PUTINC(code, 0, 1 + LINK_SIZE);
4296 }
4297
4298 /* Otherwise update the main code pointer to the end of the group. */
4299
4300 else code = tempcode;
4301
4302 /* For a DEFINE group, required and first character settings are not
4303 relevant. */
4304
4305 if (bravalue == OP_DEF) break;
4306
4307 /* Handle updating of the required and first characters for other types of
4308 group. Update for normal brackets of all kinds, and conditions with two
4309 branches (see code above). If the bracket is followed by a quantifier with
4310 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4311 zerofirstbyte outside the main loop so that they can be accessed for the
4312 back off. */
4313
4314 zeroreqbyte = reqbyte;
4315 zerofirstbyte = firstbyte;
4316 groupsetfirstbyte = FALSE;
4317
4318 if (bravalue >= OP_ONCE)
4319 {
4320 /* If we have not yet set a firstbyte in this branch, take it from the
4321 subpattern, remembering that it was set here so that a repeat of more
4322 than one can replicate it as reqbyte if necessary. If the subpattern has
4323 no firstbyte, set "none" for the whole branch. In both cases, a zero
4324 repeat forces firstbyte to "none". */
4325
4326 if (firstbyte == REQ_UNSET)
4327 {
4328 if (subfirstbyte >= 0)
4329 {
4330 firstbyte = subfirstbyte;
4331 groupsetfirstbyte = TRUE;
4332 }
4333 else firstbyte = REQ_NONE;
4334 zerofirstbyte = REQ_NONE;
4335 }
4336
4337 /* If firstbyte was previously set, convert the subpattern's firstbyte
4338 into reqbyte if there wasn't one, using the vary flag that was in
4339 existence beforehand. */
4340
4341 else if (subfirstbyte >= 0 && subreqbyte < 0)
4342 subreqbyte = subfirstbyte | tempreqvary;
4343
4344 /* If the subpattern set a required byte (or set a first byte that isn't
4345 really the first byte - see above), set it. */
4346
4347 if (subreqbyte >= 0) reqbyte = subreqbyte;
4348 }
4349
4350 /* For a forward assertion, we take the reqbyte, if set. This can be
4351 helpful if the pattern that follows the assertion doesn't set a different
4352 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4353 for an assertion, however because it leads to incorrect effect for patterns
4354 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4355 of a firstbyte. This is overcome by a scan at the end if there's no
4356 firstbyte, looking for an asserted first char. */
4357
4358 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4359 break; /* End of processing '(' */
4360
4361
4362 /* ===================================================================*/
4363 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4364 are arranged to be the negation of the corresponding OP_values. For the
4365 back references, the values are ESC_REF plus the reference number. Only
4366 back references and those types that consume a character may be repeated.
4367 We can test for values between ESC_b and ESC_Z for the latter; this may
4368 have to change if any new ones are ever created. */
4369
4370 case '\\':
4371 tempptr = ptr;
4372 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4373 if (*errorcodeptr != 0) goto FAILED;
4374
4375 if (c < 0)
4376 {
4377 if (-c == ESC_Q) /* Handle start of quoted string */
4378 {
4379 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4380 else inescq = TRUE;
4381 continue;
4382 }
4383
4384 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4385
4386 /* For metasequences that actually match a character, we disable the
4387 setting of a first character if it hasn't already been set. */
4388
4389 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4390 firstbyte = REQ_NONE;
4391
4392 /* Set values to reset to if this is followed by a zero repeat. */
4393
4394 zerofirstbyte = firstbyte;
4395 zeroreqbyte = reqbyte;
4396
4397 /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4398
4399 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4400 {
4401 is_recurse = FALSE;
4402 terminator = (*(++ptr) == '<')? '>' : '\'';
4403 goto NAMED_REF_OR_RECURSE;
4404 }
4405
4406 /* Back references are handled specially; must disable firstbyte if
4407 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4408 ':' later. */
4409
4410 if (-c >= ESC_REF)
4411 {
4412 recno = -c - ESC_REF;
4413
4414 HANDLE_REFERENCE: /* Come here from named backref handling */
4415 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4416 previous = code;
4417 *code++ = OP_REF;
4418 PUT2INC(code, 0, recno);
4419 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4420 if (recno > cd->top_backref) cd->top_backref = recno;
4421 }
4422
4423 /* So are Unicode property matches, if supported. */
4424
4425 #ifdef SUPPORT_UCP
4426 else if (-c == ESC_P || -c == ESC_p)
4427 {
4428 BOOL negated;
4429 int pdata;
4430 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4431 if (ptype < 0) goto FAILED;
4432 previous = code;
4433 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4434 *code++ = ptype;
4435 *code++ = pdata;
4436 }
4437 #else
4438
4439 /* If Unicode properties are not supported, \X, \P, and \p are not
4440 allowed. */
4441
4442 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4443 {
4444 *errorcodeptr = ERR45;
4445 goto FAILED;
4446 }
4447 #endif
4448
4449 /* For the rest (including \X when Unicode properties are supported), we
4450 can obtain the OP value by negating the escape value. */
4451
4452 else
4453 {
4454 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4455 *code++ = -c;
4456 }
4457 continue;
4458 }
4459
4460 /* We have a data character whose value is in c. In UTF-8 mode it may have
4461 a value > 127. We set its representation in the length/buffer, and then
4462 handle it as a data character. */
4463
4464 #ifdef SUPPORT_UTF8
4465 if (utf8 && c > 127)
4466 mclength = _pcre_ord2utf8(c, mcbuffer);
4467 else
4468 #endif
4469
4470 {
4471 mcbuffer[0] = c;
4472 mclength = 1;
4473 }
4474 goto ONE_CHAR;
4475
4476
4477 /* ===================================================================*/
4478 /* Handle a literal character. It is guaranteed not to be whitespace or #
4479 when the extended flag is set. If we are in UTF-8 mode, it may be a
4480 multi-byte literal character. */
4481
4482 default:
4483 NORMAL_CHAR:
4484 mclength = 1;
4485 mcbuffer[0] = c;
4486
4487 #ifdef SUPPORT_UTF8
4488 if (utf8 && c >= 0xc0)
4489 {
4490 while ((ptr[1] & 0xc0) == 0x80)
4491 mcbuffer[mclength++] = *(++ptr);
4492 }
4493 #endif
4494
4495 /* At this point we have the character's bytes in mcbuffer, and the length
4496 in mclength. When not in UTF-8 mode, the length is always 1. */
4497
4498 ONE_CHAR:
4499 previous = code;
4500 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4501 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4502
4503 /* Set the first and required bytes appropriately. If no previous first
4504 byte, set it from this character, but revert to none on a zero repeat.
4505 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4506 repeat. */
4507
4508 if (firstbyte == REQ_UNSET)
4509 {
4510 zerofirstbyte = REQ_NONE;
4511 zeroreqbyte = reqbyte;
4512
4513 /* If the character is more than one byte long, we can set firstbyte
4514 only if it is not to be matched caselessly. */
4515
4516 if (mclength == 1 || req_caseopt == 0)
4517 {
4518 firstbyte = mcbuffer[0] | req_caseopt;
4519 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4520 }
4521 else firstbyte = reqbyte = REQ_NONE;
4522 }
4523
4524 /* firstbyte was previously set; we can set reqbyte only if the length is
4525 1 or the matching is caseful. */
4526
4527 else
4528 {
4529 zerofirstbyte = firstbyte;
4530 zeroreqbyte = reqbyte;
4531 if (mclength == 1 || req_caseopt == 0)
4532 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4533 }
4534
4535 break; /* End of literal character handling */
4536 }
4537 } /* end of big loop */
4538
4539
4540 /* Control never reaches here by falling through, only by a goto for all the
4541 error states. Pass back the position in the pattern so that it can be displayed
4542 to the user for diagnosing the error. */
4543
4544 FAILED:
4545 *ptrptr = ptr;
4546 return FALSE;
4547 }
4548
4549
4550
4551
4552 /*************************************************
4553 * Compile sequence of alternatives *
4554 *************************************************/
4555
4556 /* On entry, ptr is pointing past the bracket character, but on return it
4557 points to the closing bracket, or vertical bar, or end of string. The code
4558 variable is pointing at the byte into which the BRA operator has been stored.
4559 If the ims options are changed at the start (for a (?ims: group) or during any
4560 branch, we need to insert an OP_OPT item at the start of every following branch
4561 to ensure they get set correctly at run time, and also pass the new options
4562 into every subsequent branch compile.
4563
4564 This function is used during the pre-compile phase when we are trying to find
4565 out the amount of memory needed, as well as during the real compile phase. The
4566 value of lengthptr distinguishes the two phases.
4567
4568 Argument:
4569 options option bits, including any changes for this subpattern
4570 oldims previous settings of ims option bits
4571 codeptr -> the address of the current code pointer
4572 ptrptr -> the address of the current pattern pointer
4573 errorcodeptr -> pointer to error code variable
4574 lookbehind TRUE if this is a lookbehind assertion
4575 skipbytes skip this many bytes at start (for brackets and OP_COND)
4576 firstbyteptr place to put the first required character, or a negative number
4577 reqbyteptr place to put the last required character, or a negative number
4578 bcptr pointer to the chain of currently open branches
4579 cd points to the data block with tables pointers etc.
4580 lengthptr NULL during the real compile phase
4581 points to length accumulator during pre-compile phase
4582
4583 Returns: TRUE on success
4584 */
4585
4586 static BOOL
4587 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4588 int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4589 int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4590 {
4591 const uschar *ptr = *ptrptr;
4592 uschar *code = *codeptr;
4593 uschar *last_branch = code;
4594 uschar *start_bracket = code;
4595 uschar *reverse_count = NULL;
4596 int firstbyte, reqbyte;
4597 int branchfirstbyte, branchreqbyte;
4598 int length;
4599 branch_chain bc;
4600
4601 bc.outer = bcptr;
4602 bc.current = code;
4603
4604 firstbyte = reqbyte = REQ_UNSET;
4605
4606 /* Accumulate the length for use in the pre-compile phase. Start with the
4607 length of the BRA and KET and any extra bytes that are required at the
4608 beginning. We accumulate in a local variable to save frequent testing of
4609 lenthptr for NULL. We cannot do this by looking at the value of code at the
4610 start and end of each alternative, because compiled items are discarded during
4611 the pre-compile phase so that the work space is not exceeded. */
4612
4613 length = 2 + 2*LINK_SIZE + skipbytes;
4614
4615 /* WARNING: If the above line is changed for any reason, you must also change
4616 the code that abstracts option settings at the start of the pattern and makes
4617 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4618 pre-compile phase to find out whether anything has yet been compiled or not. */
4619
4620 /* Offset is set zero to mark that this bracket is still open */
4621
4622 PUT(code, 1, 0);
4623 code += 1 + LINK_SIZE + skipbytes;
4624
4625 /* Loop for each alternative branch */
4626
4627 for (;;)
4628 {
4629 /* Handle a change of ims options at the start of the branch */
4630
4631 if ((options & PCRE_IMS) != oldims)
4632 {
4633 *code++ = OP_OPT;
4634 *code++ = options & PCRE_IMS;
4635 length += 2;
4636 }
4637
4638 /* Set up dummy OP_REVERSE if lookbehind assertion */
4639
4640 if (lookbehind)
4641 {
4642 *code++ = OP_REVERSE;
4643 reverse_count = code;
4644 PUTINC(code, 0, 0);
4645 length += 1 + LINK_SIZE;
4646 }
4647
4648 /* Now compile the branch; in the pre-compile phase its length gets added
4649 into the length. */
4650
4651 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4652 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4653 {
4654 *ptrptr = ptr;
4655 return FALSE;
4656 }
4657
4658 /* In the real compile phase, there is some post-processing to be done. */
4659
4660 if (lengthptr == NULL)
4661 {
4662 /* If this is the first branch, the firstbyte and reqbyte values for the
4663 branch become the values for the regex. */
4664
4665 if (*last_branch != OP_ALT)
4666 {
4667 firstbyte = branchfirstbyte;
4668 reqbyte = branchreqbyte;
4669 }
4670
4671 /* If this is not the first branch, the first char and reqbyte have to
4672 match the values from all the previous branches, except that if the
4673 previous value for reqbyte didn't have REQ_VARY set, it can still match,
4674 and we set REQ_VARY for the regex. */
4675
4676 else
4677 {
4678 /* If we previously had a firstbyte, but it doesn't match the new branch,
4679 we have to abandon the firstbyte for the regex, but if there was
4680 previously no reqbyte, it takes on the value of the old firstbyte. */
4681
4682 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4683 {
4684 if (reqbyte < 0) reqbyte = firstbyte;
4685 firstbyte = REQ_NONE;
4686 }
4687
4688 /* If we (now or from before) have no firstbyte, a firstbyte from the
4689 branch becomes a reqbyte if there isn't a branch reqbyte. */
4690
4691 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4692 branchreqbyte = branchfirstbyte;
4693
4694 /* Now ensure that the reqbytes match */
4695
4696 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4697 reqbyte = REQ_NONE;
4698 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4699 }
4700
4701 /* If lookbehind, check that this branch matches a fixed-length string, and
4702 put the length into the OP_REVERSE item. Temporarily mark the end of the
4703 branch with OP_END. */
4704
4705 if (lookbehind)
4706 {
4707 int fixed_length;
4708 *code = OP_END;
4709 fixed_length = find_fixedlength(last_branch, options);
4710 DPRINTF(("fixed length = %d\n", fixed_length));
4711 if (fixed_length < 0)
4712 {
4713 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4714 *ptrptr = ptr;
4715 return FALSE;
4716 }
4717 PUT(reverse_count, 0, fixed_length);
4718 }
4719 }
4720
4721 /* Reached end of expression, either ')' or end of pattern. Go back through
4722 the alternative branches and reverse the chain of offsets, with the field in
4723 the BRA item now becoming an offset to the first alternative. If there are
4724 no alternatives, it points to the end of the group. The length in the
4725 terminating ket is always the length of the whole bracketed item. If any of
4726 the ims options were changed inside the group, compile a resetting op-code
4727 following, except at the very end of the pattern. Return leaving the pointer
4728 at the terminating char. */
4729
4730 if (*ptr != '|')
4731 {
4732 int branch_length = code - last_branch;
4733 do
4734 {
4735 int prev_length = GET(last_branch, 1);
4736 PUT(last_branch, 1, branch_length);
4737 branch_length = prev_length;
4738 last_branch -= branch_length;
4739 }
4740 while (branch_length > 0);
4741
4742 /* Fill in the ket */
4743
4744 *code = OP_KET;
4745 PUT(code, 1, code - start_bracket);
4746 code += 1 + LINK_SIZE;
4747
4748 /* Resetting option if needed */
4749
4750 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4751 {
4752 *code++ = OP_OPT;
4753 *code++ = oldims;
4754 length += 2;
4755 }
4756
4757 /* Set values to pass back */
4758
4759 *codeptr = code;
4760 *ptrptr = ptr;
4761 *firstbyteptr = firstbyte;
4762 *reqbyteptr = reqbyte;
4763 if (lengthptr != NULL) *lengthptr += length;
4764 return TRUE;
4765 }
4766
4767 /* Another branch follows; insert an "or" node. Its length field points back
4768 to the previous branch while the bracket remains open. At the end the chain
4769 is reversed. It's done like this so that the start of the bracket has a
4770 zero offset until it is closed, making it possible to detect recursion. */
4771
4772 *code = OP_ALT;
4773 PUT(code, 1, code - last_branch);
4774 bc.current = last_branch = code;
4775 code += 1 + LINK_SIZE;
4776 ptr++;
4777 length += 1 + LINK_SIZE;
4778 }
4779 /* Control never reaches here */
4780 }
4781
4782
4783
4784
4785 /*************************************************
4786 * Check for anchored expression *
4787 *************************************************/
4788
4789 /* Try to find out if this is an anchored regular expression. Consider each
4790 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4791 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4792 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4793 counts, since OP_CIRC can match in the middle.
4794
4795 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4796 This is the code for \G, which means "match at start of match position, taking
4797 into account the match offset".
4798
4799 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4800 because that will try the rest of the pattern at all possible matching points,
4801 so there is no point trying again.... er ....
4802
4803 .... except when the .* appears inside capturing parentheses, and there is a
4804 subsequent back reference to those parentheses. We haven't enough information
4805 to catch that case precisely.
4806
4807 At first, the best we could do was to detect when .* was in capturing brackets
4808 and the highest back reference was greater than or equal to that level.
4809 However, by keeping a bitmap of the first 31 back references, we can catch some
4810 of the more common cases more precisely.
4811
4812 Arguments:
4813 code points to start of expression (the bracket)
4814 options points to the options setting
4815 bracket_map a bitmap of which brackets we are inside while testing; this
4816 handles up to substring 31; after that we just have to take
4817 the less precise approach
4818 backref_map the back reference bitmap
4819
4820 Returns: TRUE or FALSE
4821 */
4822
4823 static BOOL
4824 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4825 unsigned int backref_map)
4826 {
4827 do {
4828 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4829 options, PCRE_MULTILINE, FALSE);
4830 register int op = *scode;
4831
4832 /* Non-capturing brackets */
4833
4834 if (op == OP_BRA)
4835 {
4836 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4837 }
4838
4839 /* Capturing brackets */
4840
4841 else if (op == OP_CBRA)
4842 {
4843 int n = GET2(scode, 1+LINK_SIZE);
4844 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4845 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4846 }
4847
4848 /* Other brackets */
4849
4850 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4851 {
4852 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4853 }
4854
4855 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4856 are or may be referenced. */
4857
4858 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4859 op == OP_TYPEPOSSTAR) &&
4860 (*options & PCRE_DOTALL) != 0)
4861 {
4862 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4863 }
4864
4865 /* Check for explicit anchoring */
4866
4867 else if (op != OP_SOD && op != OP_SOM &&
4868 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4869 return FALSE;
4870 code += GET(code, 1);
4871 }
4872 while (*code == OP_ALT); /* Loop for each alternative */
4873 return TRUE;
4874 }
4875
4876
4877
4878 /*************************************************
4879 * Check for starting with ^ or .* *
4880 *************************************************/
4881
4882 /* This is called to find out if every branch starts with ^ or .* so that
4883 "first char" processing can be done to speed things up in multiline
4884 matching and for non-DOTALL patterns that start with .* (which must start at
4885 the beginning or after \n). As in the case of is_anchored() (see above), we
4886 have to take account of back references to capturing brackets that contain .*
4887 because in that case we can't make the assumption.
4888
4889 Arguments:
4890 code points to start of expression (the bracket)
4891 bracket_map a bitmap of which brackets we are inside while testing; this
4892 handles up to substring 31; after that we just have to take
4893 the less precise approach
4894 backref_map the back reference bitmap
4895
4896 Returns: TRUE or FALSE
4897 */
4898
4899 static BOOL
4900 is_startline(const uschar *code, unsigned int bracket_map,
4901 unsigned int backref_map)
4902 {
4903 do {
4904 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4905 NULL, 0, FALSE);
4906 register int op = *scode;
4907
4908 /* Non-capturing brackets */
4909
4910 if (op == OP_BRA)
4911 {
4912 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4913 }
4914
4915 /* Capturing brackets */
4916
4917 else if (op == OP_CBRA)
4918 {
4919 int n = GET2(scode, 1+LINK_SIZE);
4920 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4921 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4922 }
4923
4924 /* Other brackets */
4925
4926 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4927 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4928
4929 /* .* means "start at start or after \n" if it isn't in brackets that
4930 may be referenced. */
4931
4932 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
4933 {
4934 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4935 }
4936
4937 /* Check for explicit circumflex */
4938
4939 else if (op != OP_CIRC) return FALSE;
4940
4941 /* Move on to the next alternative */
4942
4943 code += GET(code, 1);
4944 }
4945 while (*code == OP_ALT); /* Loop for each alternative */
4946 return TRUE;
4947 }
4948
4949
4950
4951 /*************************************************
4952 * Check for asserted fixed first char *
4953 *************************************************/
4954
4955 /* During compilation, the "first char" settings from forward assertions are
4956 discarded, because they can cause conflicts with actual literals that follow.
4957 However, if we end up without a first char setting for an unanchored pattern,
4958 it is worth scanning the regex to see if there is an initial asserted first
4959 char. If all branches start with the same asserted char, or with a bracket all
4960 of whose alternatives start with the same asserted char (recurse ad lib), then
4961 we return that char, otherwise -1.
4962
4963 Arguments:
4964 code points to start of expression (the bracket)
4965 options pointer to the options (used to check casing changes)
4966 inassert TRUE if in an assertion
4967
4968 Returns: -1 or the fixed first char
4969 */
4970
4971 static int
4972 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4973 {
4974 register int c = -1;
4975 do {
4976 int d;
4977 const uschar *scode =
4978 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4979 register int op = *scode;
4980
4981 switch(op)
4982 {
4983 default:
4984 return -1;
4985
4986 case OP_BRA:
4987 case OP_CBRA:
4988 case OP_ASSERT:
4989 case OP_ONCE:
4990 case OP_COND:
4991 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4992 return -1;
4993 if (c < 0) c = d; else if (c != d) return -1;
4994 break;
4995
4996 case OP_EXACT: /* Fall through */
4997 scode += 2;
4998
4999 case OP_CHAR:
5000 case OP_CHARNC:
5001 case OP_PLUS:
5002 case OP_MINPLUS:
5003 case OP_POSPLUS:
5004 if (!inassert) return -1;
5005 if (c < 0)
5006 {
5007 c = scode[1];
5008 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5009 }
5010 else if (c != scode[1]) return -1;
5011 break;
5012 }
5013
5014 code += GET(code, 1);
5015 }
5016 while (*code == OP_ALT);
5017 return c;
5018 }
5019
5020
5021
5022 /*************************************************
5023 * Compile a Regular Expression *
5024 *************************************************/
5025
5026 /* This function takes a string and returns a pointer to a block of store
5027 holding a compiled version of the expression. The original API for this
5028 function had no error code return variable; it is retained for backwards
5029 compatibility. The new function is given a new name.
5030
5031 Arguments:
5032 pattern the regular expression
5033 options various option bits
5034 errorcodeptr pointer to error code variable (pcre_compile2() only)
5035 can be NULL if you don't want a code value
5036 errorptr pointer to pointer to error text
5037 erroroffset ptr offset in pattern where error was detected
5038 tables pointer to character tables or NULL
5039
5040 Returns: pointer to compiled data block, or NULL on error,
5041 with errorptr and erroroffset set
5042 */
5043
5044 PCRE_DATA_SCOPE pcre *
5045 pcre_compile(const char *pattern, int options, const char **errorptr,
5046 int *erroroffset, const unsigned char *tables)
5047 {
5048 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5049 }
5050
5051
5052 PCRE_DATA_SCOPE pcre *
5053 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5054 const char **errorptr, int *erroroffset, const unsigned char *tables)
5055 {
5056 real_pcre *re;
5057 int length = 1; /* For final END opcode */
5058 int firstbyte, reqbyte, newline;
5059 int errorcode = 0;
5060 #ifdef SUPPORT_UTF8
5061 BOOL utf8;
5062 #endif
5063 size_t size;
5064 uschar *code;
5065 const uschar *codestart;
5066 const uschar *ptr;
5067 compile_data compile_block;
5068 compile_data *cd = &compile_block;
5069
5070 /* This space is used for "compiling" into during the first phase, when we are
5071 computing the amount of memory that is needed. Compiled items are thrown away
5072 as soon as possible, so that a fairly large buffer should be sufficient for
5073 this purpose. The same space is used in the second phase for remembering where
5074 to fill in forward references to subpatterns. */
5075
5076 uschar cworkspace[COMPILE_WORK_SIZE];
5077
5078
5079 /* Set this early so that early errors get offset 0. */
5080
5081 ptr = (const uschar *)pattern;
5082
5083 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5084 can do is just return NULL, but we can set a code value if there is a code
5085 pointer. */
5086
5087 if (errorptr == NULL)
5088 {
5089 if (errorcodeptr != NULL) *errorcodeptr = 99;
5090 return NULL;
5091 }
5092
5093 *errorptr = NULL;
5094 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5095
5096 /* However, we can give a message for this error */
5097
5098 if (erroroffset == NULL)
5099 {
5100 errorcode = ERR16;
5101 goto PCRE_EARLY_ERROR_RETURN;
5102 }
5103
5104 *erroroffset = 0;
5105
5106 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5107
5108 #ifdef SUPPORT_UTF8
5109 utf8 = (options & PCRE_UTF8) != 0;
5110 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5111 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5112 {
5113 errorcode = ERR44;
5114 goto PCRE_UTF8_ERROR_RETURN;
5115 }
5116 #else
5117 if ((options & PCRE_UTF8) != 0)
5118 {
5119 errorcode = ERR32;
5120 goto PCRE_EARLY_ERROR_RETURN;
5121 }
5122 #endif
5123
5124 if ((options & ~PUBLIC_OPTIONS) != 0)
5125 {
5126 errorcode = ERR17;
5127 goto PCRE_EARLY_ERROR_RETURN;
5128 }
5129
5130 /* Set up pointers to the individual character tables */
5131
5132 if (tables == NULL) tables = _pcre_default_tables;
5133 cd->lcc = tables + lcc_offset;
5134 cd->fcc = tables + fcc_offset;
5135 cd->cbits = tables + cbits_offset;
5136 cd->ctypes = tables + ctypes_offset;
5137
5138 /* Handle different types of newline. The three bits give seven cases. The
5139 current code allows for fixed one- or two-byte sequences, plus "any". */
5140
5141 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5142 {
5143 case 0: newline = NEWLINE; break; /* Compile-time default */
5144 case PCRE_NEWLINE_CR: newline = '\r'; break;
5145 case PCRE_NEWLINE_LF: newline = '\n'; break;
5146 case PCRE_NEWLINE_CR+
5147 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5148 case PCRE_NEWLINE_ANY: newline = -1; break;
5149 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5150 }
5151
5152 if (newline < 0)
5153 {
5154 cd->nltype = NLTYPE_ANY;
5155 }
5156 else
5157 {
5158 cd->nltype = NLTYPE_FIXED;
5159 if (newline > 255)
5160 {
5161 cd->nllen = 2;
5162 cd->nl[0] = (newline >> 8) & 255;
5163 cd->nl[1] = newline & 255;
5164 }
5165 else
5166 {
5167 cd->nllen = 1;
5168 cd->nl[0] = newline;
5169 }
5170 }
5171
5172 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5173 references to help in deciding whether (.*) can be treated as anchored or not.
5174 */
5175
5176 cd->top_backref = 0;
5177 cd->backref_map = 0;
5178
5179 /* Reflect pattern for debugging output */
5180
5181 DPRINTF(("------------------------------------------------------------------\n"));
5182 DPRINTF(("%s\n", pattern));
5183
5184 /* Pretend to compile the pattern while actually just accumulating the length
5185 of memory required. This behaviour is triggered by passing a non-NULL final
5186 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5187 to compile parts of the pattern into; the compiled code is discarded when it is
5188 no longer needed, so hopefully this workspace will never overflow, though there
5189 is a test for its doing so. */
5190
5191 cd->bracount = 0;
5192 cd->names_found = 0;
5193 cd->name_entry_size = 0;
5194 cd->name_table = NULL;
5195 cd->start_workspace = cworkspace;
5196 cd->start_code = cworkspace;
5197 cd->hwm = cworkspace;
5198 cd->start_pattern = (const uschar *)pattern;
5199 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5200 cd->req_varyopt = 0;
5201 cd->nopartial = FALSE;
5202 cd->external_options = options;
5203
5204 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5205 don't need to look at the result of the function here. The initial options have
5206 been put into the cd block so that they can be changed if an option setting is
5207 found within the regex right at the beginning. Bringing initial option settings
5208 outside can help speed up starting point checks. */
5209
5210 code = cworkspace;
5211 *code = OP_BRA;
5212 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5213 &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5214 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5215
5216 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5217 cd->hwm - cworkspace));
5218
5219 if (length > MAX_PATTERN_SIZE)
5220 {
5221 errorcode = ERR20;
5222 goto PCRE_EARLY_ERROR_RETURN;
5223 }
5224
5225 /* Compute the size of data block needed and get it, either from malloc or
5226 externally provided function. Integer overflow should no longer be possible
5227 because nowadays we limit the maximum value of cd->names_found and
5228 cd->name_entry_size. */
5229
5230 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5231 re = (real_pcre *)(pcre_malloc)(size);
5232
5233 if (re == NULL)
5234 {
5235 errorcode = ERR21;
5236 goto PCRE_EARLY_ERROR_RETURN;
5237 }
5238
5239 /* Put in the magic number, and save the sizes, initial options, and character
5240 table pointer. NULL is used for the default character tables. The nullpad field
5241 is at the end; it's there to help in the case when a regex compiled on a system
5242 with 4-byte pointers is run on another with 8-byte pointers. */
5243
5244 re->magic_number = MAGIC_NUMBER;
5245 re->size = size;
5246 re->options = cd->external_options;
5247 re->dummy1 = 0;
5248 re->first_byte = 0;
5249 re->req_byte = 0;
5250 re->name_table_offset = sizeof(real_pcre);
5251 re->name_entry_size = cd->name_entry_size;
5252 re->name_count = cd->names_found;
5253 re->ref_count = 0;
5254 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5255 re->nullpad = NULL;
5256
5257 /* The starting points of the name/number translation table and of the code are
5258 passed around in the compile data block. The start/end pattern and initial
5259 options are already set from the pre-compile phase, as is the name_entry_size
5260 field. Reset the bracket count and the names_found field. Also reset the hwm
5261 field; this time it's used for remembering forward references to subpatterns.
5262 */
5263
5264 cd->bracount = 0;
5265 cd->names_found = 0;
5266 cd->name_table = (uschar *)re + re->name_table_offset;
5267 codestart = cd->name_table + re->name_entry_size * re->name_count;
5268 cd->start_code = codestart;
5269 cd->hwm = cworkspace;
5270 cd->req_varyopt = 0;
5271 cd->nopartial = FALSE;
5272
5273 /* Set up a starting, non-extracting bracket, then compile the expression. On
5274 error, errorcode will be set non-zero, so we don't need to look at the result
5275 of the function here. */
5276
5277 ptr = (const uschar *)pattern;
5278 code = (uschar *)codestart;
5279 *code = OP_BRA;
5280 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5281 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5282 re->top_bracket = cd->bracount;
5283 re->top_backref = cd->top_backref;
5284
5285 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5286
5287 /* If not reached end of pattern on success, there's an excess bracket. */
5288
5289 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5290
5291 /* Fill in the terminating state and check for disastrous overflow, but
5292 if debugging, leave the test till after things are printed out. */
5293
5294 *code++ = OP_END;
5295
5296 #ifndef DEBUG
5297 if (code - codestart > length) errorcode = ERR23;
5298 #endif
5299
5300 /* Fill in any forward references that are required. */
5301
5302 while (errorcode == 0 && cd->hwm > cworkspace)
5303 {
5304 int offset, recno;
5305 const uschar *groupptr;
5306 cd->hwm -= LINK_SIZE;
5307 offset = GET(cd->hwm, 0);
5308 recno = GET(codestart, offset);
5309 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5310 if (groupptr == NULL) errorcode = ERR53;
5311 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5312 }
5313
5314 /* Give an error if there's back reference to a non-existent capturing
5315 subpattern. */
5316
5317 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5318
5319 /* Failed to compile, or error while post-processing */
5320
5321 if (errorcode != 0)
5322 {
5323 (pcre_free)(re);
5324 PCRE_EARLY_ERROR_RETURN:
5325 *erroroffset = ptr - (const uschar *)pattern;
5326 #ifdef SUPPORT_UTF8
5327 PCRE_UTF8_ERROR_RETURN:
5328 #endif
5329 *errorptr = error_texts[errorcode];
5330 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5331 return NULL;
5332 }
5333
5334 /* If the anchored option was not passed, set the flag if we can determine that
5335 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5336 as starting with .* when DOTALL is set).
5337
5338 Otherwise, if we know what the first byte has to be, save it, because that
5339 speeds up unanchored matches no end. If not, see if we can set the
5340 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5341 start with ^. and also when all branches start with .* for non-DOTALL matches.
5342 */
5343
5344 if ((re->options & PCRE_ANCHORED) == 0)
5345 {
5346 int temp_options = re->options; /* May get changed during these scans */
5347 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5348 re->options |= PCRE_ANCHORED;
5349 else
5350 {
5351 if (firstbyte < 0)
5352 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5353 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5354 {
5355 int ch = firstbyte & 255;
5356 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5357 cd->fcc[ch] == ch)? ch : firstbyte;
5358 re->options |= PCRE_FIRSTSET;
5359 }
5360 else if (is_startline(codestart, 0, cd->backref_map))
5361 re->options |= PCRE_STARTLINE;
5362 }
5363 }
5364
5365 /* For an anchored pattern, we use the "required byte" only if it follows a
5366 variable length item in the regex. Remove the caseless flag for non-caseable
5367 bytes. */
5368
5369 if (reqbyte >= 0 &&
5370 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5371 {
5372 int ch = reqbyte & 255;
5373 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5374 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5375 re->options |= PCRE_REQCHSET;
5376 }
5377
5378 /* Print out the compiled data if debugging is enabled. This is never the
5379 case when building a production library. */
5380
5381 #ifdef DEBUG
5382
5383 printf("Length = %d top_bracket = %d top_backref = %d\n",
5384 length, re->top_bracket, re->top_backref);
5385
5386 if (re->options != 0)
5387 {
5388 printf("%s%s%s%s%s%s%s%s%s\n",
5389 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5390 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5391 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5392 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5393 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5394 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5395 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5396 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5397 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5398 }
5399
5400 if ((re->options & PCRE_FIRSTSET) != 0)
5401 {
5402 int ch = re->first_byte & 255;
5403 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5404 "" : " (caseless)";
5405 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5406 else printf("First char = \\x%02x%s\n", ch, caseless);
5407 }
5408
5409 if ((re->options & PCRE_REQCHSET) != 0)
5410 {
5411 int ch = re->req_byte & 255;
5412 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5413 "" : " (caseless)";
5414 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5415 else printf("Req char = \\x%02x%s\n", ch, caseless);
5416 }
5417
5418 pcre_printint(re, stdout);
5419
5420 /* This check is done here in the debugging case so that the code that
5421 was compiled can be seen. */
5422
5423 if (code - codestart > length)
5424 {
5425 (pcre_free)(re);
5426 *errorptr = error_texts[ERR23];
5427 *erroroffset = ptr - (uschar *)pattern;
5428 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5429 return NULL;
5430 }
5431 #endif /* DEBUG */
5432
5433 return (pcre *)re;
5434 }
5435
5436 /* End of pcre_compile.c */