Allow only accept and warn in the not-QUIT ACL.
[exim.git] / src / src / pcre / pcre_compile.c
CommitLineData
64f2600a 1/* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.5 2007/06/26 11:16:54 ph10 Exp $ */
8ac170f3
PH
2
3/*************************************************
4* Perl-Compatible Regular Expressions *
5*************************************************/
6
7/* PCRE is a library of functions to support regular expressions whose syntax
8and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
64f2600a 11 Copyright (c) 1997-2007 University of Cambridge
8ac170f3
PH
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42
43/* This module contains the external function pcre_compile(), along with
44supporting internal functions that are not used by other modules. */
45
46
6bf342e1
PH
47#define NLBLOCK cd /* Block containing newline information */
48#define PSSTART start_pattern /* Field containing processed string start */
49#define PSEND end_pattern /* Field containing processed string end */
50
51
8ac170f3
PH
#include "pcre_internal.h"

#include <limits.h>
53
54
aa41d2de
PH
55/* When DEBUG is defined, we need the pcre_printint() function, which is also
56used by pcretest. DEBUG is not defined when building a production library. */
57
58#ifdef DEBUG
59#include "pcre_printint.src"
60#endif
61
62
64f2600a
PH
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a. Both arguments and the whole expansion are parenthesized so
that expression arguments such as SETBIT(map, base + n) index and shift by the
complete value. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
66
67
8ac170f3
PH
68/*************************************************
69* Code parameters and static tables *
70*************************************************/
71
6bf342e1
PH
72/* This value specifies the size of stack workspace that is used during the
73first pre-compile phase that determines how much memory is required. The regex
74is partly compiled into this space, but the compiled parts are discarded as
75soon as they can be, so that hopefully there will never be an overrun. The code
76does, however, check for an overrun. The largest amount I've seen used is 218,
77so this number is very generous.
78
79The same workspace is used during the second, actual compile phase for
80remembering forward references to groups so that they can be filled in at the
81end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
82is 4 there is plenty of room. */
8ac170f3 83
6bf342e1 84#define COMPILE_WORK_SIZE (4096)
8ac170f3
PH
85
86
87/* Table for handling escaped characters in the range '0'-'z'. Positive returns
88are simple data values; negative values are for special things like \d and so
89on. Zero means further processing is needed (for things like \x), or the escape
90is invalid. */
91
64f2600a 92#ifndef EBCDIC /* This is the "normal" table for ASCII systems */
8ac170f3
PH
93static const short int escapes[] = {
94 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
95 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
96 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
64f2600a
PH
97-ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
98-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
8ac170f3
PH
99-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
100 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
64f2600a
PH
101-ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
102-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
8ac170f3
PH
103 0, 0, -ESC_z /* x - z */
104};
105
64f2600a 106#else /* This is the "abnormal" table for EBCDIC systems */
8ac170f3
PH
107static const short int escapes[] = {
108/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
109/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
110/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
111/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
112/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
113/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
114/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
115/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
64f2600a 116/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
6bf342e1 117/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
8ac170f3 118/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
64f2600a 119/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
8ac170f3
PH
120/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
121/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
122/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
123/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
64f2600a 124/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
8ac170f3 125/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
6bf342e1 126/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
64f2600a 127/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
8ac170f3
PH
128/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
129/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
131};
132#endif
133
134
135/* Tables of names of POSIX character classes and their lengths. The list is
aa41d2de 136terminated by a zero length entry. The first three must be alpha, lower, upper,
8ac170f3
PH
137as this is assumed for handling case independence. */
138
139static const char *const posix_names[] = {
140 "alpha", "lower", "upper",
141 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
142 "print", "punct", "space", "word", "xdigit" };
143
144static const uschar posix_name_lengths[] = {
145 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
146
aa41d2de
PH
147/* Table of class bit maps for each POSIX class. Each class is formed from a
148base map, with an optional addition or removal of another map. Then, for some
149classes, there is some additional tweaking: for [:blank:] the vertical space
150characters are removed, and for [:alpha:] and [:alnum:] the underscore
151character is removed. The triples in the table consist of the base map offset,
152second map offset or -1 if no second map, and a non-negative value for map
153addition or a negative value for map subtraction (if there are two maps). The
154absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
155remove vertical space characters, 2 => remove underscore. */
8ac170f3
PH
156
157static const int posix_class_maps[] = {
aa41d2de
PH
158 cbit_word, cbit_digit, -2, /* alpha */
159 cbit_lower, -1, 0, /* lower */
160 cbit_upper, -1, 0, /* upper */
161 cbit_word, -1, 2, /* alnum - word without underscore */
162 cbit_print, cbit_cntrl, 0, /* ascii */
163 cbit_space, -1, 1, /* blank - a GNU extension */
164 cbit_cntrl, -1, 0, /* cntrl */
165 cbit_digit, -1, 0, /* digit */
166 cbit_graph, -1, 0, /* graph */
167 cbit_print, -1, 0, /* print */
168 cbit_punct, -1, 0, /* punct */
169 cbit_space, -1, 0, /* space */
170 cbit_word, -1, 0, /* word - a Perl extension */
171 cbit_xdigit,-1, 0 /* xdigit */
8ac170f3
PH
172};
173
174
6bf342e1
PH
175#define STRING(a) # a
176#define XSTRING(s) STRING(s)
177
8ac170f3 178/* The texts of compile-time error messages. These are "char *" because they
6bf342e1
PH
179are passed to the outside world. Do not ever re-use any error number, because
180they are documented. Always add a new error instead. Messages marked DEAD below
181are no longer used. */
8ac170f3
PH
182
183static const char *error_texts[] = {
184 "no error",
185 "\\ at end of pattern",
186 "\\c at end of pattern",
187 "unrecognized character follows \\",
188 "numbers out of order in {} quantifier",
189 /* 5 */
190 "number too big in {} quantifier",
191 "missing terminating ] for character class",
192 "invalid escape sequence in character class",
193 "range out of order in character class",
194 "nothing to repeat",
195 /* 10 */
6bf342e1 196 "operand of unlimited repeat could match the empty string", /** DEAD **/
8ac170f3
PH
197 "internal error: unexpected repeat",
198 "unrecognized character after (?",
199 "POSIX named classes are supported only within a class",
200 "missing )",
201 /* 15 */
202 "reference to non-existent subpattern",
203 "erroffset passed as NULL",
204 "unknown option bit(s) set",
205 "missing ) after comment",
6bf342e1 206 "parentheses nested too deeply", /** DEAD **/
8ac170f3
PH
207 /* 20 */
208 "regular expression too large",
209 "failed to get memory",
210 "unmatched parentheses",
211 "internal error: code overflow",
212 "unrecognized character after (?<",
213 /* 25 */
214 "lookbehind assertion is not fixed length",
aa41d2de 215 "malformed number or name after (?(",
8ac170f3
PH
216 "conditional group contains more than two branches",
217 "assertion expected after (?(",
64f2600a 218 "(?R or (?[+-]digits must be followed by )",
8ac170f3
PH
219 /* 30 */
220 "unknown POSIX class name",
221 "POSIX collating elements are not supported",
222 "this version of PCRE is not compiled with PCRE_UTF8 support",
6bf342e1 223 "spare error", /** DEAD **/
8ac170f3
PH
224 "character value in \\x{...} sequence is too large",
225 /* 35 */
226 "invalid condition (?(0)",
227 "\\C not allowed in lookbehind assertion",
228 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
229 "number after (?C is > 255",
230 "closing ) for (?C expected",
231 /* 40 */
232 "recursive call could loop indefinitely",
233 "unrecognized character after (?P",
6bf342e1 234 "syntax error in subpattern name (missing terminator)",
aa41d2de 235 "two named subpatterns have the same name",
8ac170f3
PH
236 "invalid UTF-8 string",
237 /* 45 */
238 "support for \\P, \\p, and \\X has not been compiled",
239 "malformed \\P or \\p sequence",
aa41d2de 240 "unknown property name after \\P or \\p",
6bf342e1
PH
241 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
242 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
aa41d2de
PH
243 /* 50 */
244 "repeated subpattern is too long",
6bf342e1
PH
245 "octal value is greater than \\377 (not in UTF-8 mode)",
246 "internal error: overran compiling workspace",
247 "internal error: previously-checked referenced subpattern not found",
248 "DEFINE group contains more than one branch",
249 /* 55 */
250 "repeating a DEFINE group is not allowed",
251 "inconsistent NEWLINE options",
64f2600a
PH
252 "\\g is not followed by a braced name or an optionally braced non-zero number",
253 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
8ac170f3
PH
254};
255
256
257/* Table to identify digits and hex digits. This is used when compiling
258patterns. Note that the tables in chartables are dependent on the locale, and
259may mark arbitrary characters as digits - but the PCRE compiling code expects
260to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
261a private table here. It costs 256 bytes, but it is a lot faster than doing
262character value tests (at least in some simple cases I timed), and in some
263applications one wants PCRE to compile efficiently as well as match
264efficiently.
265
266For convenience, we use the same bit definitions as in chartables:
267
268 0x04 decimal digit
269 0x08 hexadecimal digit
270
271Then we can use ctype_digit and ctype_xdigit in the code. */
272
64f2600a 273#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
8ac170f3
PH
274static const unsigned char digitab[] =
275 {
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
282 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
283 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
284 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
288 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
308
64f2600a 309#else /* This is the "abnormal" case, for EBCDIC systems */
8ac170f3
PH
310static const unsigned char digitab[] =
311 {
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
64f2600a 323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
8ac170f3
PH
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
328 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
336 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
342 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
343 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
344
345static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
346 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
347 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
348 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
350 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
354 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
355 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
64f2600a 357 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
8ac170f3
PH
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
359 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
360 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
361 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
362 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
363 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
364 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
365 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
366 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
367 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
368 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
369 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
370 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
371 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
372 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
373 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
374 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
375 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
376 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
377 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
378#endif
379
380
381/* Definition to allow mutual recursion */
382
383static BOOL
64f2600a
PH
384 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
385 int *, int *, branch_chain *, compile_data *, int *);
8ac170f3
PH
386
387
388
389/*************************************************
390* Handle escapes *
391*************************************************/
392
393/* This function is called when a \ has been encountered. It either returns a
394positive value for a simple escape such as \n, or a negative value which
6bf342e1
PH
395encodes one of the more complicated things such as \d. A backreference to group
396n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
397UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
398ptr is pointing at the \. On exit, it is on the final character of the escape
399sequence.
8ac170f3
PH
400
401Arguments:
402 ptrptr points to the pattern position pointer
403 errorcodeptr points to the errorcode variable
404 bracount number of previous extracting brackets
405 options the options bits
406 isclass TRUE if inside a character class
407
408Returns: zero or positive => a data character
409 negative => a special escape sequence
410 on error, errorptr is set
411*/
412
413static int
414check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
415 int options, BOOL isclass)
416{
aa41d2de
PH
417BOOL utf8 = (options & PCRE_UTF8) != 0;
418const uschar *ptr = *ptrptr + 1;
8ac170f3
PH
419int c, i;
420
aa41d2de
PH
421GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
422ptr--; /* Set pointer back to the last byte */
423
8ac170f3
PH
424/* If backslash is at the end of the pattern, it's an error. */
425
8ac170f3
PH
426if (c == 0) *errorcodeptr = ERR1;
427
428/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
429a table. A non-zero result is something that can be returned immediately.
430Otherwise further processing may be required. */
431
64f2600a 432#ifndef EBCDIC /* ASCII coding */
8ac170f3
PH
433else if (c < '0' || c > 'z') {} /* Not alphameric */
434else if ((i = escapes[c - '0']) != 0) c = i;
435
64f2600a 436#else /* EBCDIC coding */
8ac170f3
PH
437else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
438else if ((i = escapes[c - 0x48]) != 0) c = i;
439#endif
440
441/* Escapes that need further processing, or are illegal. */
442
443else
444 {
445 const uschar *oldptr;
6bf342e1
PH
446 BOOL braced, negated;
447
8ac170f3
PH
448 switch (c)
449 {
450 /* A number of Perl escapes are not handled by PCRE. We give an explicit
451 error. */
452
453 case 'l':
454 case 'L':
455 case 'N':
456 case 'u':
457 case 'U':
458 *errorcodeptr = ERR37;
459 break;
460
6bf342e1
PH
461 /* \g must be followed by a number, either plain or braced. If positive, it
462 is an absolute backreference. If negative, it is a relative backreference.
64f2600a
PH
463 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
464 reference to a named group. This is part of Perl's movement towards a
465 unified syntax for back references. As this is synonymous with \k{name}, we
466 fudge it up by pretending it really was \k. */
6bf342e1
PH
467
468 case 'g':
469 if (ptr[1] == '{')
470 {
64f2600a
PH
471 const uschar *p;
472 for (p = ptr+2; *p != 0 && *p != '}'; p++)
473 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
474 if (*p != 0 && *p != '}')
475 {
476 c = -ESC_k;
477 break;
478 }
6bf342e1
PH
479 braced = TRUE;
480 ptr++;
481 }
482 else braced = FALSE;
483
484 if (ptr[1] == '-')
485 {
486 negated = TRUE;
487 ptr++;
488 }
489 else negated = FALSE;
490
491 c = 0;
492 while ((digitab[ptr[1]] & ctype_digit) != 0)
493 c = c * 10 + *(++ptr) - '0';
494
495 if (c == 0 || (braced && *(++ptr) != '}'))
496 {
497 *errorcodeptr = ERR57;
498 return 0;
499 }
500
501 if (negated)
502 {
503 if (c > bracount)
504 {
505 *errorcodeptr = ERR15;
506 return 0;
507 }
508 c = bracount - (c - 1);
509 }
510
511 c = -(ESC_REF + c);
512 break;
513
8ac170f3
PH
514 /* The handling of escape sequences consisting of a string of digits
515 starting with one that is not zero is not straightforward. By experiment,
516 the way Perl works seems to be as follows:
517
518 Outside a character class, the digits are read as a decimal number. If the
519 number is less than 10, or if there are that many previous extracting
520 left brackets, then it is a back reference. Otherwise, up to three octal
521 digits are read to form an escaped byte. Thus \123 is likely to be octal
522 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
523 value is greater than 377, the least significant 8 bits are taken. Inside a
524 character class, \ followed by a digit is always an octal number. */
525
526 case '1': case '2': case '3': case '4': case '5':
527 case '6': case '7': case '8': case '9':
528
529 if (!isclass)
530 {
531 oldptr = ptr;
532 c -= '0';
533 while ((digitab[ptr[1]] & ctype_digit) != 0)
534 c = c * 10 + *(++ptr) - '0';
535 if (c < 10 || c <= bracount)
536 {
537 c = -(ESC_REF + c);
538 break;
539 }
540 ptr = oldptr; /* Put the pointer back and fall through */
541 }
542
543 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
544 generates a binary zero byte and treats the digit as a following literal.
545 Thus we have to pull back the pointer by one. */
546
547 if ((c = *ptr) >= '8')
548 {
549 ptr--;
550 c = 0;
551 break;
552 }
553
554 /* \0 always starts an octal number, but we may drop through to here with a
aa41d2de
PH
555 larger first octal digit. The original code used just to take the least
556 significant 8 bits of octal numbers (I think this is what early Perls used
557 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
558 than 3 octal digits. */
8ac170f3
PH
559
560 case '0':
561 c -= '0';
562 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
563 c = c * 8 + *(++ptr) - '0';
aa41d2de 564 if (!utf8 && c > 255) *errorcodeptr = ERR51;
8ac170f3
PH
565 break;
566
aa41d2de
PH
567 /* \x is complicated. \x{ddd} is a character number which can be greater
568 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
569 treated as a data character. */
8ac170f3
PH
570
571 case 'x':
aa41d2de 572 if (ptr[1] == '{')
8ac170f3
PH
573 {
574 const uschar *pt = ptr + 2;
aa41d2de
PH
575 int count = 0;
576
8ac170f3
PH
577 c = 0;
578 while ((digitab[*pt] & ctype_xdigit) != 0)
579 {
aa41d2de
PH
580 register int cc = *pt++;
581 if (c == 0 && cc == '0') continue; /* Leading zeroes */
8ac170f3 582 count++;
aa41d2de 583
64f2600a 584#ifndef EBCDIC /* ASCII coding */
8ac170f3 585 if (cc >= 'a') cc -= 32; /* Convert to upper case */
aa41d2de 586 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
64f2600a 587#else /* EBCDIC coding */
8ac170f3 588 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
aa41d2de 589 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
8ac170f3
PH
590#endif
591 }
aa41d2de 592
8ac170f3
PH
593 if (*pt == '}')
594 {
aa41d2de 595 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
8ac170f3
PH
596 ptr = pt;
597 break;
598 }
aa41d2de 599
8ac170f3
PH
600 /* If the sequence of hex digits does not end with '}', then we don't
601 recognize this construct; fall through to the normal \x handling. */
602 }
8ac170f3 603
aa41d2de 604 /* Read just a single-byte hex-defined char */
8ac170f3
PH
605
606 c = 0;
607 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
608 {
609 int cc; /* Some compilers don't like ++ */
610 cc = *(++ptr); /* in initializers */
64f2600a 611#ifndef EBCDIC /* ASCII coding */
8ac170f3
PH
612 if (cc >= 'a') cc -= 32; /* Convert to upper case */
613 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
64f2600a 614#else /* EBCDIC coding */
8ac170f3
PH
615 if (cc <= 'z') cc += 64; /* Convert to upper case */
616 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
617#endif
618 }
619 break;
620
6bf342e1
PH
621 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
622 This coding is ASCII-specific, but then the whole concept of \cx is
623 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
8ac170f3
PH
624
625 case 'c':
626 c = *(++ptr);
627 if (c == 0)
628 {
629 *errorcodeptr = ERR2;
630 return 0;
631 }
632
64f2600a 633#ifndef EBCDIC /* ASCII coding */
8ac170f3
PH
634 if (c >= 'a' && c <= 'z') c -= 32;
635 c ^= 0x40;
64f2600a 636#else /* EBCDIC coding */
8ac170f3
PH
637 if (c >= 'a' && c <= 'z') c += 64;
638 c ^= 0xC0;
639#endif
640 break;
641
642 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
643 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
644 for Perl compatibility, it is a literal. This code looks a bit odd, but
645 there used to be some cases other than the default, and there may be again
646 in future, so I haven't "optimized" it. */
647
648 default:
649 if ((options & PCRE_EXTRA) != 0) switch(c)
650 {
651 default:
652 *errorcodeptr = ERR3;
653 break;
654 }
655 break;
656 }
657 }
658
659*ptrptr = ptr;
660return c;
661}
662
663
664
665#ifdef SUPPORT_UCP
666/*************************************************
667* Handle \P and \p *
668*************************************************/
669
670/* This function is called after \P or \p has been encountered, provided that
671PCRE is compiled with support for Unicode properties. On entry, ptrptr is
672pointing at the P or p. On exit, it is pointing at the final character of the
673escape sequence.
674
675Argument:
676 ptrptr points to the pattern position pointer
677 negptr points to a boolean that is set TRUE for negation else FALSE
aa41d2de 678 dptr points to an int that is set to the detailed property value
8ac170f3
PH
679 errorcodeptr points to the error code variable
680
aa41d2de 681Returns: type value from ucp_type_table, or -1 for an invalid type
8ac170f3
PH
682*/
683
684static int
aa41d2de 685get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
8ac170f3
PH
686{
687int c, i, bot, top;
688const uschar *ptr = *ptrptr;
aa41d2de 689char name[32];
8ac170f3
PH
690
691c = *(++ptr);
692if (c == 0) goto ERROR_RETURN;
693
694*negptr = FALSE;
695
aa41d2de
PH
696/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
697negation. */
8ac170f3
PH
698
699if (c == '{')
700 {
701 if (ptr[1] == '^')
702 {
703 *negptr = TRUE;
704 ptr++;
705 }
aa41d2de 706 for (i = 0; i < sizeof(name) - 1; i++)
8ac170f3
PH
707 {
708 c = *(++ptr);
709 if (c == 0) goto ERROR_RETURN;
710 if (c == '}') break;
711 name[i] = c;
712 }
aa41d2de 713 if (c !='}') goto ERROR_RETURN;
8ac170f3
PH
714 name[i] = 0;
715 }
716
717/* Otherwise there is just one following character */
718
719else
720 {
721 name[0] = c;
722 name[1] = 0;
723 }
724
725*ptrptr = ptr;
726
727/* Search for a recognized property name using binary chop */
728
729bot = 0;
730top = _pcre_utt_size;
731
732while (bot < top)
733 {
aa41d2de 734 i = (bot + top) >> 1;
8ac170f3 735 c = strcmp(name, _pcre_utt[i].name);
aa41d2de
PH
736 if (c == 0)
737 {
738 *dptr = _pcre_utt[i].value;
739 return _pcre_utt[i].type;
740 }
8ac170f3
PH
741 if (c > 0) bot = i + 1; else top = i;
742 }
743
8ac170f3
PH
744*errorcodeptr = ERR47;
745*ptrptr = ptr;
746return -1;
747
748ERROR_RETURN:
749*errorcodeptr = ERR46;
750*ptrptr = ptr;
751return -1;
752}
753#endif
754
755
756
757
758/*************************************************
759* Check for counted repeat *
760*************************************************/
761
762/* This function is called when a '{' is encountered in a place where it might
763start a quantifier. It looks ahead to see if it really is a quantifier or not.
764It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
765where the ddds are digits.
766
767Arguments:
768 p pointer to the first char after '{'
769
770Returns: TRUE or FALSE
771*/
772
773static BOOL
774is_counted_repeat(const uschar *p)
775{
776if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
777while ((digitab[*p] & ctype_digit) != 0) p++;
778if (*p == '}') return TRUE;
779
780if (*p++ != ',') return FALSE;
781if (*p == '}') return TRUE;
782
783if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
784while ((digitab[*p] & ctype_digit) != 0) p++;
785
786return (*p == '}');
787}
788
789
790
791/*************************************************
792* Read repeat counts *
793*************************************************/
794
795/* Read an item of the form {n,m} and return the values. This is called only
796after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
797so the syntax is guaranteed to be correct, but we need to check the values.
798
799Arguments:
800 p pointer to first char after '{'
801 minp pointer to int for min
802 maxp pointer to int for max
803 returned as -1 if no max
804 errorcodeptr points to error code variable
805
806Returns: pointer to '}' on success;
807 current ptr on error, with errorcodeptr set non-zero
808*/
809
810static const uschar *
811read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
812{
813int min = 0;
814int max = -1;
815
92e772ff
PH
816/* Read the minimum value and do a paranoid check: a negative value indicates
817an integer overflow. */
818
8ac170f3 819while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
92e772ff
PH
820if (min < 0 || min > 65535)
821 {
822 *errorcodeptr = ERR5;
823 return p;
824 }
825
826/* Read the maximum value if there is one, and again do a paranoid on its size.
827Also, max must not be less than min. */
8ac170f3
PH
828
829if (*p == '}') max = min; else
830 {
831 if (*(++p) != '}')
832 {
833 max = 0;
834 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
92e772ff
PH
835 if (max < 0 || max > 65535)
836 {
837 *errorcodeptr = ERR5;
838 return p;
839 }
8ac170f3
PH
840 if (max < min)
841 {
842 *errorcodeptr = ERR4;
843 return p;
844 }
845 }
846 }
847
92e772ff
PH
848/* Fill in the required variables, and pass back the pointer to the terminating
849'}'. */
8ac170f3 850
92e772ff
PH
851*minp = min;
852*maxp = max;
8ac170f3
PH
853return p;
854}
855
856
857
aa41d2de 858/*************************************************
6bf342e1 859* Find forward referenced subpattern *
aa41d2de
PH
860*************************************************/
861
6bf342e1
PH
862/* This function scans along a pattern's text looking for capturing
863subpatterns, and counting them. If it finds a named pattern that matches the
864name it is given, it returns its number. Alternatively, if the name is NULL, it
865returns when it reaches a given numbered subpattern. This is used for forward
866references to subpatterns. We know that if (?P< is encountered, the name will
867be terminated by '>' because that is checked in the first pass.
aa41d2de
PH
868
869Arguments:
6bf342e1
PH
870 ptr current position in the pattern
871 count current count of capturing parens so far encountered
872 name name to seek, or NULL if seeking a numbered subpattern
873 lorn name length, or subpattern number if name is NULL
874 xmode TRUE if we are in /x mode
aa41d2de
PH
875
876Returns: the number of the named subpattern, or -1 if not found
877*/
878
879static int
6bf342e1
PH
880find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
881 BOOL xmode)
aa41d2de
PH
882{
883const uschar *thisname;
6bf342e1 884
aa41d2de
PH
885for (; *ptr != 0; ptr++)
886 {
6bf342e1
PH
887 int term;
888
889 /* Skip over backslashed characters and also entire \Q...\E */
890
891 if (*ptr == '\\')
892 {
893 if (*(++ptr) == 0) return -1;
894 if (*ptr == 'Q') for (;;)
895 {
896 while (*(++ptr) != 0 && *ptr != '\\');
897 if (*ptr == 0) return -1;
898 if (*(++ptr) == 'E') break;
899 }
900 continue;
901 }
902
903 /* Skip over character classes */
904
905 if (*ptr == '[')
906 {
907 while (*(++ptr) != ']')
908 {
909 if (*ptr == '\\')
910 {
911 if (*(++ptr) == 0) return -1;
912 if (*ptr == 'Q') for (;;)
913 {
914 while (*(++ptr) != 0 && *ptr != '\\');
915 if (*ptr == 0) return -1;
916 if (*(++ptr) == 'E') break;
917 }
918 continue;
919 }
920 }
921 continue;
922 }
923
924 /* Skip comments in /x mode */
925
926 if (xmode && *ptr == '#')
927 {
928 while (*(++ptr) != 0 && *ptr != '\n');
929 if (*ptr == 0) return -1;
930 continue;
931 }
932
933 /* An opening parens must now be a real metacharacter */
934
aa41d2de 935 if (*ptr != '(') continue;
6bf342e1
PH
936 if (ptr[1] != '?')
937 {
938 count++;
939 if (name == NULL && count == lorn) return count;
940 continue;
941 }
942
943 ptr += 2;
944 if (*ptr == 'P') ptr++; /* Allow optional P */
945
946 /* We have to disambiguate (?<! and (?<= from (?<name> */
947
948 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
949 *ptr != '\'')
950 continue;
951
aa41d2de 952 count++;
6bf342e1
PH
953
954 if (name == NULL && count == lorn) return count;
955 term = *ptr++;
956 if (term == '<') term = '>';
aa41d2de 957 thisname = ptr;
6bf342e1
PH
958 while (*ptr != term) ptr++;
959 if (name != NULL && lorn == ptr - thisname &&
960 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
aa41d2de
PH
961 return count;
962 }
6bf342e1 963
aa41d2de
PH
964return -1;
965}
966
967
968
8ac170f3
PH
969/*************************************************
970* Find first significant op code *
971*************************************************/
972
973/* This is called by several functions that scan a compiled expression looking
974for a fixed first character, or an anchoring op code etc. It skips over things
975that do not influence this. For some calls, a change of option is important.
976For some calls, it makes sense to skip negative forward and all backward
977assertions, and also the \b assertion; for others it does not.
978
979Arguments:
980 code pointer to the start of the group
981 options pointer to external options
982 optbit the option bit whose changing is significant, or
983 zero if none are
984 skipassert TRUE if certain assertions are to be skipped
985
986Returns: pointer to the first significant opcode
987*/
988
989static const uschar*
990first_significant_code(const uschar *code, int *options, int optbit,
991 BOOL skipassert)
992{
993for (;;)
994 {
995 switch ((int)*code)
996 {
997 case OP_OPT:
998 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
999 *options = (int)code[1];
1000 code += 2;
1001 break;
1002
1003 case OP_ASSERT_NOT:
1004 case OP_ASSERTBACK:
1005 case OP_ASSERTBACK_NOT:
1006 if (!skipassert) return code;
1007 do code += GET(code, 1); while (*code == OP_ALT);
1008 code += _pcre_OP_lengths[*code];
1009 break;
1010
1011 case OP_WORD_BOUNDARY:
1012 case OP_NOT_WORD_BOUNDARY:
1013 if (!skipassert) return code;
1014 /* Fall through */
1015
1016 case OP_CALLOUT:
1017 case OP_CREF:
6bf342e1
PH
1018 case OP_RREF:
1019 case OP_DEF:
8ac170f3
PH
1020 code += _pcre_OP_lengths[*code];
1021 break;
1022
1023 default:
1024 return code;
1025 }
1026 }
1027/* Control never reaches here */
1028}
1029
1030
1031
1032
1033/*************************************************
1034* Find the fixed length of a pattern *
1035*************************************************/
1036
1037/* Scan a pattern and compute the fixed length of subject that will match it,
1038if the length is fixed. This is needed for dealing with backward assertions.
1039In UTF8 mode, the result is in characters rather than bytes.
1040
1041Arguments:
1042 code points to the start of the pattern (the bracket)
1043 options the compiling options
1044
1045Returns: the fixed length, or -1 if there is no fixed length,
1046 or -2 if \C was encountered
1047*/
1048
1049static int
1050find_fixedlength(uschar *code, int options)
1051{
1052int length = -1;
1053
1054register int branchlength = 0;
1055register uschar *cc = code + 1 + LINK_SIZE;
1056
1057/* Scan along the opcodes for this branch. If we get to the end of the
1058branch, check the length against that of the other branches. */
1059
1060for (;;)
1061 {
1062 int d;
1063 register int op = *cc;
8ac170f3
PH
1064
1065 switch (op)
1066 {
6bf342e1 1067 case OP_CBRA:
8ac170f3
PH
1068 case OP_BRA:
1069 case OP_ONCE:
1070 case OP_COND:
6bf342e1 1071 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
8ac170f3
PH
1072 if (d < 0) return d;
1073 branchlength += d;
1074 do cc += GET(cc, 1); while (*cc == OP_ALT);
1075 cc += 1 + LINK_SIZE;
1076 break;
1077
1078 /* Reached end of a branch; if it's a ket it is the end of a nested
1079 call. If it's ALT it is an alternation in a nested call. If it is
1080 END it's the end of the outer call. All can be handled by the same code. */
1081
1082 case OP_ALT:
1083 case OP_KET:
1084 case OP_KETRMAX:
1085 case OP_KETRMIN:
1086 case OP_END:
1087 if (length < 0) length = branchlength;
1088 else if (length != branchlength) return -1;
1089 if (*cc != OP_ALT) return length;
1090 cc += 1 + LINK_SIZE;
1091 branchlength = 0;
1092 break;
1093
1094 /* Skip over assertive subpatterns */
1095
1096 case OP_ASSERT:
1097 case OP_ASSERT_NOT:
1098 case OP_ASSERTBACK:
1099 case OP_ASSERTBACK_NOT:
1100 do cc += GET(cc, 1); while (*cc == OP_ALT);
1101 /* Fall through */
1102
1103 /* Skip over things that don't match chars */
1104
1105 case OP_REVERSE:
8ac170f3 1106 case OP_CREF:
6bf342e1
PH
1107 case OP_RREF:
1108 case OP_DEF:
8ac170f3
PH
1109 case OP_OPT:
1110 case OP_CALLOUT:
1111 case OP_SOD:
1112 case OP_SOM:
1113 case OP_EOD:
1114 case OP_EODN:
1115 case OP_CIRC:
1116 case OP_DOLL:
1117 case OP_NOT_WORD_BOUNDARY:
1118 case OP_WORD_BOUNDARY:
1119 cc += _pcre_OP_lengths[*cc];
1120 break;
1121
1122 /* Handle literal characters */
1123
1124 case OP_CHAR:
1125 case OP_CHARNC:
aa41d2de 1126 case OP_NOT:
8ac170f3
PH
1127 branchlength++;
1128 cc += 2;
1129#ifdef SUPPORT_UTF8
1130 if ((options & PCRE_UTF8) != 0)
1131 {
1132 while ((*cc & 0xc0) == 0x80) cc++;
1133 }
1134#endif
1135 break;
1136
1137 /* Handle exact repetitions. The count is already in characters, but we
1138 need to skip over a multibyte character in UTF8 mode. */
1139
1140 case OP_EXACT:
1141 branchlength += GET2(cc,1);
1142 cc += 4;
1143#ifdef SUPPORT_UTF8
1144 if ((options & PCRE_UTF8) != 0)
1145 {
1146 while((*cc & 0x80) == 0x80) cc++;
1147 }
1148#endif
1149 break;
1150
1151 case OP_TYPEEXACT:
1152 branchlength += GET2(cc,1);
1153 cc += 4;
1154 break;
1155
1156 /* Handle single-char matchers */
1157
1158 case OP_PROP:
1159 case OP_NOTPROP:
aa41d2de 1160 cc += 2;
8ac170f3
PH
1161 /* Fall through */
1162
1163 case OP_NOT_DIGIT:
1164 case OP_DIGIT:
1165 case OP_NOT_WHITESPACE:
1166 case OP_WHITESPACE:
1167 case OP_NOT_WORDCHAR:
1168 case OP_WORDCHAR:
1169 case OP_ANY:
1170 branchlength++;
1171 cc++;
1172 break;
1173
1174 /* The single-byte matcher isn't allowed */
1175
1176 case OP_ANYBYTE:
1177 return -2;
1178
1179 /* Check a class for variable quantification */
1180
1181#ifdef SUPPORT_UTF8
1182 case OP_XCLASS:
1183 cc += GET(cc, 1) - 33;
1184 /* Fall through */
1185#endif
1186
1187 case OP_CLASS:
1188 case OP_NCLASS:
1189 cc += 33;
1190
1191 switch (*cc)
1192 {
1193 case OP_CRSTAR:
1194 case OP_CRMINSTAR:
1195 case OP_CRQUERY:
1196 case OP_CRMINQUERY:
1197 return -1;
1198
1199 case OP_CRRANGE:
1200 case OP_CRMINRANGE:
1201 if (GET2(cc,1) != GET2(cc,3)) return -1;
1202 branchlength += GET2(cc,1);
1203 cc += 5;
1204 break;
1205
1206 default:
1207 branchlength++;
1208 }
1209 break;
1210
1211 /* Anything else is variable length */
1212
1213 default:
1214 return -1;
1215 }
1216 }
1217/* Control never gets here */
1218}
1219
1220
1221
1222
1223/*************************************************
1224* Scan compiled regex for numbered bracket *
1225*************************************************/
1226
1227/* This little function scans through a compiled pattern until it finds a
1228capturing bracket with the given number.
1229
1230Arguments:
1231 code points to start of expression
1232 utf8 TRUE in UTF-8 mode
1233 number the required bracket number
1234
1235Returns: pointer to the opcode for the bracket, or NULL if not found
1236*/
1237
1238static const uschar *
1239find_bracket(const uschar *code, BOOL utf8, int number)
1240{
8ac170f3
PH
1241for (;;)
1242 {
1243 register int c = *code;
1244 if (c == OP_END) return NULL;
aa41d2de
PH
1245
1246 /* XCLASS is used for classes that cannot be represented just by a bit
1247 map. This includes negated single high-valued characters. The length in
1248 the table is zero; the actual length is stored in the compiled code. */
1249
1250 if (c == OP_XCLASS) code += GET(code, 1);
1251
6bf342e1 1252 /* Handle capturing bracket */
aa41d2de 1253
6bf342e1 1254 else if (c == OP_CBRA)
8ac170f3 1255 {
6bf342e1 1256 int n = GET2(code, 1+LINK_SIZE);
8ac170f3 1257 if (n == number) return (uschar *)code;
6bf342e1 1258 code += _pcre_OP_lengths[c];
8ac170f3 1259 }
aa41d2de 1260
6bf342e1
PH
1261 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1262 a multi-byte character. The length in the table is a minimum, so we have to
1263 arrange to skip the extra bytes. */
aa41d2de 1264
8ac170f3
PH
1265 else
1266 {
1267 code += _pcre_OP_lengths[c];
64f2600a 1268#ifdef SUPPORT_UTF8
8ac170f3
PH
1269 if (utf8) switch(c)
1270 {
1271 case OP_CHAR:
1272 case OP_CHARNC:
1273 case OP_EXACT:
1274 case OP_UPTO:
1275 case OP_MINUPTO:
6bf342e1 1276 case OP_POSUPTO:
8ac170f3
PH
1277 case OP_STAR:
1278 case OP_MINSTAR:
6bf342e1 1279 case OP_POSSTAR:
8ac170f3
PH
1280 case OP_PLUS:
1281 case OP_MINPLUS:
6bf342e1 1282 case OP_POSPLUS:
8ac170f3
PH
1283 case OP_QUERY:
1284 case OP_MINQUERY:
6bf342e1
PH
1285 case OP_POSQUERY:
1286 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
8ac170f3 1287 break;
8ac170f3 1288 }
64f2600a 1289#endif
8ac170f3
PH
1290 }
1291 }
1292}
1293
1294
1295
1296/*************************************************
1297* Scan compiled regex for recursion reference *
1298*************************************************/
1299
1300/* This little function scans through a compiled pattern until it finds an
1301instance of OP_RECURSE.
1302
1303Arguments:
1304 code points to start of expression
1305 utf8 TRUE in UTF-8 mode
1306
1307Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1308*/
1309
1310static const uschar *
1311find_recurse(const uschar *code, BOOL utf8)
1312{
8ac170f3
PH
1313for (;;)
1314 {
1315 register int c = *code;
1316 if (c == OP_END) return NULL;
aa41d2de
PH
1317 if (c == OP_RECURSE) return code;
1318
1319 /* XCLASS is used for classes that cannot be represented just by a bit
1320 map. This includes negated single high-valued characters. The length in
1321 the table is zero; the actual length is stored in the compiled code. */
1322
1323 if (c == OP_XCLASS) code += GET(code, 1);
1324
aa41d2de
PH
1325 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1326 that are followed by a character may be followed by a multi-byte character.
6bf342e1
PH
1327 The length in the table is a minimum, so we have to arrange to skip the extra
1328 bytes. */
aa41d2de 1329
8ac170f3
PH
1330 else
1331 {
1332 code += _pcre_OP_lengths[c];
64f2600a 1333#ifdef SUPPORT_UTF8
8ac170f3
PH
1334 if (utf8) switch(c)
1335 {
1336 case OP_CHAR:
1337 case OP_CHARNC:
1338 case OP_EXACT:
1339 case OP_UPTO:
1340 case OP_MINUPTO:
6bf342e1 1341 case OP_POSUPTO:
8ac170f3
PH
1342 case OP_STAR:
1343 case OP_MINSTAR:
6bf342e1 1344 case OP_POSSTAR:
8ac170f3
PH
1345 case OP_PLUS:
1346 case OP_MINPLUS:
6bf342e1 1347 case OP_POSPLUS:
8ac170f3
PH
1348 case OP_QUERY:
1349 case OP_MINQUERY:
6bf342e1
PH
1350 case OP_POSQUERY:
1351 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
8ac170f3 1352 break;
8ac170f3 1353 }
64f2600a 1354#endif
8ac170f3
PH
1355 }
1356 }
1357}
1358
1359
1360
1361/*************************************************
1362* Scan compiled branch for non-emptiness *
1363*************************************************/
1364
1365/* This function scans through a branch of a compiled pattern to see whether it
6bf342e1
PH
1366can match the empty string or not. It is called from could_be_empty()
1367below and from compile_branch() when checking for an unlimited repeat of a
1368group that can match nothing. Note that first_significant_code() skips over
1369assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1370struck an inner bracket whose current branch will already have been scanned.
8ac170f3
PH
1371
1372Arguments:
1373 code points to start of search
1374 endcode points to where to stop
1375 utf8 TRUE if in UTF8 mode
1376
1377Returns: TRUE if what is matched could be empty
1378*/
1379
1380static BOOL
1381could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1382{
1383register int c;
6bf342e1 1384for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
8ac170f3
PH
1385 code < endcode;
1386 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1387 {
1388 const uschar *ccode;
1389
1390 c = *code;
1391
64f2600a
PH
1392 /* Groups with zero repeats can of course be empty; skip them. */
1393
1394 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1395 {
1396 code += _pcre_OP_lengths[c];
1397 do code += GET(code, 1); while (*code == OP_ALT);
1398 c = *code;
1399 continue;
1400 }
1401
1402 /* For other groups, scan the branches. */
1403
6bf342e1 1404 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
8ac170f3
PH
1405 {
1406 BOOL empty_branch;
1407 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1408
1409 /* Scan a closed bracket */
1410
1411 empty_branch = FALSE;
1412 do
1413 {
1414 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1415 empty_branch = TRUE;
1416 code += GET(code, 1);
1417 }
1418 while (*code == OP_ALT);
1419 if (!empty_branch) return FALSE; /* All branches are non-empty */
64f2600a 1420 c = *code;
6bf342e1 1421 continue;
8ac170f3
PH
1422 }
1423
6bf342e1
PH
1424 /* Handle the other opcodes */
1425
1426 switch (c)
8ac170f3
PH
1427 {
1428 /* Check for quantifiers after a class */
1429
1430#ifdef SUPPORT_UTF8
1431 case OP_XCLASS:
1432 ccode = code + GET(code, 1);
1433 goto CHECK_CLASS_REPEAT;
1434#endif
1435
1436 case OP_CLASS:
1437 case OP_NCLASS:
1438 ccode = code + 33;
1439
1440#ifdef SUPPORT_UTF8
1441 CHECK_CLASS_REPEAT:
1442#endif
1443
1444 switch (*ccode)
1445 {
1446 case OP_CRSTAR: /* These could be empty; continue */
1447 case OP_CRMINSTAR:
1448 case OP_CRQUERY:
1449 case OP_CRMINQUERY:
1450 break;
1451
1452 default: /* Non-repeat => class must match */
1453 case OP_CRPLUS: /* These repeats aren't empty */
1454 case OP_CRMINPLUS:
1455 return FALSE;
1456
1457 case OP_CRRANGE:
1458 case OP_CRMINRANGE:
1459 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1460 break;
1461 }
1462 break;
1463
1464 /* Opcodes that must match a character */
1465
1466 case OP_PROP:
1467 case OP_NOTPROP:
1468 case OP_EXTUNI:
1469 case OP_NOT_DIGIT:
1470 case OP_DIGIT:
1471 case OP_NOT_WHITESPACE:
1472 case OP_WHITESPACE:
1473 case OP_NOT_WORDCHAR:
1474 case OP_WORDCHAR:
1475 case OP_ANY:
1476 case OP_ANYBYTE:
1477 case OP_CHAR:
1478 case OP_CHARNC:
1479 case OP_NOT:
1480 case OP_PLUS:
1481 case OP_MINPLUS:
6bf342e1 1482 case OP_POSPLUS:
8ac170f3
PH
1483 case OP_EXACT:
1484 case OP_NOTPLUS:
1485 case OP_NOTMINPLUS:
6bf342e1 1486 case OP_NOTPOSPLUS:
8ac170f3
PH
1487 case OP_NOTEXACT:
1488 case OP_TYPEPLUS:
1489 case OP_TYPEMINPLUS:
6bf342e1 1490 case OP_TYPEPOSPLUS:
8ac170f3
PH
1491 case OP_TYPEEXACT:
1492 return FALSE;
1493
1494 /* End of branch */
1495
1496 case OP_KET:
1497 case OP_KETRMAX:
1498 case OP_KETRMIN:
1499 case OP_ALT:
1500 return TRUE;
1501
6bf342e1
PH
1502 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1503 MINUPTO, and POSUPTO may be followed by a multibyte character */
8ac170f3
PH
1504
1505#ifdef SUPPORT_UTF8
1506 case OP_STAR:
1507 case OP_MINSTAR:
6bf342e1 1508 case OP_POSSTAR:
8ac170f3
PH
1509 case OP_QUERY:
1510 case OP_MINQUERY:
6bf342e1 1511 case OP_POSQUERY:
8ac170f3
PH
1512 case OP_UPTO:
1513 case OP_MINUPTO:
6bf342e1 1514 case OP_POSUPTO:
8ac170f3
PH
1515 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1516 break;
1517#endif
1518 }
1519 }
1520
1521return TRUE;
1522}
1523
1524
1525
1526/*************************************************
1527* Scan compiled regex for non-emptiness *
1528*************************************************/
1529
1530/* This function is called to check for left recursive calls. We want to check
1531the current branch of the current pattern to see if it could match the empty
1532string. If it could, we must look outwards for branches at other levels,
1533stopping when we pass beyond the bracket which is the subject of the recursion.
1534
1535Arguments:
1536 code points to start of the recursion
1537 endcode points to where to stop (current RECURSE item)
1538 bcptr points to the chain of current (unclosed) branch starts
1539 utf8 TRUE if in UTF-8 mode
1540
1541Returns: TRUE if what is matched could be empty
1542*/
1543
1544static BOOL
1545could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1546 BOOL utf8)
1547{
1548while (bcptr != NULL && bcptr->current >= code)
1549 {
1550 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1551 bcptr = bcptr->outer;
1552 }
1553return TRUE;
1554}
1555
1556
1557
1558/*************************************************
1559* Check for POSIX class syntax *
1560*************************************************/
1561
1562/* This function is called when the sequence "[:" or "[." or "[=" is
1563encountered in a character class. It checks whether this is followed by an
1564optional ^ and then a sequence of letters, terminated by a matching ":]" or
1565".]" or "=]".
1566
1567Argument:
1568 ptr pointer to the initial [
1569 endptr where to return the end pointer
1570 cd pointer to compile data
1571
1572Returns: TRUE or FALSE
1573*/
1574
1575static BOOL
1576check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1577{
1578int terminator; /* Don't combine these lines; the Solaris cc */
1579terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1580if (*(++ptr) == '^') ptr++;
1581while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1582if (*ptr == terminator && ptr[1] == ']')
1583 {
1584 *endptr = ptr;
1585 return TRUE;
1586 }
1587return FALSE;
1588}
1589
1590
1591
1592
1593/*************************************************
1594* Check POSIX class name *
1595*************************************************/
1596
1597/* This function is called to check the name given in a POSIX-style class entry
1598such as [:alnum:].
1599
1600Arguments:
1601 ptr points to the first letter
1602 len the length of the name
1603
1604Returns: a value representing the name, or -1 if unknown
1605*/
1606
1607static int
1608check_posix_name(const uschar *ptr, int len)
1609{
1610register int yield = 0;
1611while (posix_name_lengths[yield] != 0)
1612 {
1613 if (len == posix_name_lengths[yield] &&
1614 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1615 yield++;
1616 }
1617return -1;
1618}
1619
1620
1621/*************************************************
1622* Adjust OP_RECURSE items in repeated group *
1623*************************************************/
1624
1625/* OP_RECURSE items contain an offset from the start of the regex to the group
1626that is referenced. This means that groups can be replicated for fixed
1627repetition simply by copying (because the recursion is allowed to refer to
1628earlier groups that are outside the current group). However, when a group is
1629optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1630it, after it has been compiled. This means that any OP_RECURSE items within it
1631that refer to the group itself or any contained groups have to have their
6bf342e1
PH
1632offsets adjusted. That one of the jobs of this function. Before it is called,
1633the partially compiled regex must be temporarily terminated with OP_END.
1634
1635This function has been extended with the possibility of forward references for
1636recursions and subroutine calls. It must also check the list of such references
1637for the group we are dealing with. If it finds that one of the recursions in
1638the current group is on this list, it adjusts the offset in the list, not the
1639value in the reference (which is a group number).
8ac170f3
PH
1640
1641Arguments:
1642 group points to the start of the group
1643 adjust the amount by which the group is to be moved
1644 utf8 TRUE in UTF-8 mode
1645 cd contains pointers to tables etc.
6bf342e1 1646 save_hwm the hwm forward reference pointer at the start of the group
8ac170f3
PH
1647
1648Returns: nothing
1649*/
1650
1651static void
6bf342e1
PH
1652adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1653 uschar *save_hwm)
8ac170f3
PH
1654{
1655uschar *ptr = group;
1656while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1657 {
6bf342e1
PH
1658 int offset;
1659 uschar *hc;
1660
1661 /* See if this recursion is on the forward reference list. If so, adjust the
1662 reference. */
1663
1664 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1665 {
1666 offset = GET(hc, 0);
1667 if (cd->start_code + offset == ptr + 1)
1668 {
1669 PUT(hc, 0, offset + adjust);
1670 break;
1671 }
1672 }
1673
1674 /* Otherwise, adjust the recursion offset if it's after the start of this
1675 group. */
1676
1677 if (hc >= cd->hwm)
1678 {
1679 offset = GET(ptr, 1);
1680 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1681 }
1682
8ac170f3
PH
1683 ptr += 1 + LINK_SIZE;
1684 }
1685}
1686
1687
1688
1689/*************************************************
1690* Insert an automatic callout point *
1691*************************************************/
1692
1693/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1694callout points before each pattern item.
1695
1696Arguments:
1697 code current code pointer
1698 ptr current pattern pointer
1699 cd pointers to tables etc
1700
1701Returns: new code pointer
1702*/
1703
1704static uschar *
1705auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1706{
1707*code++ = OP_CALLOUT;
1708*code++ = 255;
1709PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1710PUT(code, LINK_SIZE, 0); /* Default length */
1711return code + 2*LINK_SIZE;
1712}
1713
1714
1715
1716/*************************************************
1717* Complete a callout item *
1718*************************************************/
1719
1720/* A callout item contains the length of the next item in the pattern, which
1721we can't fill in till after we have reached the relevant point. This is used
1722for both automatic and manual callouts.
1723
1724Arguments:
1725 previous_callout points to previous callout item
1726 ptr current pattern pointer
1727 cd pointers to tables etc
1728
1729Returns: nothing
1730*/
1731
1732static void
1733complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1734{
1735int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1736PUT(previous_callout, 2 + LINK_SIZE, length);
1737}
1738
1739
1740
1741#ifdef SUPPORT_UCP
1742/*************************************************
1743* Get othercase range *
1744*************************************************/
1745
1746/* This function is passed the start and end of a class range, in UTF-8 mode
1747with UCP support. It searches up the characters, looking for internal ranges of
1748characters in the "other" case. Each call returns the next one, updating the
1749start address.
1750
1751Arguments:
1752 cptr points to starting character value; updated
1753 d end value
1754 ocptr where to put start of othercase range
1755 odptr where to put end of othercase range
1756
1757Yield: TRUE when range returned; FALSE when no more
1758*/
1759
1760static BOOL
6bf342e1
PH
1761get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1762 unsigned int *odptr)
8ac170f3 1763{
6bf342e1 1764unsigned int c, othercase, next;
8ac170f3
PH
1765
1766for (c = *cptr; c <= d; c++)
6bf342e1 1767 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
8ac170f3
PH
1768
1769if (c > d) return FALSE;
1770
1771*ocptr = othercase;
1772next = othercase + 1;
1773
1774for (++c; c <= d; c++)
1775 {
aa41d2de 1776 if (_pcre_ucp_othercase(c) != next) break;
8ac170f3
PH
1777 next++;
1778 }
1779
1780*odptr = next - 1;
1781*cptr = c;
1782
1783return TRUE;
1784}
1785#endif /* SUPPORT_UCP */
1786
1787
6bf342e1
PH
1788
1789/*************************************************
1790* Check if auto-possessifying is possible *
1791*************************************************/
1792
1793/* This function is called for unlimited repeats of certain items, to see
1794whether the next thing could possibly match the repeated item. If not, it makes
1795sense to automatically possessify the repeated item.
1796
1797Arguments:
1798 op_code the repeated op code
1799 this data for this item, depends on the opcode
1800 utf8 TRUE in UTF-8 mode
1801 utf8_char used for utf8 character bytes, NULL if not relevant
1802 ptr next character in pattern
1803 options options bits
1804 cd contains pointers to tables etc.
1805
1806Returns: TRUE if possessifying is wanted
1807*/
1808
1809static BOOL
1810check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1811 const uschar *ptr, int options, compile_data *cd)
1812{
1813int next;
1814
1815/* Skip whitespace and comments in extended mode */
1816
1817if ((options & PCRE_EXTENDED) != 0)
1818 {
1819 for (;;)
1820 {
1821 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1822 if (*ptr == '#')
1823 {
1824 while (*(++ptr) != 0)
1825 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1826 }
1827 else break;
1828 }
1829 }
1830
1831/* If the next item is one that we can handle, get its value. A non-negative
1832value is a character, a negative value is an escape value. */
1833
1834if (*ptr == '\\')
1835 {
1836 int temperrorcode = 0;
1837 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1838 if (temperrorcode != 0) return FALSE;
1839 ptr++; /* Point after the escape sequence */
1840 }
1841
1842else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1843 {
1844#ifdef SUPPORT_UTF8
1845 if (utf8) { GETCHARINC(next, ptr); } else
1846#endif
1847 next = *ptr++;
1848 }
1849
1850else return FALSE;
1851
1852/* Skip whitespace and comments in extended mode */
1853
1854if ((options & PCRE_EXTENDED) != 0)
1855 {
1856 for (;;)
1857 {
1858 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1859 if (*ptr == '#')
1860 {
1861 while (*(++ptr) != 0)
1862 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1863 }
1864 else break;
1865 }
1866 }
1867
1868/* If the next thing is itself optional, we have to give up. */
1869
1870if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1871 return FALSE;
1872
1873/* Now compare the next item with the previous opcode. If the previous is a
1874positive single character match, "item" either contains the character or, if
1875"item" is greater than 127 in utf8 mode, the character's bytes are in
1876utf8_char. */
1877
1878
1879/* Handle cases when the next item is a character. */
1880
1881if (next >= 0) switch(op_code)
1882 {
1883 case OP_CHAR:
1884#ifdef SUPPORT_UTF8
1885 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1886#endif
1887 return item != next;
1888
1889 /* For CHARNC (caseless character) we must check the other case. If we have
1890 Unicode property support, we can use it to test the other case of
1891 high-valued characters. */
1892
1893 case OP_CHARNC:
1894#ifdef SUPPORT_UTF8
1895 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1896#endif
1897 if (item == next) return FALSE;
1898#ifdef SUPPORT_UTF8
1899 if (utf8)
1900 {
1901 unsigned int othercase;
1902 if (next < 128) othercase = cd->fcc[next]; else
1903#ifdef SUPPORT_UCP
1904 othercase = _pcre_ucp_othercase((unsigned int)next);
1905#else
1906 othercase = NOTACHAR;
1907#endif
1908 return (unsigned int)item != othercase;
1909 }
1910 else
1911#endif /* SUPPORT_UTF8 */
1912 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1913
1914 /* For OP_NOT, "item" must be a single-byte character. */
1915
1916 case OP_NOT:
1917 if (next < 0) return FALSE; /* Not a character */
1918 if (item == next) return TRUE;
1919 if ((options & PCRE_CASELESS) == 0) return FALSE;
1920#ifdef SUPPORT_UTF8
1921 if (utf8)
1922 {
1923 unsigned int othercase;
1924 if (next < 128) othercase = cd->fcc[next]; else
1925#ifdef SUPPORT_UCP
1926 othercase = _pcre_ucp_othercase(next);
1927#else
1928 othercase = NOTACHAR;
1929#endif
1930 return (unsigned int)item == othercase;
1931 }
1932 else
1933#endif /* SUPPORT_UTF8 */
1934 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1935
1936 case OP_DIGIT:
1937 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1938
1939 case OP_NOT_DIGIT:
1940 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1941
1942 case OP_WHITESPACE:
1943 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1944
1945 case OP_NOT_WHITESPACE:
1946 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1947
1948 case OP_WORDCHAR:
1949 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1950
1951 case OP_NOT_WORDCHAR:
1952 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1953
64f2600a
PH
1954 case OP_HSPACE:
1955 case OP_NOT_HSPACE:
1956 switch(next)
1957 {
1958 case 0x09:
1959 case 0x20:
1960 case 0xa0:
1961 case 0x1680:
1962 case 0x180e:
1963 case 0x2000:
1964 case 0x2001:
1965 case 0x2002:
1966 case 0x2003:
1967 case 0x2004:
1968 case 0x2005:
1969 case 0x2006:
1970 case 0x2007:
1971 case 0x2008:
1972 case 0x2009:
1973 case 0x200A:
1974 case 0x202f:
1975 case 0x205f:
1976 case 0x3000:
1977 return op_code != OP_HSPACE;
1978 default:
1979 return op_code == OP_HSPACE;
1980 }
1981
1982 case OP_VSPACE:
1983 case OP_NOT_VSPACE:
1984 switch(next)
1985 {
1986 case 0x0a:
1987 case 0x0b:
1988 case 0x0c:
1989 case 0x0d:
1990 case 0x85:
1991 case 0x2028:
1992 case 0x2029:
1993 return op_code != OP_VSPACE;
1994 default:
1995 return op_code == OP_VSPACE;
1996 }
1997
6bf342e1
PH
1998 default:
1999 return FALSE;
2000 }
2001
2002
2003/* Handle the case when the next item is \d, \s, etc. */
2004
2005switch(op_code)
2006 {
2007 case OP_CHAR:
2008 case OP_CHARNC:
2009#ifdef SUPPORT_UTF8
2010 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2011#endif
2012 switch(-next)
2013 {
2014 case ESC_d:
2015 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2016
2017 case ESC_D:
2018 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2019
2020 case ESC_s:
2021 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2022
2023 case ESC_S:
2024 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2025
2026 case ESC_w:
2027 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2028
2029 case ESC_W:
2030 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2031
64f2600a
PH
2032 case ESC_h:
2033 case ESC_H:
2034 switch(item)
2035 {
2036 case 0x09:
2037 case 0x20:
2038 case 0xa0:
2039 case 0x1680:
2040 case 0x180e:
2041 case 0x2000:
2042 case 0x2001:
2043 case 0x2002:
2044 case 0x2003:
2045 case 0x2004:
2046 case 0x2005:
2047 case 0x2006:
2048 case 0x2007:
2049 case 0x2008:
2050 case 0x2009:
2051 case 0x200A:
2052 case 0x202f:
2053 case 0x205f:
2054 case 0x3000:
2055 return -next != ESC_h;
2056 default:
2057 return -next == ESC_h;
2058 }
2059
2060 case ESC_v:
2061 case ESC_V:
2062 switch(item)
2063 {
2064 case 0x0a:
2065 case 0x0b:
2066 case 0x0c:
2067 case 0x0d:
2068 case 0x85:
2069 case 0x2028:
2070 case 0x2029:
2071 return -next != ESC_v;
2072 default:
2073 return -next == ESC_v;
2074 }
2075
6bf342e1
PH
2076 default:
2077 return FALSE;
2078 }
2079
2080 case OP_DIGIT:
64f2600a
PH
2081 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2082 next == -ESC_h || next == -ESC_v;
6bf342e1
PH
2083
2084 case OP_NOT_DIGIT:
2085 return next == -ESC_d;
2086
2087 case OP_WHITESPACE:
2088 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2089
2090 case OP_NOT_WHITESPACE:
64f2600a
PH
2091 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2092
2093 case OP_HSPACE:
2094 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2095
2096 case OP_NOT_HSPACE:
2097 return next == -ESC_h;
2098
2099 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2100 case OP_VSPACE:
2101 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2102
2103 case OP_NOT_VSPACE:
2104 return next == -ESC_v;
6bf342e1
PH
2105
2106 case OP_WORDCHAR:
64f2600a 2107 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
6bf342e1
PH
2108
2109 case OP_NOT_WORDCHAR:
2110 return next == -ESC_w || next == -ESC_d;
2111
2112 default:
2113 return FALSE;
2114 }
2115
2116/* Control does not reach here */
2117}
2118
2119
2120
8ac170f3
PH
2121/*************************************************
2122* Compile one branch *
2123*************************************************/
2124
6bf342e1 2125/* Scan the pattern, compiling it into a vector. If the options are
8ac170f3 2126changed during the branch, the pointer is used to change the external options
6bf342e1
PH
2127bits. This function is used during the pre-compile phase when we are trying
2128to find out the amount of memory needed, as well as during the real compile
2129phase. The value of lengthptr distinguishes the two phases.
8ac170f3
PH
2130
2131Arguments:
2132 optionsptr pointer to the option bits
8ac170f3
PH
2133 codeptr points to the pointer to the current code point
2134 ptrptr points to the current pattern pointer
2135 errorcodeptr points to error code variable
2136 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2137 reqbyteptr set to the last literal character required, else < 0
2138 bcptr points to current branch chain
2139 cd contains pointers to tables etc.
6bf342e1
PH
2140 lengthptr NULL during the real compile phase
2141 points to length accumulator during pre-compile phase
8ac170f3
PH
2142
2143Returns: TRUE on success
2144 FALSE, with *errorcodeptr set non-zero on error
2145*/
2146
2147static BOOL
6bf342e1
PH
2148compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2149 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2150 compile_data *cd, int *lengthptr)
8ac170f3
PH
2151{
2152int repeat_type, op_type;
2153int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2154int bravalue = 0;
2155int greedy_default, greedy_non_default;
2156int firstbyte, reqbyte;
2157int zeroreqbyte, zerofirstbyte;
2158int req_caseopt, reqvary, tempreqvary;
8ac170f3
PH
2159int options = *optionsptr;
2160int after_manual_callout = 0;
6bf342e1 2161int length_prevgroup = 0;
8ac170f3
PH
2162register int c;
2163register uschar *code = *codeptr;
6bf342e1
PH
2164uschar *last_code = code;
2165uschar *orig_code = code;
8ac170f3
PH
2166uschar *tempcode;
2167BOOL inescq = FALSE;
2168BOOL groupsetfirstbyte = FALSE;
2169const uschar *ptr = *ptrptr;
2170const uschar *tempptr;
2171uschar *previous = NULL;
2172uschar *previous_callout = NULL;
6bf342e1 2173uschar *save_hwm = NULL;
8ac170f3
PH
2174uschar classbits[32];
2175
2176#ifdef SUPPORT_UTF8
2177BOOL class_utf8;
2178BOOL utf8 = (options & PCRE_UTF8) != 0;
2179uschar *class_utf8data;
2180uschar utf8_char[6];
2181#else
2182BOOL utf8 = FALSE;
6bf342e1
PH
2183uschar *utf8_char = NULL;
2184#endif
2185
2186#ifdef DEBUG
2187if (lengthptr != NULL) DPRINTF((">> start branch\n"));
8ac170f3
PH
2188#endif
2189
2190/* Set up the default and non-default settings for greediness */
2191
2192greedy_default = ((options & PCRE_UNGREEDY) != 0);
2193greedy_non_default = greedy_default ^ 1;
2194
2195/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2196matching encountered yet". It gets changed to REQ_NONE if we hit something that
2197matches a non-fixed first char; reqbyte just remains unset if we never
2198find one.
2199
2200When we hit a repeat whose minimum is zero, we may have to adjust these values
2201to take the zero repeat into account. This is implemented by setting them to
2202zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2203item types that can be repeated set these backoff variables appropriately. */
2204
2205firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2206
2207/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2208according to the current setting of the caseless flag. REQ_CASELESS is a bit
2209value > 255. It is added into the firstbyte or reqbyte variables to record the
2210case status of the value. This is used only for ASCII characters. */
2211
2212req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2213
2214/* Switch on next character until the end of the branch */
2215
2216for (;; ptr++)
2217 {
2218 BOOL negate_class;
2219 BOOL possessive_quantifier;
2220 BOOL is_quantifier;
6bf342e1 2221 BOOL is_recurse;
64f2600a 2222 BOOL reset_bracount;
8ac170f3
PH
2223 int class_charcount;
2224 int class_lastchar;
2225 int newoptions;
2226 int recno;
64f2600a 2227 int refsign;
8ac170f3
PH
2228 int skipbytes;
2229 int subreqbyte;
2230 int subfirstbyte;
6bf342e1 2231 int terminator;
8ac170f3
PH
2232 int mclength;
2233 uschar mcbuffer[8];
2234
6bf342e1 2235 /* Get next byte in the pattern */
8ac170f3
PH
2236
2237 c = *ptr;
2238
6bf342e1
PH
2239 /* If we are in the pre-compile phase, accumulate the length used for the
2240 previous cycle of this loop. */
2241
2242 if (lengthptr != NULL)
2243 {
2244#ifdef DEBUG
2245 if (code > cd->hwm) cd->hwm = code; /* High water info */
2246#endif
2247 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2248 {
2249 *errorcodeptr = ERR52;
2250 goto FAILED;
2251 }
2252
2253 /* There is at least one situation where code goes backwards: this is the
2254 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2255 the class is simply eliminated. However, it is created first, so we have to
2256 allow memory for it. Therefore, don't ever reduce the length at this point.
2257 */
2258
2259 if (code < last_code) code = last_code;
2260 *lengthptr += code - last_code;
2261 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2262
2263 /* If "previous" is set and it is not at the start of the work space, move
2264 it back to there, in order to avoid filling up the work space. Otherwise,
2265 if "previous" is NULL, reset the current code pointer to the start. */
2266
2267 if (previous != NULL)
2268 {
2269 if (previous > orig_code)
2270 {
2271 memmove(orig_code, previous, code - previous);
2272 code -= previous - orig_code;
2273 previous = orig_code;
2274 }
2275 }
2276 else code = orig_code;
2277
2278 /* Remember where this code item starts so we can pick up the length
2279 next time round. */
2280
2281 last_code = code;
2282 }
2283
2284 /* In the real compile phase, just check the workspace used by the forward
2285 reference list. */
2286
2287 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2288 {
2289 *errorcodeptr = ERR52;
2290 goto FAILED;
2291 }
2292
8ac170f3
PH
2293 /* If in \Q...\E, check for the end; if not, we have a literal */
2294
2295 if (inescq && c != 0)
2296 {
2297 if (c == '\\' && ptr[1] == 'E')
2298 {
2299 inescq = FALSE;
2300 ptr++;
2301 continue;
2302 }
2303 else
2304 {
2305 if (previous_callout != NULL)
2306 {
6bf342e1
PH
2307 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2308 complete_callout(previous_callout, ptr, cd);
8ac170f3
PH
2309 previous_callout = NULL;
2310 }
2311 if ((options & PCRE_AUTO_CALLOUT) != 0)
2312 {
2313 previous_callout = code;
2314 code = auto_callout(code, ptr, cd);
2315 }
2316 goto NORMAL_CHAR;
2317 }
2318 }
2319
2320 /* Fill in length of a previous callout, except when the next thing is
2321 a quantifier. */
2322
2323 is_quantifier = c == '*' || c == '+' || c == '?' ||
2324 (c == '{' && is_counted_repeat(ptr+1));
2325
2326 if (!is_quantifier && previous_callout != NULL &&
2327 after_manual_callout-- <= 0)
2328 {
6bf342e1
PH
2329 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2330 complete_callout(previous_callout, ptr, cd);
8ac170f3
PH
2331 previous_callout = NULL;
2332 }
2333
2334 /* In extended mode, skip white space and comments */
2335
2336 if ((options & PCRE_EXTENDED) != 0)
2337 {
2338 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2339 if (c == '#')
2340 {
6bf342e1 2341 while (*(++ptr) != 0)
aa41d2de 2342 {
6bf342e1 2343 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
aa41d2de 2344 }
6bf342e1
PH
2345 if (*ptr != 0) continue;
2346
aa41d2de
PH
2347 /* Else fall through to handle end of string */
2348 c = 0;
8ac170f3
PH
2349 }
2350 }
2351
2352 /* No auto callout for quantifiers. */
2353
2354 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2355 {
2356 previous_callout = code;
2357 code = auto_callout(code, ptr, cd);
2358 }
2359
2360 switch(c)
2361 {
6bf342e1
PH
2362 /* ===================================================================*/
2363 case 0: /* The branch terminates at string end */
2364 case '|': /* or | or ) */
8ac170f3
PH
2365 case ')':
2366 *firstbyteptr = firstbyte;
2367 *reqbyteptr = reqbyte;
2368 *codeptr = code;
2369 *ptrptr = ptr;
6bf342e1
PH
2370 if (lengthptr != NULL)
2371 {
2372 *lengthptr += code - last_code; /* To include callout length */
2373 DPRINTF((">> end branch\n"));
2374 }
8ac170f3
PH
2375 return TRUE;
2376
6bf342e1
PH
2377
2378 /* ===================================================================*/
8ac170f3
PH
2379 /* Handle single-character metacharacters. In multiline mode, ^ disables
2380 the setting of any following char as a first character. */
2381
2382 case '^':
2383 if ((options & PCRE_MULTILINE) != 0)
2384 {
2385 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2386 }
2387 previous = NULL;
2388 *code++ = OP_CIRC;
2389 break;
2390
2391 case '$':
2392 previous = NULL;
2393 *code++ = OP_DOLL;
2394 break;
2395
2396 /* There can never be a first char if '.' is first, whatever happens about
2397 repeats. The value of reqbyte doesn't change either. */
2398
2399 case '.':
2400 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2401 zerofirstbyte = firstbyte;
2402 zeroreqbyte = reqbyte;
2403 previous = code;
2404 *code++ = OP_ANY;
2405 break;
2406
6bf342e1
PH
2407
2408 /* ===================================================================*/
aa41d2de
PH
2409 /* Character classes. If the included characters are all < 256, we build a
2410 32-byte bitmap of the permitted characters, except in the special case
2411 where there is only one such character. For negated classes, we build the
2412 map as usual, then invert it at the end. However, we use a different opcode
2413 so that data characters > 255 can be handled correctly.
8ac170f3
PH
2414
2415 If the class contains characters outside the 0-255 range, a different
2416 opcode is compiled. It may optionally have a bit map for characters < 256,
2417 but those above are explicitly listed afterwards. A flag byte tells
2418 whether the bitmap is present, and whether this is a negated class or not.
2419 */
2420
2421 case '[':
2422 previous = code;
2423
2424 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2425 they are encountered at the top level, so we'll do that too. */
2426
2427 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2428 check_posix_syntax(ptr, &tempptr, cd))
2429 {
2430 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2431 goto FAILED;
2432 }
2433
2434 /* If the first character is '^', set the negation flag and skip it. */
2435
2436 if ((c = *(++ptr)) == '^')
2437 {
2438 negate_class = TRUE;
2439 c = *(++ptr);
2440 }
2441 else
2442 {
2443 negate_class = FALSE;
2444 }
2445
2446 /* Keep a count of chars with values < 256 so that we can optimize the case
6bf342e1
PH
2447 of just a single character (as long as it's < 256). However, for higher
2448 valued UTF-8 characters, we don't yet do any optimization. */
8ac170f3
PH
2449
2450 class_charcount = 0;
2451 class_lastchar = -1;
2452
6bf342e1
PH
2453 /* Initialize the 32-char bit map to all zeros. We build the map in a
2454 temporary bit of memory, in case the class contains only 1 character (less
2455 than 256), because in that case the compiled code doesn't use the bit map.
2456 */
2457
2458 memset(classbits, 0, 32 * sizeof(uschar));
2459
8ac170f3
PH
2460#ifdef SUPPORT_UTF8
2461 class_utf8 = FALSE; /* No chars >= 256 */
6bf342e1 2462 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
8ac170f3
PH
2463#endif
2464
8ac170f3 2465 /* Process characters until ] is reached. By writing this as a "do" it
6bf342e1
PH
2466 means that an initial ] is taken as a data character. At the start of the
2467 loop, c contains the first byte of the character. */
8ac170f3 2468
6bf342e1 2469 if (c != 0) do
8ac170f3 2470 {
6bf342e1
PH
2471 const uschar *oldptr;
2472
8ac170f3
PH
2473#ifdef SUPPORT_UTF8
2474 if (utf8 && c > 127)
2475 { /* Braces are required because the */
2476 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2477 }
2478#endif
2479
2480 /* Inside \Q...\E everything is literal except \E */
2481
2482 if (inescq)
2483 {
6bf342e1 2484 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
8ac170f3 2485 {
6bf342e1
PH
2486 inescq = FALSE; /* Reset literal state */
2487 ptr++; /* Skip the 'E' */
2488 continue; /* Carry on with next */
8ac170f3 2489 }
6bf342e1 2490 goto CHECK_RANGE; /* Could be range if \E follows */
8ac170f3
PH
2491 }
2492
2493 /* Handle POSIX class names. Perl allows a negation extension of the
2494 form [:^name:]. A square bracket that doesn't match the syntax is
2495 treated as a literal. We also recognize the POSIX constructions
2496 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2497 5.6 and 5.8 do. */
2498
2499 if (c == '[' &&
2500 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2501 check_posix_syntax(ptr, &tempptr, cd))
2502 {
2503 BOOL local_negate = FALSE;
aa41d2de 2504 int posix_class, taboffset, tabopt;
8ac170f3 2505 register const uschar *cbits = cd->cbits;
aa41d2de 2506 uschar pbits[32];
8ac170f3
PH
2507
2508 if (ptr[1] != ':')
2509 {
2510 *errorcodeptr = ERR31;
2511 goto FAILED;
2512 }
2513
2514 ptr += 2;
2515 if (*ptr == '^')
2516 {
2517 local_negate = TRUE;
2518 ptr++;
2519 }
2520
2521 posix_class = check_posix_name(ptr, tempptr - ptr);
2522 if (posix_class < 0)
2523 {
2524 *errorcodeptr = ERR30;
2525 goto FAILED;
2526 }
2527
2528 /* If matching is caseless, upper and lower are converted to
2529 alpha. This relies on the fact that the class table starts with
2530 alpha, lower, upper as the first 3 entries. */
2531
2532 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2533 posix_class = 0;
2534
aa41d2de
PH
2535 /* We build the bit map for the POSIX class in a chunk of local store
2536 because we may be adding and subtracting from it, and we don't want to
2537 subtract bits that may be in the main map already. At the end we OR the
2538 result into the bit map that is being built. */
8ac170f3
PH
2539
2540 posix_class *= 3;
aa41d2de
PH
2541
2542 /* Copy in the first table (always present) */
2543
2544 memcpy(pbits, cbits + posix_class_maps[posix_class],
2545 32 * sizeof(uschar));
2546
2547 /* If there is a second table, add or remove it as required. */
2548
2549 taboffset = posix_class_maps[posix_class + 1];
2550 tabopt = posix_class_maps[posix_class + 2];
2551
2552 if (taboffset >= 0)
8ac170f3 2553 {
aa41d2de
PH
2554 if (tabopt >= 0)
2555 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
8ac170f3 2556 else
aa41d2de 2557 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
8ac170f3
PH
2558 }
2559
aa41d2de
PH
2560 /* Not see if we need to remove any special characters. An option
2561 value of 1 removes vertical space and 2 removes underscore. */
2562
2563 if (tabopt < 0) tabopt = -tabopt;
2564 if (tabopt == 1) pbits[1] &= ~0x3c;
2565 else if (tabopt == 2) pbits[11] &= 0x7f;
2566
2567 /* Add the POSIX table or its complement into the main table that is
2568 being built and we are done. */
2569
2570 if (local_negate)
2571 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2572 else
2573 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2574
8ac170f3
PH
2575 ptr = tempptr + 1;
2576 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2577 continue; /* End of POSIX syntax handling */
2578 }
2579
2580 /* Backslash may introduce a single character, or it may introduce one
6bf342e1
PH
2581 of the specials, which just set a flag. The sequence \b is a special
2582 case. Inside a class (and only there) it is treated as backspace.
2583 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2584 to OR into the one we are building. We assume they have more than one
8ac170f3
PH
2585 character in them, so set class_charcount bigger than one. */
2586
2587 if (c == '\\')
2588 {
6bf342e1
PH
2589 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2590 if (*errorcodeptr != 0) goto FAILED;
8ac170f3
PH
2591
2592 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2593 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
6bf342e1 2594 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
8ac170f3
PH
2595 else if (-c == ESC_Q) /* Handle start of quoted string */
2596 {
2597 if (ptr[1] == '\\' && ptr[2] == 'E')
2598 {
2599 ptr += 2; /* avoid empty string */
2600 }
2601 else inescq = TRUE;
2602 continue;
2603 }
2604
2605 if (c < 0)
2606 {
2607 register const uschar *cbits = cd->cbits;
2608 class_charcount += 2; /* Greater than 1 is what matters */
6bf342e1
PH
2609
2610 /* Save time by not doing this in the pre-compile phase. */
2611
2612 if (lengthptr == NULL) switch (-c)
8ac170f3
PH
2613 {
2614 case ESC_d:
2615 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2616 continue;
2617
2618 case ESC_D:
2619 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2620 continue;
2621
2622 case ESC_w:
2623 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2624 continue;
2625
2626 case ESC_W:
2627 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2628 continue;
2629
2630 case ESC_s:
2631 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2632 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2633 continue;
2634
2635 case ESC_S:
2636 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2637 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2638 continue;
2639
6bf342e1 2640 case ESC_E: /* Perl ignores an orphan \E */
8ac170f3 2641 continue;
8ac170f3 2642
6bf342e1
PH
2643 default: /* Not recognized; fall through */
2644 break; /* Need "default" setting to stop compiler warning. */
8ac170f3 2645 }
8ac170f3 2646
6bf342e1 2647 /* In the pre-compile phase, just do the recognition. */
8ac170f3 2648
6bf342e1
PH
2649 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2650 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2651
64f2600a
PH
2652 /* We need to deal with \H, \h, \V, and \v in both phases because
2653 they use extra memory. */
2654
2655 if (-c == ESC_h)
2656 {
2657 SETBIT(classbits, 0x09); /* VT */
2658 SETBIT(classbits, 0x20); /* SPACE */
2659 SETBIT(classbits, 0xa0); /* NSBP */
2660#ifdef SUPPORT_UTF8
2661 if (utf8)
2662 {
2663 class_utf8 = TRUE;
2664 *class_utf8data++ = XCL_SINGLE;
2665 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2666 *class_utf8data++ = XCL_SINGLE;
2667 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2668 *class_utf8data++ = XCL_RANGE;
2669 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2670 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2671 *class_utf8data++ = XCL_SINGLE;
2672 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2673 *class_utf8data++ = XCL_SINGLE;
2674 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2675 *class_utf8data++ = XCL_SINGLE;
2676 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2677 }
2678#endif
2679 continue;
2680 }
2681
2682 if (-c == ESC_H)
2683 {
2684 for (c = 0; c < 32; c++)
2685 {
2686 int x = 0xff;
2687 switch (c)
2688 {
2689 case 0x09/8: x ^= 1 << (0x09%8); break;
2690 case 0x20/8: x ^= 1 << (0x20%8); break;
2691 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2692 default: break;
2693 }
2694 classbits[c] |= x;
2695 }
2696
2697#ifdef SUPPORT_UTF8
2698 if (utf8)
2699 {
2700 class_utf8 = TRUE;
2701 *class_utf8data++ = XCL_RANGE;
2702 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2703 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2704 *class_utf8data++ = XCL_RANGE;
2705 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2706 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2707 *class_utf8data++ = XCL_RANGE;
2708 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2709 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2710 *class_utf8data++ = XCL_RANGE;
2711 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2712 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2713 *class_utf8data++ = XCL_RANGE;
2714 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2715 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2716 *class_utf8data++ = XCL_RANGE;
2717 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2718 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2719 *class_utf8data++ = XCL_RANGE;
2720 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2721 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2722 }
2723#endif
2724 continue;
2725 }
2726
2727 if (-c == ESC_v)
2728 {
2729 SETBIT(classbits, 0x0a); /* LF */
2730 SETBIT(classbits, 0x0b); /* VT */
2731 SETBIT(classbits, 0x0c); /* FF */
2732 SETBIT(classbits, 0x0d); /* CR */
2733 SETBIT(classbits, 0x85); /* NEL */
2734#ifdef SUPPORT_UTF8
2735 if (utf8)
2736 {
2737 class_utf8 = TRUE;
2738 *class_utf8data++ = XCL_RANGE;
2739 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2740 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2741 }
2742#endif
2743 continue;
2744 }
2745
2746 if (-c == ESC_V)
2747 {
2748 for (c = 0; c < 32; c++)
2749 {
2750 int x = 0xff;
2751 switch (c)
2752 {
2753 case 0x0a/8: x ^= 1 << (0x0a%8);
2754 x ^= 1 << (0x0b%8);
2755 x ^= 1 << (0x0c%8);
2756 x ^= 1 << (0x0d%8);
2757 break;
2758 case 0x85/8: x ^= 1 << (0x85%8); break;
2759 default: break;
2760 }
2761 classbits[c] |= x;
2762 }
2763
2764#ifdef SUPPORT_UTF8
2765 if (utf8)
2766 {
2767 class_utf8 = TRUE;
2768 *class_utf8data++ = XCL_RANGE;
2769 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2770 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2771 *class_utf8data++ = XCL_RANGE;
2772 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2773 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2774 }
2775#endif
2776 continue;
2777 }
2778
6bf342e1
PH
2779 /* We need to deal with \P and \p in both phases. */
2780
2781#ifdef SUPPORT_UCP
2782 if (-c == ESC_p || -c == ESC_P)
2783 {
2784 BOOL negated;
2785 int pdata;
2786 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2787 if (ptype < 0) goto FAILED;
2788 class_utf8 = TRUE;
2789 *class_utf8data++ = ((-c == ESC_p) != negated)?
2790 XCL_PROP : XCL_NOTPROP;
2791 *class_utf8data++ = ptype;
2792 *class_utf8data++ = pdata;
2793 class_charcount -= 2; /* Not a < 256 character */
2794 continue;
2795 }
2796#endif
2797 /* Unrecognized escapes are faulted if PCRE is running in its
2798 strict mode. By default, for compatibility with Perl, they are
2799 treated as literals. */
2800
2801 if ((options & PCRE_EXTRA) != 0)
2802 {
2803 *errorcodeptr = ERR7;
2804 goto FAILED;
2805 }
2806
2807 class_charcount -= 2; /* Undo the default count from above */
2808 c = *ptr; /* Get the final character and fall through */
2809 }
2810
2811 /* Fall through if we have a single character (c >= 0). This may be
2812 greater than 256 in UTF-8 mode. */
2813
2814 } /* End of backslash handling */
8ac170f3
PH
2815
2816 /* A single character may be followed by '-' to form a range. However,
2817 Perl does not permit ']' to be the end of the range. A '-' character
6bf342e1
PH
2818 at the end is treated as a literal. Perl ignores orphaned \E sequences
2819 entirely. The code for handling \Q and \E is messy. */
2820
2821 CHECK_RANGE:
2822 while (ptr[1] == '\\' && ptr[2] == 'E')
2823 {
2824 inescq = FALSE;
2825 ptr += 2;
2826 }
2827
2828 oldptr = ptr;
8ac170f3 2829
6bf342e1 2830 if (!inescq && ptr[1] == '-')
8ac170f3
PH
2831 {
2832 int d;
2833 ptr += 2;
6bf342e1
PH
2834 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2835
2836 /* If we hit \Q (not followed by \E) at this point, go into escaped
2837 mode. */
2838
2839 while (*ptr == '\\' && ptr[1] == 'Q')
2840 {
2841 ptr += 2;
2842 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2843 inescq = TRUE;
2844 break;
2845 }
2846
2847 if (*ptr == 0 || (!inescq && *ptr == ']'))
2848 {
2849 ptr = oldptr;
2850 goto LONE_SINGLE_CHARACTER;
2851 }
8ac170f3
PH
2852
2853#ifdef SUPPORT_UTF8
2854 if (utf8)
2855 { /* Braces are required because the */
2856 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2857 }
2858 else
2859#endif
2860 d = *ptr; /* Not UTF-8 mode */
2861
2862 /* The second part of a range can be a single-character escape, but
2863 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2864 in such circumstances. */
2865
6bf342e1 2866 if (!inescq && d == '\\')
8ac170f3 2867 {
6bf342e1
PH
2868 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2869 if (*errorcodeptr != 0) goto FAILED;
8ac170f3 2870
6bf342e1
PH
2871 /* \b is backspace; \X is literal X; \R is literal R; any other
2872 special means the '-' was literal */
8ac170f3
PH
2873
2874 if (d < 0)
2875 {
2876 if (d == -ESC_b) d = '\b';
6bf342e1
PH
2877 else if (d == -ESC_X) d = 'X';
2878 else if (d == -ESC_R) d = 'R'; else
8ac170f3 2879 {
6bf342e1 2880 ptr = oldptr;
8ac170f3
PH
2881 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2882 }
2883 }
2884 }
2885
6bf342e1
PH
2886 /* Check that the two values are in the correct order. Optimize
2887 one-character ranges */
2888
2889 if (d < c)
2890 {
2891 *errorcodeptr = ERR8;
2892 goto FAILED;
2893 }
8ac170f3
PH
2894
2895 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2896
2897 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2898 matching, we have to use an XCLASS with extra data items. Caseless
2899 matching for characters > 127 is available only if UCP support is
2900 available. */
2901
2902#ifdef SUPPORT_UTF8
2903 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2904 {
2905 class_utf8 = TRUE;
2906
2907 /* With UCP support, we can find the other case equivalents of
2908 the relevant characters. There may be several ranges. Optimize how
2909 they fit with the basic range. */
2910
2911#ifdef SUPPORT_UCP
2912 if ((options & PCRE_CASELESS) != 0)
2913 {
6bf342e1
PH
2914 unsigned int occ, ocd;
2915 unsigned int cc = c;
2916 unsigned int origd = d;
8ac170f3
PH
2917 while (get_othercase_range(&cc, origd, &occ, &ocd))
2918 {
64f2600a
PH
2919 if (occ >= (unsigned int)c &&
2920 ocd <= (unsigned int)d)
2921 continue; /* Skip embedded ranges */
8ac170f3 2922
64f2600a
PH
2923 if (occ < (unsigned int)c &&
2924 ocd >= (unsigned int)c - 1) /* Extend the basic range */
8ac170f3
PH
2925 { /* if there is overlap, */
2926 c = occ; /* noting that if occ < c */
2927 continue; /* we can't have ocd > d */
2928 } /* because a subrange is */
64f2600a
PH
2929 if (ocd > (unsigned int)d &&
2930 occ <= (unsigned int)d + 1) /* always shorter than */
8ac170f3
PH
2931 { /* the basic range. */
2932 d = ocd;
2933 continue;
2934 }
2935
2936 if (occ == ocd)
2937 {
2938 *class_utf8data++ = XCL_SINGLE;
2939 }
2940 else
2941 {
2942 *class_utf8data++ = XCL_RANGE;
2943 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2944 }
2945 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2946 }
2947 }
2948#endif /* SUPPORT_UCP */
2949
2950 /* Now record the original range, possibly modified for UCP caseless
2951 overlapping ranges. */
2952
2953 *class_utf8data++ = XCL_RANGE;
2954 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2955 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2956
2957 /* With UCP support, we are done. Without UCP support, there is no
2958 caseless matching for UTF-8 characters > 127; we can use the bit map
2959 for the smaller ones. */
2960
2961#ifdef SUPPORT_UCP
2962 continue; /* With next character in the class */
2963#else
2964 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2965
2966 /* Adjust upper limit and fall through to set up the map */
2967
2968 d = 127;
2969
2970#endif /* SUPPORT_UCP */
2971 }
2972#endif /* SUPPORT_UTF8 */
2973
2974 /* We use the bit map for all cases when not in UTF-8 mode; else
2975 ranges that lie entirely within 0-127 when there is UCP support; else
2976 for partial ranges without UCP support. */
2977
6bf342e1
PH
2978 class_charcount += d - c + 1;
2979 class_lastchar = d;
2980
2981 /* We can save a bit of time by skipping this in the pre-compile. */
2982
2983 if (lengthptr == NULL) for (; c <= d; c++)
8ac170f3
PH
2984 {
2985 classbits[c/8] |= (1 << (c&7));
2986 if ((options & PCRE_CASELESS) != 0)
2987 {
2988 int uc = cd->fcc[c]; /* flip case */
2989 classbits[uc/8] |= (1 << (uc&7));
2990 }
8ac170f3
PH
2991 }
2992
2993 continue; /* Go get the next char in the class */
2994 }
2995
2996 /* Handle a lone single character - we can get here for a normal
2997 non-escape char, or after \ that introduces a single character or for an
2998 apparent range that isn't. */
2999
3000 LONE_SINGLE_CHARACTER:
3001
3002 /* Handle a character that cannot go in the bit map */
3003
3004#ifdef SUPPORT_UTF8
3005 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3006 {
3007 class_utf8 = TRUE;
3008 *class_utf8data++ = XCL_SINGLE;
3009 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3010
3011#ifdef SUPPORT_UCP
3012 if ((options & PCRE_CASELESS) != 0)
3013 {
6bf342e1
PH
3014 unsigned int othercase;
3015 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
8ac170f3
PH
3016 {
3017 *class_utf8data++ = XCL_SINGLE;
3018 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3019 }
3020 }
3021#endif /* SUPPORT_UCP */
3022
3023 }
3024 else
3025#endif /* SUPPORT_UTF8 */
3026
3027 /* Handle a single-byte character */
3028 {
3029 classbits[c/8] |= (1 << (c&7));
3030 if ((options & PCRE_CASELESS) != 0)
3031 {
3032 c = cd->fcc[c]; /* flip case */
3033 classbits[c/8] |= (1 << (c&7));
3034 }
3035 class_charcount++;
3036 class_lastchar = c;
3037 }
3038 }
3039
6bf342e1 3040 /* Loop until ']' reached. This "while" is the end of the "do" above. */
8ac170f3 3041
6bf342e1
PH
3042 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3043
3044 if (c == 0) /* Missing terminating ']' */
3045 {
3046 *errorcodeptr = ERR6;
3047 goto FAILED;
3048 }
8ac170f3
PH
3049
3050 /* If class_charcount is 1, we saw precisely one character whose value is
3051 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3052 can optimize the negative case only if there were no characters >= 128
3053 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3054 single-bytes only. This is an historical hangover. Maybe one day we can
3055 tidy these opcodes to handle multi-byte characters.
3056
3057 The optimization throws away the bit map. We turn the item into a
3058 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3059 that OP_NOT does not support multibyte characters. In the positive case, it
3060 can cause firstbyte to be set. Otherwise, there can be no first char if
3061 this item is first, whatever repeat count may follow. In the case of
3062 reqbyte, save the previous value for reinstating. */
3063
3064#ifdef SUPPORT_UTF8
3065 if (class_charcount == 1 &&
3066 (!utf8 ||
3067 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3068
3069#else
3070 if (class_charcount == 1)
3071#endif
3072 {
3073 zeroreqbyte = reqbyte;
3074
3075 /* The OP_NOT opcode works on one-byte characters only. */
3076
3077 if (negate_class)
3078 {
3079 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3080 zerofirstbyte = firstbyte;
3081 *code++ = OP_NOT;
3082 *code++ = class_lastchar;
3083 break;
3084 }
3085
3086 /* For a single, positive character, get the value into mcbuffer, and
3087 then we can handle this with the normal one-character code. */
3088
3089#ifdef SUPPORT_UTF8
3090 if (utf8 && class_lastchar > 127)
3091 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3092 else
3093#endif
3094 {
3095 mcbuffer[0] = class_lastchar;
3096 mclength = 1;
3097 }
3098 goto ONE_CHAR;
3099 } /* End of 1-char optimization */
3100
3101 /* The general case - not the one-char optimization. If this is the first
3102 thing in the branch, there can be no first char setting, whatever the
3103 repeat count. Any reqbyte setting must remain unchanged after any kind of
3104 repeat. */
3105
3106 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3107 zerofirstbyte = firstbyte;
3108 zeroreqbyte = reqbyte;
3109
3110 /* If there are characters with values > 255, we have to compile an
3111 extended class, with its own opcode. If there are no characters < 256,
6bf342e1 3112 we can omit the bitmap in the actual compiled code. */
8ac170f3
PH
3113
3114#ifdef SUPPORT_UTF8
3115 if (class_utf8)
3116 {
3117 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3118 *code++ = OP_XCLASS;
3119 code += LINK_SIZE;
3120 *code = negate_class? XCL_NOT : 0;
3121
6bf342e1
PH
3122 /* If the map is required, move up the extra data to make room for it;
3123 otherwise just move the code pointer to the end of the extra data. */
8ac170f3
PH
3124
3125 if (class_charcount > 0)
3126 {
3127 *code++ |= XCL_MAP;
6bf342e1 3128 memmove(code + 32, code, class_utf8data - code);
8ac170f3 3129 memcpy(code, classbits, 32);
6bf342e1 3130 code = class_utf8data + 32;
8ac170f3 3131 }
6bf342e1 3132 else code = class_utf8data;
8ac170f3
PH
3133
3134 /* Now fill in the complete length of the item */
3135
3136 PUT(previous, 1, code - previous);
3137 break; /* End of class handling */
3138 }
3139#endif
3140
3141 /* If there are no characters > 255, negate the 32-byte map if necessary,
3142 and copy it into the code vector. If this is the first thing in the branch,
3143 there can be no first char setting, whatever the repeat count. Any reqbyte
3144 setting must remain unchanged after any kind of repeat. */
3145
3146 if (negate_class)
3147 {
3148 *code++ = OP_NCLASS;
6bf342e1
PH
3149 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3150 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
8ac170f3
PH
3151 }
3152 else
3153 {
3154 *code++ = OP_CLASS;
3155 memcpy(code, classbits, 32);
3156 }
3157 code += 32;
3158 break;
3159
6bf342e1
PH
3160
3161 /* ===================================================================*/
8ac170f3
PH
3162 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3163 has been tested above. */
3164
3165 case '{':
3166 if (!is_quantifier) goto NORMAL_CHAR;
3167 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3168 if (*errorcodeptr != 0) goto FAILED;
3169 goto REPEAT;
3170
3171 case '*':
3172 repeat_min = 0;
3173 repeat_max = -1;
3174 goto REPEAT;
3175
3176 case '+':
3177 repeat_min = 1;
3178 repeat_max = -1;
3179 goto REPEAT;
3180
3181 case '?':
3182 repeat_min = 0;
3183 repeat_max = 1;
3184
3185 REPEAT:
3186 if (previous == NULL)
3187 {
3188 *errorcodeptr = ERR9;
3189 goto FAILED;
3190 }
3191
3192 if (repeat_min == 0)
3193 {
3194 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3195 reqbyte = zeroreqbyte; /* Ditto */
3196 }
3197
3198 /* Remember whether this is a variable length repeat */
3199
3200 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3201
3202 op_type = 0; /* Default single-char op codes */
3203 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3204
3205 /* Save start of previous item, in case we have to move it up to make space
3206 for an inserted OP_ONCE for the additional '+' extension. */
3207
3208 tempcode = previous;
3209
3210 /* If the next character is '+', we have a possessive quantifier. This
3211 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3212 If the next character is '?' this is a minimizing repeat, by default,
3213 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3214 repeat type to the non-default. */
3215
3216 if (ptr[1] == '+')
3217 {
3218 repeat_type = 0; /* Force greedy */
3219 possessive_quantifier = TRUE;
3220 ptr++;
3221 }
3222 else if (ptr[1] == '?')
3223 {
3224 repeat_type = greedy_non_default;
3225 ptr++;
3226 }
3227 else repeat_type = greedy_default;
3228
8ac170f3
PH
3229 /* If previous was a character match, abolish the item and generate a
3230 repeat item instead. If a char item has a minumum of more than one, ensure
3231 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3232 the first thing in a branch because the x will have gone into firstbyte
3233 instead. */
3234
3235 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3236 {
3237 /* Deal with UTF-8 characters that take up more than one byte. It's
3238 easier to write this out separately than try to macrify it. Use c to
3239 hold the length of the character in bytes, plus 0x80 to flag that it's a
3240 length rather than a small character. */
3241
3242#ifdef SUPPORT_UTF8
3243 if (utf8 && (code[-1] & 0x80) != 0)
3244 {
3245 uschar *lastchar = code - 1;
3246 while((*lastchar & 0xc0) == 0x80) lastchar--;
3247 c = code - lastchar; /* Length of UTF-8 character */
3248 memcpy(utf8_char, lastchar, c); /* Save the char */
3249 c |= 0x80; /* Flag c as a length */
3250 }
3251 else
3252#endif
3253
3254 /* Handle the case of a single byte - either with no UTF8 support, or
3255 with UTF-8 disabled, or for a UTF-8 character < 128. */
3256
3257 {
3258 c = code[-1];
3259 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3260 }
3261
6bf342e1
PH
3262 /* If the repetition is unlimited, it pays to see if the next thing on
3263 the line is something that cannot possibly match this character. If so,
3264 automatically possessifying this item gains some performance in the case
3265 where the match fails. */
3266
3267 if (!possessive_quantifier &&
3268 repeat_max < 0 &&
3269 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3270 options, cd))
3271 {
3272 repeat_type = 0; /* Force greedy */
3273 possessive_quantifier = TRUE;
3274 }
3275
8ac170f3
PH
3276 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3277 }
3278
3279 /* If previous was a single negated character ([^a] or similar), we use
3280 one of the special opcodes, replacing it. The code is shared with single-
3281 character repeats by setting op_type to add a suitable offset into
6bf342e1
PH
3282 repeat_type. We can also test for auto-possessification. OP_NOT is
3283 currently used only for single-byte chars. */
8ac170f3
PH
3284
3285 else if (*previous == OP_NOT)
3286 {
3287 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3288 c = previous[1];
6bf342e1
PH
3289 if (!possessive_quantifier &&
3290 repeat_max < 0 &&
3291 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3292 {
3293 repeat_type = 0; /* Force greedy */
3294 possessive_quantifier = TRUE;
3295 }
8ac170f3
PH
3296 goto OUTPUT_SINGLE_REPEAT;
3297 }
3298
3299 /* If previous was a character type match (\d or similar), abolish it and
3300 create a suitable repeat item. The code is shared with single-character
3301 repeats by setting op_type to add a suitable offset into repeat_type. Note
3302 that the Unicode property types will be present only when SUPPORT_UCP is
3303 defined, but we don't wrap the little bits of code here because it just
3304 makes it horribly messy. */
3305
3306 else if (*previous < OP_EODN)
3307 {
3308 uschar *oldcode;
aa41d2de 3309 int prop_type, prop_value;
8ac170f3
PH
3310 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3311 c = *previous;
3312
6bf342e1
PH
3313 if (!possessive_quantifier &&
3314 repeat_max < 0 &&
3315 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3316 {
3317 repeat_type = 0; /* Force greedy */
3318 possessive_quantifier = TRUE;
3319 }
3320
8ac170f3 3321 OUTPUT_SINGLE_REPEAT:
aa41d2de
PH
3322 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3323 {
3324 prop_type = previous[1];
3325 prop_value = previous[2];
3326 }
3327 else prop_type = prop_value = -1;
8ac170f3
PH
3328
3329 oldcode = code;
3330 code = previous; /* Usually overwrite previous item */
3331
3332 /* If the maximum is zero then the minimum must also be zero; Perl allows
3333 this case, so we do too - by simply omitting the item altogether. */
3334
3335 if (repeat_max == 0) goto END_REPEAT;
3336
3337 /* All real repeats make it impossible to handle partial matching (maybe
3338 one day we will be able to remove this restriction). */
3339
3340 if (repeat_max != 1) cd->nopartial = TRUE;
3341
3342 /* Combine the op_type with the repeat_type */
3343
3344 repeat_type += op_type;
3345
3346 /* A minimum of zero is handled either as the special case * or ?, or as
3347 an UPTO, with the maximum given. */
3348
3349 if (repeat_min == 0)
3350 {
3351 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3352 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3353 else
3354 {
3355 *code++ = OP_UPTO + repeat_type;
3356 PUT2INC(code, 0, repeat_max);
3357 }
3358 }
3359
3360 /* A repeat minimum of 1 is optimized into some special cases. If the
6bf342e1 3361 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
8ac170f3
PH
3362 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3363 one less than the maximum. */
3364
3365 else if (repeat_min == 1)
3366 {
3367 if (repeat_max == -1)
3368 *code++ = OP_PLUS + repeat_type;
3369 else
3370 {
3371 code = oldcode; /* leave previous item in place */
3372 if (repeat_max == 1) goto END_REPEAT;
3373 *code++ = OP_UPTO + repeat_type;
3374 PUT2INC(code, 0, repeat_max - 1);
3375 }
3376 }
3377
3378 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3379 handled as an EXACT followed by an UPTO. */
3380
3381 else
3382 {
3383 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3384 PUT2INC(code, 0, repeat_min);
3385
3386 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3387 we have to insert the character for the previous code. For a repeated
aa41d2de 3388 Unicode property match, there are two extra bytes that define the
8ac170f3
PH
3389 required property. In UTF-8 mode, long characters have their length in
3390 c, with the 0x80 bit as a flag. */
3391
3392 if (repeat_max < 0)
3393 {
3394#ifdef SUPPORT_UTF8
3395 if (utf8 && c >= 128)
3396 {
3397 memcpy(code, utf8_char, c & 7);
3398 code += c & 7;
3399 }
3400 else
3401#endif
3402 {
3403 *code++ = c;
aa41d2de
PH
3404 if (prop_type >= 0)
3405 {
3406 *code++ = prop_type;
3407 *code++ = prop_value;
3408 }
8ac170f3
PH
3409 }
3410 *code++ = OP_STAR + repeat_type;
3411 }
3412
3413 /* Else insert an UPTO if the max is greater than the min, again
6bf342e1
PH
3414 preceded by the character, for the previously inserted code. If the
3415 UPTO is just for 1 instance, we can use QUERY instead. */
8ac170f3
PH
3416
3417 else if (repeat_max != repeat_min)
3418 {
3419#ifdef SUPPORT_UTF8
3420 if (utf8 && c >= 128)
3421 {
3422 memcpy(code, utf8_char, c & 7);
3423 code += c & 7;
3424 }
3425 else
3426#endif
3427 *code++ = c;
aa41d2de
PH
3428 if (prop_type >= 0)
3429 {
3430 *code++ = prop_type;
3431 *code++ = prop_value;
3432 }
8ac170f3 3433 repeat_max -= repeat_min;
6bf342e1
PH
3434
3435 if (repeat_max == 1)
3436 {
3437 *code++ = OP_QUERY + repeat_type;
3438 }
3439 else
3440 {
3441 *code++ = OP_UPTO + repeat_type;
3442 PUT2INC(code, 0, repeat_max);
3443 }
8ac170f3
PH
3444 }
3445 }
3446
3447 /* The character or character type itself comes last in all cases. */
3448
3449#ifdef SUPPORT_UTF8
3450 if (utf8 && c >= 128)
3451 {
3452 memcpy(code, utf8_char, c & 7);
3453 code += c & 7;
3454 }
3455 else
3456#endif
3457 *code++ = c;
3458
aa41d2de
PH
3459 /* For a repeated Unicode property match, there are two extra bytes that
3460 define the required property. */
8ac170f3
PH
3461
3462#ifdef SUPPORT_UCP
aa41d2de
PH
3463 if (prop_type >= 0)
3464 {
3465 *code++ = prop_type;
3466 *code++ = prop_value;
3467 }
8ac170f3
PH
3468#endif
3469 }
3470
3471 /* If previous was a character class or a back reference, we put the repeat
3472 stuff after it, but just skip the item if the repeat was {0,0}. */
3473
3474 else if (*previous == OP_CLASS ||
3475 *previous == OP_NCLASS ||
3476#ifdef SUPPORT_UTF8
3477 *previous == OP_XCLASS ||
3478#endif
3479 *previous == OP_REF)
3480 {
3481 if (repeat_max == 0)
3482 {
3483 code = previous;
3484 goto END_REPEAT;
3485 }
3486
3487 /* All real repeats make it impossible to handle partial matching (maybe
3488 one day we will be able to remove this restriction). */
3489
3490 if (repeat_max != 1) cd->nopartial = TRUE;
3491
3492 if (repeat_min == 0 && repeat_max == -1)
3493 *code++ = OP_CRSTAR + repeat_type;
3494 else if (repeat_min == 1 && repeat_max == -1)
3495 *code++ = OP_CRPLUS + repeat_type;
3496 else if (repeat_min == 0 && repeat_max == 1)
3497 *code++ = OP_CRQUERY + repeat_type;
3498 else
3499 {
3500 *code++ = OP_CRRANGE + repeat_type;
3501 PUT2INC(code, 0, repeat_min);
3502 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3503 PUT2INC(code, 0, repeat_max);
3504 }
3505 }
3506
3507 /* If previous was a bracket group, we may have to replicate it in certain
3508 cases. */
3509
6bf342e1
PH
3510 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3511 *previous == OP_ONCE || *previous == OP_COND)
8ac170f3
PH
3512 {
3513 register int i;
3514 int ketoffset = 0;
3515 int len = code - previous;
3516 uschar *bralink = NULL;
3517
6bf342e1
PH
3518 /* Repeating a DEFINE group is pointless */
3519
3520 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3521 {
3522 *errorcodeptr = ERR55;
3523 goto FAILED;
3524 }
3525
3526 /* This is a paranoid check to stop integer overflow later on */
3527
3528 if (len > MAX_DUPLENGTH)
3529 {
3530 *errorcodeptr = ERR50;
3531 goto FAILED;
3532 }
3533
8ac170f3
PH
3534 /* If the maximum repeat count is unlimited, find the end of the bracket
3535 by scanning through from the start, and compute the offset back to it
3536 from the current code pointer. There may be an OP_OPT setting following
3537 the final KET, so we can't find the end just by going back from the code
3538 pointer. */
3539
3540 if (repeat_max == -1)
3541 {
3542 register uschar *ket = previous;
3543 do ket += GET(ket, 1); while (*ket != OP_KET);
3544 ketoffset = code - ket;
3545 }
3546
3547 /* The case of a zero minimum is special because of the need to stick
3548 OP_BRAZERO in front of it, and because the group appears once in the
3549 data, whereas in other cases it appears the minimum number of times. For
3550 this reason, it is simplest to treat this case separately, as otherwise
3551 the code gets far too messy. There are several special subcases when the
3552 minimum is zero. */
3553
3554 if (repeat_min == 0)
3555 {
3556 /* If the maximum is also zero, we just omit the group from the output
3557 altogether. */
3558
3559 if (repeat_max == 0)
3560 {
3561 code = previous;
3562 goto END_REPEAT;
3563 }
3564
3565 /* If the maximum is 1 or unlimited, we just have to stick in the
3566 BRAZERO and do no more at this point. However, we do need to adjust
3567 any OP_RECURSE calls inside the group that refer to the group itself or
6bf342e1
PH
3568 any internal or forward referenced group, because the offset is from
3569 the start of the whole regex. Temporarily terminate the pattern while
3570 doing this. */
8ac170f3
PH
3571
3572 if (repeat_max <= 1)
3573 {
3574 *code = OP_END;
6bf342e1 3575 adjust_recurse(previous, 1, utf8, cd, save_hwm);
8ac170f3
PH
3576 memmove(previous+1, previous, len);
3577 code++;
3578 *previous++ = OP_BRAZERO + repeat_type;
3579 }
3580
3581 /* If the maximum is greater than 1 and limited, we have to replicate
3582 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3583 The first one has to be handled carefully because it's the original
3584 copy, which has to be moved up. The remainder can be handled by code
3585 that is common with the non-zero minimum case below. We have to
3586 adjust the value of repeat_max, since one less copy is required. Once
3587 again, we may have to adjust any OP_RECURSE calls inside the group. */
3588
3589 else
3590 {
3591 int offset;
3592 *code = OP_END;
6bf342e1 3593 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
8ac170f3
PH
3594 memmove(previous + 2 + LINK_SIZE, previous, len);
3595 code += 2 + LINK_SIZE;
3596 *previous++ = OP_BRAZERO + repeat_type;
3597 *previous++ = OP_BRA;
3598
3599 /* We chain together the bracket offset fields that have to be
3600 filled in later when the ends of the brackets are reached. */
3601
3602 offset = (bralink == NULL)? 0 : previous - bralink;
3603 bralink = previous;
3604 PUTINC(previous, 0, offset);
3605 }
3606
3607 repeat_max--;
3608 }
3609
3610 /* If the minimum is greater than zero, replicate the group as many
3611 times as necessary, and adjust the maximum to the number of subsequent
3612 copies that we need. If we set a first char from the group, and didn't
6bf342e1
PH
3613 set a required char, copy the latter from the former. If there are any
3614 forward reference subroutine calls in the group, there will be entries on
3615 the workspace list; replicate these with an appropriate increment. */
8ac170f3
PH
3616
3617 else
3618 {
3619 if (repeat_min > 1)
3620 {
6bf342e1
PH
3621 /* In the pre-compile phase, we don't actually do the replication. We
3622 just adjust the length as if we had. */
3623
3624 if (lengthptr != NULL)
3625 *lengthptr += (repeat_min - 1)*length_prevgroup;
3626
3627 /* This is compiling for real */
3628
3629 else
8ac170f3 3630 {
6bf342e1
PH
3631 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3632 for (i = 1; i < repeat_min; i++)
3633 {
3634 uschar *hc;
3635 uschar *this_hwm = cd->hwm;
3636 memcpy(code, previous, len);
3637 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3638 {
3639 PUT(cd->hwm, 0, GET(hc, 0) + len);
3640 cd->hwm += LINK_SIZE;
3641 }
3642 save_hwm = this_hwm;
3643 code += len;
3644 }
8ac170f3
PH
3645 }
3646 }
6bf342e1 3647
8ac170f3
PH
3648 if (repeat_max > 0) repeat_max -= repeat_min;
3649 }
3650
3651 /* This code is common to both the zero and non-zero minimum cases. If
3652 the maximum is limited, it replicates the group in a nested fashion,
3653 remembering the bracket starts on a stack. In the case of a zero minimum,
3654 the first one was set up above. In all cases the repeat_max now specifies
6bf342e1
PH
3655 the number of additional copies needed. Again, we must remember to
3656 replicate entries on the forward reference list. */
8ac170f3
PH
3657
3658 if (repeat_max >= 0)
3659 {
6bf342e1
PH
3660 /* In the pre-compile phase, we don't actually do the replication. We
3661 just adjust the length as if we had. For each repetition we must add 1
3662 to the length for BRAZERO and for all but the last repetition we must
3663 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3664
3665 if (lengthptr != NULL && repeat_max > 0)
3666 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3667 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3668
3669 /* This is compiling for real */
3670
3671 else for (i = repeat_max - 1; i >= 0; i--)
8ac170f3 3672 {
6bf342e1
PH
3673 uschar *hc;
3674 uschar *this_hwm = cd->hwm;
3675
8ac170f3
PH
3676 *code++ = OP_BRAZERO + repeat_type;
3677
3678 /* All but the final copy start a new nesting, maintaining the
3679 chain of brackets outstanding. */
3680
3681 if (i != 0)
3682 {
3683 int offset;
3684 *code++ = OP_BRA;
3685 offset = (bralink == NULL)? 0 : code - bralink;
3686 bralink = code;
3687 PUTINC(code, 0, offset);
3688 }
3689
3690 memcpy(code, previous, len);
6bf342e1
PH
3691 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3692 {
3693 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3694 cd->hwm += LINK_SIZE;
3695 }
3696 save_hwm = this_hwm;
8ac170f3
PH
3697 code += len;
3698 }
3699
3700 /* Now chain through the pending brackets, and fill in their length
3701 fields (which are holding the chain links pro tem). */
3702
3703 while (bralink != NULL)
3704 {
3705 int oldlinkoffset;
3706 int offset = code - bralink + 1;
3707 uschar *bra = code - offset;
3708 oldlinkoffset = GET(bra, 1);
3709 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3710 *code++ = OP_KET;
3711 PUTINC(code, 0, offset);
3712 PUT(bra, 1, offset);
3713 }
3714 }
3715
3716 /* If the maximum is unlimited, set a repeater in the final copy. We
3717 can't just offset backwards from the current code point, because we
3718 don't know if there's been an options resetting after the ket. The
6bf342e1 3719 correct offset was computed above.
8ac170f3 3720
6bf342e1
PH
3721 Then, when we are doing the actual compile phase, check to see whether
3722 this group is a non-atomic one that could match an empty string. If so,
3723 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3724 that runtime checking can be done. [This check is also applied to
3725 atomic groups at runtime, but in a different way.] */
3726
3727 else
3728 {
3729 uschar *ketcode = code - ketoffset;
3730 uschar *bracode = ketcode - GET(ketcode, 1);
3731 *ketcode = OP_KETRMAX + repeat_type;
3732 if (lengthptr == NULL && *bracode != OP_ONCE)
3733 {
3734 uschar *scode = bracode;
3735 do
3736 {
3737 if (could_be_empty_branch(scode, ketcode, utf8))
3738 {
3739 *bracode += OP_SBRA - OP_BRA;
3740 break;
3741 }
3742 scode += GET(scode, 1);
3743 }
3744 while (*scode == OP_ALT);
3745 }
3746 }
8ac170f3
PH
3747 }
3748
3749 /* Else there's some kind of shambles */
3750
3751 else
3752 {
3753 *errorcodeptr = ERR11;
3754 goto FAILED;
3755 }
3756
6bf342e1
PH
3757 /* If the character following a repeat is '+', or if certain optimization
3758 tests above succeeded, possessive_quantifier is TRUE. For some of the
3759 simpler opcodes, there is a special alternative opcode for this. For
3760 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3761 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3762 but the special opcodes can optimize it a bit. The repeated item starts at
3763 tempcode, not at previous, which might be the first part of a string whose
3764 (former) last char we repeated.
3765
3766 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3767 an 'upto' may follow. We skip over an 'exact' item, and then test the
3768 length of what remains before proceeding. */
8ac170f3
PH
3769
3770 if (possessive_quantifier)
3771 {
6bf342e1
PH
3772 int len;
3773 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3774 *tempcode == OP_NOTEXACT)
3775 tempcode += _pcre_OP_lengths[*tempcode];
3776 len = code - tempcode;
3777 if (len > 0) switch (*tempcode)
3778 {
3779 case OP_STAR: *tempcode = OP_POSSTAR; break;
3780 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3781 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3782 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3783
3784 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3785 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3786 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3787 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3788
3789 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3790 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3791 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3792 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3793
3794 default:
3795 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3796 code += 1 + LINK_SIZE;
3797 len += 1 + LINK_SIZE;
3798 tempcode[0] = OP_ONCE;
3799 *code++ = OP_KET;
3800 PUTINC(code, 0, len);
3801 PUT(tempcode, 1, len);
3802 break;
3803 }
8ac170f3
PH
3804 }
3805
3806 /* In all cases we no longer have a previous item. We also set the
3807 "follows varying string" flag for subsequently encountered reqbytes if
3808 it isn't already set and we have just passed a varying length item. */
3809
3810 END_REPEAT:
3811 previous = NULL;
3812 cd->req_varyopt |= reqvary;
3813 break;
3814
3815
6bf342e1
PH
3816 /* ===================================================================*/
3817 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3818 lookbehind or option setting or condition or all the other extended
3819 parenthesis forms. First deal with the specials; all are introduced by ?,
3820 and the appearance of any of them means that this is not a capturing
3821 group. */
8ac170f3
PH
3822
3823 case '(':
3824 newoptions = options;
3825 skipbytes = 0;
6bf342e1
PH
3826 bravalue = OP_CBRA;
3827 save_hwm = cd->hwm;
64f2600a 3828 reset_bracount = FALSE;
8ac170f3
PH
3829
3830 if (*(++ptr) == '?')
3831 {
6bf342e1 3832 int i, set, unset, namelen;
8ac170f3 3833 int *optset;
6bf342e1
PH
3834 const uschar *name;
3835 uschar *slot;
8ac170f3
PH
3836
3837 switch (*(++ptr))
3838 {
3839 case '#': /* Comment; skip to ket */
3840 ptr++;
6bf342e1
PH
3841 while (*ptr != 0 && *ptr != ')') ptr++;
3842 if (*ptr == 0)
3843 {
3844 *errorcodeptr = ERR18;
3845 goto FAILED;
3846 }
8ac170f3
PH
3847 continue;
3848
6bf342e1 3849
64f2600a
PH
3850 /* ------------------------------------------------------------ */
3851 case '|': /* Reset capture count for each branch */
3852 reset_bracount = TRUE;
3853 /* Fall through */
3854
6bf342e1
PH
3855 /* ------------------------------------------------------------ */
3856 case ':': /* Non-capturing bracket */
8ac170f3
PH
3857 bravalue = OP_BRA;
3858 ptr++;
3859 break;
3860
6bf342e1
PH
3861
3862 /* ------------------------------------------------------------ */
8ac170f3
PH
3863 case '(':
3864 bravalue = OP_COND; /* Conditional group */
3865
6bf342e1
PH
3866 /* A condition can be an assertion, a number (referring to a numbered
3867 group), a name (referring to a named group), or 'R', referring to
3868 recursion. R<digits> and R&name are also permitted for recursion tests.
3869
3870 There are several syntaxes for testing a named group: (?(name)) is used
3871 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3872
3873 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3874 be the recursive thing or the name 'R' (and similarly for 'R' followed
3875 by digits), and (b) a number could be a name that consists of digits.
3876 In both cases, we look for a name first; if not found, we try the other
3877 cases. */
3878
3879 /* For conditions that are assertions, check the syntax, and then exit
3880 the switch. This will take control down to where bracketed groups,
3881 including assertions, are processed. */
3882
3883 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3884 break;
3885
3886 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3887 below), and all need to skip 3 bytes at the start of the group. */
3888
3889 code[1+LINK_SIZE] = OP_CREF;
3890 skipbytes = 3;
64f2600a 3891 refsign = -1;
6bf342e1
PH
3892
3893 /* Check for a test for recursion in a named group. */
3894
3895 if (ptr[1] == 'R' && ptr[2] == '&')
8ac170f3 3896 {
6bf342e1
PH
3897 terminator = -1;
3898 ptr += 2;
3899 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3900 }
aa41d2de 3901
6bf342e1
PH
3902 /* Check for a test for a named group's having been set, using the Perl
3903 syntax (?(<name>) or (?('name') */
aa41d2de 3904
6bf342e1
PH
3905 else if (ptr[1] == '<')
3906 {
3907 terminator = '>';
3908 ptr++;
3909 }
3910 else if (ptr[1] == '\'')
3911 {
3912 terminator = '\'';
3913 ptr++;
3914 }
64f2600a
PH
3915 else
3916 {
3917 terminator = 0;
3918 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3919 }
8ac170f3 3920
6bf342e1 3921 /* We now expect to read a name; any thing else is an error */
8ac170f3 3922
6bf342e1
PH
3923 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3924 {
3925 ptr += 1; /* To get the right offset */
3926 *errorcodeptr = ERR28;
3927 goto FAILED;
3928 }
3929
3930 /* Read the name, but also get it as a number if it's all digits */
3931
3932 recno = 0;
3933 name = ++ptr;
3934 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3935 {
3936 if (recno >= 0)
3937 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3938 recno * 10 + *ptr - '0' : -1;
8ac170f3 3939 ptr++;
6bf342e1
PH
3940 }
3941 namelen = ptr - name;
aa41d2de 3942
6bf342e1
PH
3943 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3944 {
3945 ptr--; /* Error offset */
3946 *errorcodeptr = ERR26;
3947 goto FAILED;
3948 }
aa41d2de 3949
6bf342e1 3950 /* Do no further checking in the pre-compile phase. */
aa41d2de 3951
6bf342e1 3952 if (lengthptr != NULL) break;
aa41d2de 3953
6bf342e1 3954 /* In the real compile we do the work of looking for the actual
64f2600a
PH
3955 reference. If the string started with "+" or "-" we require the rest to
3956 be digits, in which case recno will be set. */
3957
3958 if (refsign > 0)
3959 {
3960 if (recno <= 0)
3961 {
3962 *errorcodeptr = ERR58;
3963 goto FAILED;
3964 }
3965 if (refsign == '-')
3966 {
3967 recno = cd->bracount - recno + 1;
3968 if (recno <= 0)
3969 {
3970 *errorcodeptr = ERR15;
3971 goto FAILED;
3972 }
3973 }
3974 else recno += cd->bracount;
3975 PUT2(code, 2+LINK_SIZE, recno);
3976 break;
3977 }
3978
3979 /* Otherwise (did not start with "+" or "-"), start by looking for the
3980 name. */
aa41d2de 3981
6bf342e1
PH
3982 slot = cd->name_table;
3983 for (i = 0; i < cd->names_found; i++)
3984 {
3985 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3986 slot += cd->name_entry_size;
3987 }
aa41d2de 3988
6bf342e1 3989 /* Found a previous named subpattern */
aa41d2de 3990
6bf342e1
PH
3991 if (i < cd->names_found)
3992 {
3993 recno = GET2(slot, 0);
3994 PUT2(code, 2+LINK_SIZE, recno);
3995 }
aa41d2de 3996
6bf342e1 3997 /* Search the pattern for a forward reference */
aa41d2de 3998
6bf342e1
PH
3999 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4000 (options & PCRE_EXTENDED) != 0)) > 0)
4001 {
4002 PUT2(code, 2+LINK_SIZE, i);
4003 }
aa41d2de 4004
6bf342e1
PH
4005 /* If terminator == 0 it means that the name followed directly after
4006 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4007 some further alternatives to try. For the cases where terminator != 0
4008 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4009 now checked all the possibilities, so give an error. */
aa41d2de 4010
6bf342e1
PH
4011 else if (terminator != 0)
4012 {
4013 *errorcodeptr = ERR15;
4014 goto FAILED;
4015 }
4016
4017 /* Check for (?(R) for recursion. Allow digits after R to specify a
4018 specific group number. */
4019
4020 else if (*name == 'R')
4021 {
4022 recno = 0;
4023 for (i = 1; i < namelen; i++)
aa41d2de 4024 {
6bf342e1
PH
4025 if ((digitab[name[i]] & ctype_digit) == 0)
4026 {
4027 *errorcodeptr = ERR15;
4028 goto FAILED;
4029 }
4030 recno = recno * 10 + name[i] - '0';
aa41d2de 4031 }
6bf342e1
PH
4032 if (recno == 0) recno = RREF_ANY;
4033 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4034 PUT2(code, 2+LINK_SIZE, recno);
4035 }
4036
4037 /* Similarly, check for the (?(DEFINE) "condition", which is always
4038 false. */
4039
4040 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4041 {
4042 code[1+LINK_SIZE] = OP_DEF;
4043 skipbytes = 1;
4044 }
4045
4046 /* Check for the "name" actually being a subpattern number. */
4047
4048 else if (recno > 0)
4049 {
4050 PUT2(code, 2+LINK_SIZE, recno);
8ac170f3 4051 }
aa41d2de 4052
6bf342e1 4053 /* Either an unidentified subpattern, or a reference to (?(0) */
aa41d2de 4054
6bf342e1
PH
4055 else
4056 {
4057 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4058 goto FAILED;
4059 }
8ac170f3
PH
4060 break;
4061
6bf342e1
PH
4062
4063 /* ------------------------------------------------------------ */
8ac170f3
PH
4064 case '=': /* Positive lookahead */
4065 bravalue = OP_ASSERT;
4066 ptr++;
4067 break;
4068
6bf342e1
PH
4069
4070 /* ------------------------------------------------------------ */
8ac170f3
PH
4071 case '!': /* Negative lookahead */
4072 bravalue = OP_ASSERT_NOT;
4073 ptr++;
4074 break;
4075
6bf342e1
PH
4076
4077 /* ------------------------------------------------------------ */
4078 case '<': /* Lookbehind or named define */
4079 switch (ptr[1])
8ac170f3
PH
4080 {
4081 case '=': /* Positive lookbehind */
4082 bravalue = OP_ASSERTBACK;
6bf342e1 4083 ptr += 2;
8ac170f3
PH
4084 break;
4085
4086 case '!': /* Negative lookbehind */
4087 bravalue = OP_ASSERTBACK_NOT;
6bf342e1 4088 ptr += 2;
8ac170f3 4089 break;
6bf342e1
PH
4090
4091 default: /* Could be name define, else bad */
4092 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4093 ptr++; /* Correct offset for error */
4094 *errorcodeptr = ERR24;
4095 goto FAILED;
8ac170f3
PH
4096 }
4097 break;
4098
6bf342e1
PH
4099
4100 /* ------------------------------------------------------------ */
8ac170f3
PH
4101 case '>': /* One-time brackets */
4102 bravalue = OP_ONCE;
4103 ptr++;
4104 break;
4105
6bf342e1
PH
4106
4107 /* ------------------------------------------------------------ */
8ac170f3
PH
4108 case 'C': /* Callout - may be followed by digits; */
4109 previous_callout = code; /* Save for later completion */
4110 after_manual_callout = 1; /* Skip one item before completing */
6bf342e1
PH
4111 *code++ = OP_CALLOUT;
4112 {
8ac170f3
PH
4113 int n = 0;
4114 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4115 n = n * 10 + *ptr - '0';
6bf342e1
PH
4116 if (*ptr != ')')
4117 {
4118 *errorcodeptr = ERR39;
4119 goto FAILED;
4120 }
8ac170f3
PH
4121 if (n > 255)
4122 {
4123 *errorcodeptr = ERR38;
4124 goto FAILED;
4125 }
4126 *code++ = n;
4127 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4128 PUT(code, LINK_SIZE, 0); /* Default length */
4129 code += 2 * LINK_SIZE;
4130 }
4131 previous = NULL;
4132 continue;
4133
6bf342e1
PH
4134
4135 /* ------------------------------------------------------------ */
4136 case 'P': /* Python-style named subpattern handling */
4137 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4138 {
4139 is_recurse = *ptr == '>';
4140 terminator = ')';
4141 goto NAMED_REF_OR_RECURSE;
4142 }
4143 else if (*ptr != '<') /* Test for Python-style definition */
8ac170f3 4144 {
6bf342e1
PH
4145 *errorcodeptr = ERR41;
4146 goto FAILED;
4147 }
4148 /* Fall through to handle (?P< as (?< is handled */
8ac170f3 4149
8ac170f3 4150
6bf342e1
PH
4151 /* ------------------------------------------------------------ */
4152 DEFINE_NAME: /* Come here from (?< handling */
4153 case '\'':
4154 {
4155 terminator = (*ptr == '<')? '>' : '\'';
4156 name = ++ptr;
4157
4158 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4159 namelen = ptr - name;
4160
4161 /* In the pre-compile phase, just do a syntax check. */
4162
4163 if (lengthptr != NULL)
8ac170f3 4164 {
6bf342e1
PH
4165 if (*ptr != terminator)
4166 {
4167 *errorcodeptr = ERR42;
4168 goto FAILED;
4169 }
4170 if (cd->names_found >= MAX_NAME_COUNT)
4171 {
4172 *errorcodeptr = ERR49;
4173 goto FAILED;
4174 }
4175 if (namelen + 3 > cd->name_entry_size)
8ac170f3 4176 {
6bf342e1
PH
4177 cd->name_entry_size = namelen + 3;
4178 if (namelen > MAX_NAME_SIZE)
8ac170f3 4179 {
6bf342e1
PH
4180 *errorcodeptr = ERR48;
4181 goto FAILED;
8ac170f3 4182 }
8ac170f3 4183 }
6bf342e1
PH
4184 }
4185
4186 /* In the real compile, create the entry in the table */
4187
4188 else
4189 {
4190 slot = cd->name_table;
4191 for (i = 0; i < cd->names_found; i++)
8ac170f3 4192 {
6bf342e1
PH
4193 int crc = memcmp(name, slot+2, namelen);
4194 if (crc == 0)
4195 {
4196 if (slot[2+namelen] == 0)
4197 {
4198 if ((options & PCRE_DUPNAMES) == 0)
4199 {
4200 *errorcodeptr = ERR43;
4201 goto FAILED;
4202 }
4203 }
4204 else crc = -1; /* Current name is substring */
4205 }
4206 if (crc < 0)
4207 {
4208 memmove(slot + cd->name_entry_size, slot,
4209 (cd->names_found - i) * cd->name_entry_size);
4210 break;
4211 }
4212 slot += cd->name_entry_size;
8ac170f3 4213 }
8ac170f3 4214
6bf342e1
PH
4215 PUT2(slot, 0, cd->bracount + 1);
4216 memcpy(slot + 2, name, namelen);
4217 slot[2+namelen] = 0;
4218 }
8ac170f3
PH
4219 }
4220
6bf342e1 4221 /* In both cases, count the number of names we've encountered. */
8ac170f3 4222
6bf342e1
PH
4223 ptr++; /* Move past > or ' */
4224 cd->names_found++;
4225 goto NUMBERED_GROUP;
8ac170f3 4226
6bf342e1
PH
4227
4228 /* ------------------------------------------------------------ */
4229 case '&': /* Perl recursion/subroutine syntax */
4230 terminator = ')';
4231 is_recurse = TRUE;
4232 /* Fall through */
4233
4234 /* We come here from the Python syntax above that handles both
4235 references (?P=name) and recursion (?P>name), as well as falling
4236 through from the Perl recursion syntax (?&name). */
4237
4238 NAMED_REF_OR_RECURSE:
4239 name = ++ptr;
4240 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4241 namelen = ptr - name;
4242
4243 /* In the pre-compile phase, do a syntax check and set a dummy
4244 reference number. */
4245
4246 if (lengthptr != NULL)
4247 {
4248 if (*ptr != terminator)
4249 {
4250 *errorcodeptr = ERR42;
4251 goto FAILED;
4252 }
4253 if (namelen > MAX_NAME_SIZE)
4254 {
4255 *errorcodeptr = ERR48;
4256 goto FAILED;
4257 }
4258 recno = 0;
4259 }
4260
4261 /* In the real compile, seek the name in the table */
4262
4263 else
4264 {
4265 slot = cd->name_table;
4266 for (i = 0; i < cd->names_found; i++)
4267 {
4268 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4269 slot += cd->name_entry_size;
4270 }
aa41d2de
PH
4271
4272 if (i < cd->names_found) /* Back reference */
4273 {
4274 recno = GET2(slot, 0);
4275 }
4276 else if ((recno = /* Forward back reference */
6bf342e1
PH
4277 find_parens(ptr, cd->bracount, name, namelen,
4278 (options & PCRE_EXTENDED) != 0)) <= 0)
8ac170f3
PH
4279 {
4280 *errorcodeptr = ERR15;
4281 goto FAILED;
4282 }
6bf342e1 4283 }
8ac170f3 4284
6bf342e1
PH
4285 /* In both phases, we can now go to the code that handles numerical
4286 recursion or backreferences. */
8ac170f3 4287
6bf342e1
PH
4288 if (is_recurse) goto HANDLE_RECURSION;
4289 else goto HANDLE_REFERENCE;
8ac170f3 4290
8ac170f3 4291
6bf342e1
PH
4292 /* ------------------------------------------------------------ */
4293 case 'R': /* Recursion */
8ac170f3
PH
4294 ptr++; /* Same as (?0) */
4295 /* Fall through */
4296
8ac170f3 4297
6bf342e1 4298 /* ------------------------------------------------------------ */
64f2600a 4299 case '-': case '+':
6bf342e1
PH
4300 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4301 case '5': case '6': case '7': case '8': case '9': /* subroutine */
8ac170f3
PH
4302 {
4303 const uschar *called;
64f2600a
PH
4304
4305 if ((refsign = *ptr) == '+') ptr++;
4306 else if (refsign == '-')
4307 {
4308 if ((digitab[ptr[1]] & ctype_digit) == 0)
4309 goto OTHER_CHAR_AFTER_QUERY;
4310 ptr++;
4311 }
4312
8ac170f3
PH
4313 recno = 0;
4314 while((digitab[*ptr] & ctype_digit) != 0)
4315 recno = recno * 10 + *ptr++ - '0';
64f2600a 4316
6bf342e1
PH
4317 if (*ptr != ')')
4318 {
4319 *errorcodeptr = ERR29;
4320 goto FAILED;
4321 }
8ac170f3 4322
64f2600a
PH
4323 if (refsign == '-')
4324 {
4325 if (recno == 0)
4326 {
4327 *errorcodeptr = ERR58;
4328 goto FAILED;
4329 }
4330 recno = cd->bracount - recno + 1;
4331 if (recno <= 0)
4332 {
4333 *errorcodeptr = ERR15;
4334 goto FAILED;
4335 }
4336 }
4337 else if (refsign == '+')
4338 {
4339 if (recno == 0)
4340 {
4341 *errorcodeptr = ERR58;
4342 goto FAILED;
4343 }
4344 recno += cd->bracount;
4345 }
4346
8ac170f3
PH
4347 /* Come here from code above that handles a named recursion */
4348
4349 HANDLE_RECURSION:
4350
4351 previous = code;
6bf342e1 4352 called = cd->start_code;
8ac170f3 4353
6bf342e1
PH
4354 /* When we are actually compiling, find the bracket that is being
4355 referenced. Temporarily end the regex in case it doesn't exist before
4356 this point. If we end up with a forward reference, first check that
4357 the bracket does occur later so we can give the error (and position)
4358 now. Then remember this forward reference in the workspace so it can
4359 be filled in at the end. */
8ac170f3 4360
6bf342e1 4361 if (lengthptr == NULL)
8ac170f3 4362 {
6bf342e1
PH
4363 *code = OP_END;
4364 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
8ac170f3 4365
6bf342e1 4366 /* Forward reference */
8ac170f3 4367
6bf342e1
PH
4368 if (called == NULL)
4369 {
4370 if (find_parens(ptr, cd->bracount, NULL, recno,
4371 (options & PCRE_EXTENDED) != 0) < 0)
4372 {
4373 *errorcodeptr = ERR15;
4374 goto FAILED;
4375 }
4376 called = cd->start_code + recno;
4377 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4378 }
4379
4380 /* If not a forward reference, and the subpattern is still open,
4381 this is a recursive call. We check to see if this is a left
4382 recursion that could loop for ever, and diagnose that case. */
4383
4384 else if (GET(called, 1) == 0 &&
4385 could_be_empty(called, code, bcptr, utf8))
4386 {
4387 *errorcodeptr = ERR40;
4388 goto FAILED;
4389 }
8ac170f3
PH
4390 }
4391
aa41d2de 4392 /* Insert the recursion/subroutine item, automatically wrapped inside
6bf342e1
PH
4393 "once" brackets. Set up a "previous group" length so that a
4394 subsequent quantifier will work. */
aa41d2de
PH
4395
4396 *code = OP_ONCE;
4397 PUT(code, 1, 2 + 2*LINK_SIZE);
4398 code += 1 + LINK_SIZE;
8ac170f3
PH
4399
4400 *code = OP_RECURSE;
4401 PUT(code, 1, called - cd->start_code);
4402 code += 1 + LINK_SIZE;
aa41d2de
PH
4403
4404 *code = OP_KET;
4405 PUT(code, 1, 2 + 2*LINK_SIZE);
4406 code += 1 + LINK_SIZE;
6bf342e1
PH
4407
4408 length_prevgroup = 3 + 3*LINK_SIZE;
8ac170f3 4409 }
6bf342e1
PH
4410
4411 /* Can't determine a first byte now */
4412
4413 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
8ac170f3
PH
4414 continue;
4415
8ac170f3 4416
6bf342e1
PH
4417 /* ------------------------------------------------------------ */
4418 default: /* Other characters: check option setting */
64f2600a 4419 OTHER_CHAR_AFTER_QUERY:
8ac170f3
PH
4420 set = unset = 0;
4421 optset = &set;
4422
4423 while (*ptr != ')' && *ptr != ':')
4424 {
4425 switch (*ptr++)
4426 {
4427 case '-': optset = &unset; break;
4428
6bf342e1
PH
4429 case 'J': /* Record that it changed in the external options */
4430 *optset |= PCRE_DUPNAMES;
4431 cd->external_options |= PCRE_JCHANGED;
4432 break;
4433
8ac170f3
PH
4434 case 'i': *optset |= PCRE_CASELESS; break;
4435 case 'm': *optset |= PCRE_MULTILINE; break;
4436 case 's': *optset |= PCRE_DOTALL; break;
4437 case 'x': *optset |= PCRE_EXTENDED; break;
4438 case 'U': *optset |= PCRE_UNGREEDY; break;
4439 case 'X': *optset |= PCRE_EXTRA; break;
6bf342e1
PH
4440
4441 default: *errorcodeptr = ERR12;
4442 ptr--; /* Correct the offset */
4443 goto FAILED;
8ac170f3
PH
4444 }
4445 }
4446
4447 /* Set up the changed option bits, but don't change anything yet. */
4448
4449 newoptions = (options | set) & (~unset);
4450
4451 /* If the options ended with ')' this is not the start of a nested
6bf342e1
PH
4452 group with option changes, so the options change at this level. If this
4453 item is right at the start of the pattern, the options can be
4454 abstracted and made external in the pre-compile phase, and ignored in
4455 the compile phase. This can be helpful when matching -- for instance in
4456 caseless checking of required bytes.
4457
4458 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4459 definitely *not* at the start of the pattern because something has been
4460 compiled. In the pre-compile phase, however, the code pointer can have
4461 that value after the start, because it gets reset as code is discarded
4462 during the pre-compile. However, this can happen only at top level - if
4463 we are within parentheses, the starting BRA will still be present. At
4464 any parenthesis level, the length value can be used to test if anything
4465 has been compiled at that level. Thus, a test for both these conditions
4466 is necessary to ensure we correctly detect the start of the pattern in
4467 both phases.
4468
4469 If we are not at the pattern start, compile code to change the ims
4470 options if this setting actually changes any of them. We also pass the
4471 new setting back so that it can be put at the start of any following
4472 branches, and when this group ends (if we are in a group), a resetting
4473 item can be compiled. */
8ac170f3
PH
4474
4475 if (*ptr == ')')
4476 {
6bf342e1
PH
4477 if (code == cd->start_code + 1 + LINK_SIZE &&
4478 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
8ac170f3 4479 {
6bf342e1
PH
4480 cd->external_options = newoptions;
4481 options = newoptions;
8ac170f3 4482 }
6bf342e1
PH
4483 else
4484 {
4485 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4486 {
4487 *code++ = OP_OPT;
4488 *code++ = newoptions & PCRE_IMS;
4489 }
8ac170f3 4490
6bf342e1
PH
4491 /* Change options at this level, and pass them back for use
4492 in subsequent branches. Reset the greedy defaults and the case
4493 value for firstbyte and reqbyte. */
8ac170f3 4494
6bf342e1
PH
4495 *optionsptr = options = newoptions;
4496 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4497 greedy_non_default = greedy_default ^ 1;
4498 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4499 }
8ac170f3
PH
4500
4501 previous = NULL; /* This item can't be repeated */
4502 continue; /* It is complete */
4503 }
4504
4505 /* If the options ended with ':' we are heading into a nested group
4506 with possible change of options. Such groups are non-capturing and are
4507 not assertions of any kind. All we need to do is skip over the ':';
4508 the newoptions value is handled below. */
4509
4510 bravalue = OP_BRA;
4511 ptr++;
6bf342e1
PH
4512 } /* End of switch for character following (? */
4513 } /* End of (? handling */
8ac170f3 4514
6bf342e1
PH
4515 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4516 all unadorned brackets become non-capturing and behave like (?:...)
4517 brackets. */
8ac170f3
PH
4518
4519 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4520 {
4521 bravalue = OP_BRA;
4522 }
4523
6bf342e1 4524 /* Else we have a capturing group. */
8ac170f3
PH
4525
4526 else
4527 {
4528 NUMBERED_GROUP:
6bf342e1
PH
4529 cd->bracount += 1;
4530 PUT2(code, 1+LINK_SIZE, cd->bracount);
4531 skipbytes = 2;
8ac170f3
PH
4532 }
4533
6bf342e1
PH
4534 /* Process nested bracketed regex. Assertions may not be repeated, but
4535 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4536 non-register variable in order to be able to pass its address because some
4537 compilers complain otherwise. Pass in a new setting for the ims options if
4538 they have changed. */
8ac170f3
PH
4539
4540 previous = (bravalue >= OP_ONCE)? code : NULL;
4541 *code = bravalue;
4542 tempcode = code;
4543 tempreqvary = cd->req_varyopt; /* Save value before bracket */
6bf342e1 4544 length_prevgroup = 0; /* Initialize for pre-compile phase */
8ac170f3
PH
4545
4546 if (!compile_regex(
4547 newoptions, /* The complete new option state */
4548 options & PCRE_IMS, /* The previous ims option state */
8ac170f3
PH
4549 &tempcode, /* Where to put code (updated) */
4550 &ptr, /* Input pointer (updated) */
4551 errorcodeptr, /* Where to put an error message */
4552 (bravalue == OP_ASSERTBACK ||
4553 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
64f2600a 4554 reset_bracount, /* True if (?| group */
6bf342e1 4555 skipbytes, /* Skip over bracket number */
8ac170f3
PH
4556 &subfirstbyte, /* For possible first char */
4557 &subreqbyte, /* For possible last char */
4558 bcptr, /* Current branch chain */
6bf342e1
PH
4559 cd, /* Tables block */
4560 (lengthptr == NULL)? NULL : /* Actual compile phase */
4561 &length_prevgroup /* Pre-compile phase */
4562 ))
8ac170f3
PH
4563 goto FAILED;
4564
4565 /* At the end of compiling, code is still pointing to the start of the
4566 group, while tempcode has been updated to point past the end of the group
4567 and any option resetting that may follow it. The pattern pointer (ptr)
4568 is on the bracket. */
4569
4570 /* If this is a conditional bracket, check that there are no more than
64f2600a
PH
4571 two branches in the group, or just one if it's a DEFINE group. We do this
4572 in the real compile phase, not in the pre-pass, where the whole group may
4573 not be available. */
8ac170f3 4574
64f2600a 4575 if (bravalue == OP_COND && lengthptr == NULL)
8ac170f3
PH
4576 {
4577 uschar *tc = code;
aa41d2de 4578 int condcount = 0;
8ac170f3
PH
4579
4580 do {
4581 condcount++;
4582 tc += GET(tc,1);
4583 }
4584 while (*tc != OP_KET);
4585
6bf342e1
PH
4586 /* A DEFINE group is never obeyed inline (the "condition" is always
4587 false). It must have only one branch. */
4588
4589 if (code[LINK_SIZE+1] == OP_DEF)
8ac170f3 4590 {
6bf342e1
PH
4591 if (condcount > 1)
4592 {
4593 *errorcodeptr = ERR54;
4594 goto FAILED;
4595 }
4596 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4597 }
4598
4599 /* A "normal" conditional group. If there is just one branch, we must not
4600 make use of its firstbyte or reqbyte, because this is equivalent to an
4601 empty second branch. */
4602
4603 else
4604 {
4605 if (condcount > 2)
4606 {
4607 *errorcodeptr = ERR27;
4608 goto FAILED;
4609 }
4610 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
8ac170f3 4611 }
6bf342e1 4612 }
8ac170f3 4613
6bf342e1
PH
4614 /* Error if hit end of pattern */
4615
4616 if (*ptr != ')')
4617 {
4618 *errorcodeptr = ERR14;
4619 goto FAILED;
4620 }
4621
4622 /* In the pre-compile phase, update the length by the length of the nested
4623 group, less the brackets at either end. Then reduce the compiled code to
4624 just the brackets so that it doesn't use much memory if it is duplicated by
4625 a quantifier. */
8ac170f3 4626
6bf342e1
PH
4627 if (lengthptr != NULL)
4628 {
4629 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4630 code++;
4631 PUTINC(code, 0, 1 + LINK_SIZE);
4632 *code++ = OP_KET;
4633 PUTINC(code, 0, 1 + LINK_SIZE);
8ac170f3
PH
4634 }
4635
6bf342e1
PH
4636 /* Otherwise update the main code pointer to the end of the group. */
4637
4638 else code = tempcode;
4639
4640 /* For a DEFINE group, required and first character settings are not
4641 relevant. */
4642
4643 if (bravalue == OP_DEF) break;
4644
4645 /* Handle updating of the required and first characters for other types of
4646 group. Update for normal brackets of all kinds, and conditions with two
4647 branches (see code above). If the bracket is followed by a quantifier with
4648 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4649 zerofirstbyte outside the main loop so that they can be accessed for the
4650 back off. */
8ac170f3
PH
4651
4652 zeroreqbyte = reqbyte;
4653 zerofirstbyte = firstbyte;
4654 groupsetfirstbyte = FALSE;
4655
6bf342e1 4656 if (bravalue >= OP_ONCE)
8ac170f3
PH
4657 {
4658 /* If we have not yet set a firstbyte in this branch, take it from the
4659 subpattern, remembering that it was set here so that a repeat of more
4660 than one can replicate it as reqbyte if necessary. If the subpattern has
4661 no firstbyte, set "none" for the whole branch. In both cases, a zero
4662 repeat forces firstbyte to "none". */
4663
4664 if (firstbyte == REQ_UNSET)
4665 {
4666 if (subfirstbyte >= 0)
4667 {
4668 firstbyte = subfirstbyte;
4669 groupsetfirstbyte = TRUE;
4670 }
4671 else firstbyte = REQ_NONE;
4672 zerofirstbyte = REQ_NONE;
4673 }
4674
4675 /* If firstbyte was previously set, convert the subpattern's firstbyte
4676 into reqbyte if there wasn't one, using the vary flag that was in
4677 existence beforehand. */
4678
4679 else if (subfirstbyte >= 0 && subreqbyte < 0)
4680 subreqbyte = subfirstbyte | tempreqvary;
4681
4682 /* If the subpattern set a required byte (or set a first byte that isn't
4683 really the first byte - see above), set it. */
4684
4685 if (subreqbyte >= 0) reqbyte = subreqbyte;
4686 }
4687
4688 /* For a forward assertion, we take the reqbyte, if set. This can be
4689 helpful if the pattern that follows the assertion doesn't set a different
4690 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4691 for an assertion, however because it leads to incorrect effect for patterns
4692 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4693 of a firstbyte. This is overcome by a scan at the end if there's no
4694 firstbyte, looking for an asserted first char. */
4695
4696 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
6bf342e1 4697 break; /* End of processing '(' */
8ac170f3 4698
8ac170f3 4699
6bf342e1
PH
4700 /* ===================================================================*/
4701 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
8ac170f3
PH
4702 are arranged to be the negation of the corresponding OP_values. For the
4703 back references, the values are ESC_REF plus the reference number. Only
4704 back references and those types that consume a character may be repeated.
4705 We can test for values between ESC_b and ESC_Z for the latter; this may
4706 have to change if any new ones are ever created. */
4707
6bf342e1
PH
4708 case '\\':
4709 tempptr = ptr;
4710 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4711 if (*errorcodeptr != 0) goto FAILED;
4712
8ac170f3
PH
4713 if (c < 0)
4714 {
4715 if (-c == ESC_Q) /* Handle start of quoted string */
4716 {
4717 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4718 else inescq = TRUE;
4719 continue;
4720 }
4721
6bf342e1
PH
4722 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4723
8ac170f3
PH
4724 /* For metasequences that actually match a character, we disable the
4725 setting of a first character if it hasn't already been set. */
4726
4727 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4728 firstbyte = REQ_NONE;
4729
4730 /* Set values to reset to if this is followed by a zero repeat. */
4731
4732 zerofirstbyte = firstbyte;
4733 zeroreqbyte = reqbyte;
4734
64f2600a
PH
4735 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4736 We also support \k{name} (.NET syntax) */
6bf342e1 4737
64f2600a 4738 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
6bf342e1
PH
4739 {
4740 is_recurse = FALSE;
64f2600a 4741 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
6bf342e1
PH
4742 goto NAMED_REF_OR_RECURSE;
4743 }
4744
4745 /* Back references are handled specially; must disable firstbyte if
4746 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4747 ':' later. */
8ac170f3
PH
4748
4749 if (-c >= ESC_REF)
4750 {
6bf342e1
PH
4751 recno = -c - ESC_REF;
4752
4753 HANDLE_REFERENCE: /* Come here from named backref handling */
4754 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
8ac170f3
PH
4755 previous = code;
4756 *code++ = OP_REF;
6bf342e1
PH
4757 PUT2INC(code, 0, recno);
4758 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4759 if (recno > cd->top_backref) cd->top_backref = recno;
8ac170f3
PH
4760 }
4761
6bf342e1 4762 /* So are Unicode property matches, if supported. */
8ac170f3
PH
4763
4764#ifdef SUPPORT_UCP
4765 else if (-c == ESC_P || -c == ESC_p)
4766 {
4767 BOOL negated;
aa41d2de
PH
4768 int pdata;
4769 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6bf342e1 4770 if (ptype < 0) goto FAILED;
8ac170f3
PH
4771 previous = code;
4772 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
aa41d2de
PH
4773 *code++ = ptype;
4774 *code++ = pdata;
8ac170f3 4775 }
6bf342e1
PH
4776#else
4777
4778 /* If Unicode properties are not supported, \X, \P, and \p are not
4779 allowed. */
4780
4781 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4782 {
4783 *errorcodeptr = ERR45;
4784 goto FAILED;
4785 }
8ac170f3
PH
4786#endif
4787
6bf342e1
PH
4788 /* For the rest (including \X when Unicode properties are supported), we
4789 can obtain the OP value by negating the escape value. */
8ac170f3
PH
4790
4791 else
4792 {
4793 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4794 *code++ = -c;
4795 }
4796 continue;
4797 }
4798
4799 /* We have a data character whose value is in c. In UTF-8 mode it may have
4800 a value > 127. We set its representation in the length/buffer, and then
4801 handle it as a data character. */
4802
4803#ifdef SUPPORT_UTF8
4804 if (utf8 && c > 127)
4805 mclength = _pcre_ord2utf8(c, mcbuffer);
4806 else
4807#endif
4808
4809 {
4810 mcbuffer[0] = c;
4811 mclength = 1;
4812 }
8ac170f3
PH
4813 goto ONE_CHAR;
4814
6bf342e1
PH
4815
4816 /* ===================================================================*/
8ac170f3
PH
4817 /* Handle a literal character. It is guaranteed not to be whitespace or #
4818 when the extended flag is set. If we are in UTF-8 mode, it may be a
4819 multi-byte literal character. */
4820
4821 default:
4822 NORMAL_CHAR:
4823 mclength = 1;
4824 mcbuffer[0] = c;
4825
4826#ifdef SUPPORT_UTF8
6bf342e1 4827 if (utf8 && c >= 0xc0)
8ac170f3
PH
4828 {
4829 while ((ptr[1] & 0xc0) == 0x80)
4830 mcbuffer[mclength++] = *(++ptr);
4831 }
4832#endif
4833
4834 /* At this point we have the character's bytes in mcbuffer, and the length
4835 in mclength. When not in UTF-8 mode, the length is always 1. */
4836
4837 ONE_CHAR:
4838 previous = code;
4839 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4840 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4841
4842 /* Set the first and required bytes appropriately. If no previous first
4843 byte, set it from this character, but revert to none on a zero repeat.
4844 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4845 repeat. */
4846
4847 if (firstbyte == REQ_UNSET)
4848 {
4849 zerofirstbyte = REQ_NONE;
4850 zeroreqbyte = reqbyte;
4851
4852 /* If the character is more than one byte long, we can set firstbyte
4853 only if it is not to be matched caselessly. */
4854
4855 if (mclength == 1 || req_caseopt == 0)
4856 {
4857 firstbyte = mcbuffer[0] | req_caseopt;
4858 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4859 }
4860 else firstbyte = reqbyte = REQ_NONE;
4861 }
4862
4863 /* firstbyte was previously set; we can set reqbyte only if the length is
4864 1 or the matching is caseful. */
4865
4866 else
4867 {
4868 zerofirstbyte = firstbyte;
4869 zeroreqbyte = reqbyte;
4870 if (mclength == 1 || req_caseopt == 0)
4871 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4872 }
4873
4874 break; /* End of literal character handling */
4875 }
4876 } /* end of big loop */
4877
6bf342e1 4878
8ac170f3
PH
4879/* Control never reaches here by falling through, only by a goto for all the
4880error states. Pass back the position in the pattern so that it can be displayed
4881to the user for diagnosing the error. */
4882
4883FAILED:
4884*ptrptr = ptr;
4885return FALSE;
4886}
4887
4888
4889
4890
4891/*************************************************
4892* Compile sequence of alternatives *
4893*************************************************/
4894
6bf342e1
PH
4895/* On entry, ptr is pointing past the bracket character, but on return it
4896points to the closing bracket, or vertical bar, or end of string. The code
4897variable is pointing at the byte into which the BRA operator has been stored.
4898If the ims options are changed at the start (for a (?ims: group) or during any
4899branch, we need to insert an OP_OPT item at the start of every following branch
4900to ensure they get set correctly at run time, and also pass the new options
4901into every subsequent branch compile.
4902
4903This function is used during the pre-compile phase when we are trying to find
4904out the amount of memory needed, as well as during the real compile phase. The
4905value of lengthptr distinguishes the two phases.
8ac170f3 4906
64f2600a 4907Arguments:
8ac170f3
PH
4908 options option bits, including any changes for this subpattern
4909 oldims previous settings of ims option bits
8ac170f3
PH
4910 codeptr -> the address of the current code pointer
4911 ptrptr -> the address of the current pattern pointer
4912 errorcodeptr -> pointer to error code variable
4913 lookbehind TRUE if this is a lookbehind assertion
64f2600a 4914 reset_bracount TRUE to reset the count for each branch
6bf342e1 4915 skipbytes skip this many bytes at start (for brackets and OP_COND)
8ac170f3
PH
4916 firstbyteptr place to put the first required character, or a negative number
4917 reqbyteptr place to put the last required character, or a negative number
4918 bcptr pointer to the chain of currently open branches
4919 cd points to the data block with tables pointers etc.
6bf342e1
PH
4920 lengthptr NULL during the real compile phase
4921 points to length accumulator during pre-compile phase
8ac170f3 4922
6bf342e1 4923Returns: TRUE on success
8ac170f3
PH
4924*/
4925
4926static BOOL
6bf342e1 4927compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
64f2600a
PH
4928 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4929 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4930 int *lengthptr)
8ac170f3
PH
4931{
4932const uschar *ptr = *ptrptr;
4933uschar *code = *codeptr;
4934uschar *last_branch = code;
4935uschar *start_bracket = code;
4936uschar *reverse_count = NULL;
4937int firstbyte, reqbyte;
4938int branchfirstbyte, branchreqbyte;
6bf342e1 4939int length;
64f2600a
PH
4940int orig_bracount;
4941int max_bracount;
8ac170f3
PH
4942branch_chain bc;
4943
4944bc.outer = bcptr;
4945bc.current = code;
4946
4947firstbyte = reqbyte = REQ_UNSET;
4948
6bf342e1
PH
4949/* Accumulate the length for use in the pre-compile phase. Start with the
4950length of the BRA and KET and any extra bytes that are required at the
4951beginning. We accumulate in a local variable to save frequent testing of
4952lenthptr for NULL. We cannot do this by looking at the value of code at the
4953start and end of each alternative, because compiled items are discarded during
4954the pre-compile phase so that the work space is not exceeded. */
4955
4956length = 2 + 2*LINK_SIZE + skipbytes;
4957
4958/* WARNING: If the above line is changed for any reason, you must also change
4959the code that abstracts option settings at the start of the pattern and makes
4960them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4961pre-compile phase to find out whether anything has yet been compiled or not. */
4962
8ac170f3
PH
4963/* Offset is set zero to mark that this bracket is still open */
4964
4965PUT(code, 1, 0);
4966code += 1 + LINK_SIZE + skipbytes;
4967
4968/* Loop for each alternative branch */
4969
64f2600a 4970orig_bracount = max_bracount = cd->bracount;
8ac170f3
PH
4971for (;;)
4972 {
64f2600a
PH
4973 /* For a (?| group, reset the capturing bracket count so that each branch
4974 uses the same numbers. */
4975
4976 if (reset_bracount) cd->bracount = orig_bracount;
4977
8ac170f3
PH
4978 /* Handle a change of ims options at the start of the branch */
4979
4980 if ((options & PCRE_IMS) != oldims)
4981 {
4982 *code++ = OP_OPT;
4983 *code++ = options & PCRE_IMS;
6bf342e1 4984 length += 2;
8ac170f3
PH
4985 }
4986
4987 /* Set up dummy OP_REVERSE if lookbehind assertion */
4988
4989 if (lookbehind)
4990 {
4991 *code++ = OP_REVERSE;
4992 reverse_count = code;
4993 PUTINC(code, 0, 0);
6bf342e1 4994 length += 1 + LINK_SIZE;
8ac170f3
PH
4995 }
4996
6bf342e1
PH
4997 /* Now compile the branch; in the pre-compile phase its length gets added
4998 into the length. */
8ac170f3 4999
6bf342e1
PH
5000 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5001 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
8ac170f3
PH
5002 {
5003 *ptrptr = ptr;
5004 return FALSE;
5005 }
5006
64f2600a
PH
5007 /* Keep the highest bracket count in case (?| was used and some branch
5008 has fewer than the rest. */
5009
5010 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5011
6bf342e1 5012 /* In the real compile phase, there is some post-processing to be done. */
8ac170f3 5013
6bf342e1 5014 if (lengthptr == NULL)
8ac170f3 5015 {
6bf342e1
PH
5016 /* If this is the first branch, the firstbyte and reqbyte values for the
5017 branch become the values for the regex. */
8ac170f3 5018
6bf342e1
PH
5019 if (*last_branch != OP_ALT)
5020 {
5021 firstbyte = branchfirstbyte;
5022 reqbyte = branchreqbyte;
5023 }
8ac170f3 5024
6bf342e1
PH
5025 /* If this is not the first branch, the first char and reqbyte have to
5026 match the values from all the previous branches, except that if the
5027 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5028 and we set REQ_VARY for the regex. */
8ac170f3 5029
6bf342e1 5030 else
8ac170f3 5031 {
6bf342e1
PH
5032 /* If we previously had a firstbyte, but it doesn't match the new branch,
5033 we have to abandon the firstbyte for the regex, but if there was
5034 previously no reqbyte, it takes on the value of the old firstbyte. */
5035
5036 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5037 {
5038 if (reqbyte < 0) reqbyte = firstbyte;
5039 firstbyte = REQ_NONE;
5040 }
8ac170f3 5041
6bf342e1
PH
5042 /* If we (now or from before) have no firstbyte, a firstbyte from the
5043 branch becomes a reqbyte if there isn't a branch reqbyte. */
8ac170f3 5044
6bf342e1
PH
5045 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5046 branchreqbyte = branchfirstbyte;
8ac170f3 5047
6bf342e1 5048 /* Now ensure that the reqbytes match */
8ac170f3 5049
6bf342e1
PH
5050 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5051 reqbyte = REQ_NONE;
5052 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5053 }
8ac170f3 5054
6bf342e1
PH
5055 /* If lookbehind, check that this branch matches a fixed-length string, and
5056 put the length into the OP_REVERSE item. Temporarily mark the end of the
5057 branch with OP_END. */
8ac170f3 5058
6bf342e1 5059 if (lookbehind)
8ac170f3 5060 {
6bf342e1
PH
5061 int fixed_length;
5062 *code = OP_END;
5063 fixed_length = find_fixedlength(last_branch, options);
5064 DPRINTF(("fixed length = %d\n", fixed_length));
5065 if (fixed_length < 0)
5066 {
5067 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5068 *ptrptr = ptr;
5069 return FALSE;
5070 }
5071 PUT(reverse_count, 0, fixed_length);
8ac170f3 5072 }
8ac170f3
PH
5073 }
5074
64f2600a
PH
5075 /* Reached end of expression, either ')' or end of pattern. In the real
5076 compile phase, go back through the alternative branches and reverse the chain
5077 of offsets, with the field in the BRA item now becoming an offset to the
5078 first alternative. If there are no alternatives, it points to the end of the
5079 group. The length in the terminating ket is always the length of the whole
5080 bracketed item. If any of the ims options were changed inside the group,
5081 compile a resetting op-code following, except at the very end of the pattern.
5082 Return leaving the pointer at the terminating char. */
8ac170f3
PH
5083
5084 if (*ptr != '|')
5085 {
64f2600a 5086 if (lengthptr == NULL)
8ac170f3 5087 {
64f2600a
PH
5088 int branch_length = code - last_branch;
5089 do
5090 {
5091 int prev_length = GET(last_branch, 1);
5092 PUT(last_branch, 1, branch_length);
5093 branch_length = prev_length;
5094 last_branch -= branch_length;
5095 }
5096 while (branch_length > 0);
8ac170f3 5097 }
8ac170f3
PH
5098
5099 /* Fill in the ket */
5100
5101 *code = OP_KET;
5102 PUT(code, 1, code - start_bracket);
5103 code += 1 + LINK_SIZE;
5104
5105 /* Resetting option if needed */
5106
5107 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5108 {
5109 *code++ = OP_OPT;
5110 *code++ = oldims;
6bf342e1 5111 length += 2;
8ac170f3
PH
5112 }
5113
64f2600a
PH
5114 /* Retain the highest bracket number, in case resetting was used. */
5115
5116 cd->bracount = max_bracount;
5117
8ac170f3
PH
5118 /* Set values to pass back */
5119
5120 *codeptr = code;
5121 *ptrptr = ptr;
5122 *firstbyteptr = firstbyte;
5123 *reqbyteptr = reqbyte;
6bf342e1 5124 if (lengthptr != NULL) *lengthptr += length;
8ac170f3
PH
5125 return TRUE;
5126 }
5127
64f2600a
PH
5128 /* Another branch follows. In the pre-compile phase, we can move the code
5129 pointer back to where it was for the start of the first branch. (That is,
5130 pretend that each branch is the only one.)
5131
5132 In the real compile phase, insert an ALT node. Its length field points back
8ac170f3
PH
5133 to the previous branch while the bracket remains open. At the end the chain
5134 is reversed. It's done like this so that the start of the bracket has a
5135 zero offset until it is closed, making it possible to detect recursion. */
5136
64f2600a
PH
5137 if (lengthptr != NULL)
5138 {
5139 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5140 length += 1 + LINK_SIZE;
5141 }
5142 else
5143 {
5144 *code = OP_ALT;
5145 PUT(code, 1, code - last_branch);
5146 bc.current = last_branch = code;
5147 code += 1 + LINK_SIZE;
5148 }
5149
8ac170f3
PH
5150 ptr++;
5151 }
5152/* Control never reaches here */
5153}
5154
5155
5156
5157
5158/*************************************************
5159* Check for anchored expression *
5160*************************************************/
5161
5162/* Try to find out if this is an anchored regular expression. Consider each
5163alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5164all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5165it's anchored. However, if this is a multiline pattern, then only OP_SOD
5166counts, since OP_CIRC can match in the middle.
5167
5168We can also consider a regex to be anchored if OP_SOM starts all its branches.
5169This is the code for \G, which means "match at start of match position, taking
5170into account the match offset".
5171
5172A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5173because that will try the rest of the pattern at all possible matching points,
5174so there is no point trying again.... er ....
5175
5176.... except when the .* appears inside capturing parentheses, and there is a
5177subsequent back reference to those parentheses. We haven't enough information
5178to catch that case precisely.
5179
5180At first, the best we could do was to detect when .* was in capturing brackets
5181and the highest back reference was greater than or equal to that level.
5182However, by keeping a bitmap of the first 31 back references, we can catch some
5183of the more common cases more precisely.
5184
5185Arguments:
5186 code points to start of expression (the bracket)
5187 options points to the options setting
5188 bracket_map a bitmap of which brackets we are inside while testing; this
5189 handles up to substring 31; after that we just have to take
5190 the less precise approach
5191 backref_map the back reference bitmap
5192
5193Returns: TRUE or FALSE
5194*/
5195
5196static BOOL
5197is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5198 unsigned int backref_map)
5199{
5200do {
6bf342e1
PH
5201 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5202 options, PCRE_MULTILINE, FALSE);
8ac170f3
PH
5203 register int op = *scode;
5204
6bf342e1
PH
5205 /* Non-capturing brackets */
5206
5207 if (op == OP_BRA)
5208 {
5209 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5210 }
5211
8ac170f3
PH
5212 /* Capturing brackets */
5213
6bf342e1 5214 else if (op == OP_CBRA)
8ac170f3 5215 {
6bf342e1
PH
5216 int n = GET2(scode, 1+LINK_SIZE);
5217 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8ac170f3
PH
5218 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5219 }
5220
5221 /* Other brackets */
5222
6bf342e1 5223 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
8ac170f3
PH
5224 {
5225 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5226 }
5227
5228 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5229 are or may be referenced. */
5230
6bf342e1
PH
5231 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5232 op == OP_TYPEPOSSTAR) &&
8ac170f3
PH
5233 (*options & PCRE_DOTALL) != 0)
5234 {
5235 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5236 }
5237
5238 /* Check for explicit anchoring */
5239
5240 else if (op != OP_SOD && op != OP_SOM &&
5241 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5242 return FALSE;
5243 code += GET(code, 1);
5244 }
5245while (*code == OP_ALT); /* Loop for each alternative */
5246return TRUE;
5247}
5248
5249
5250
5251/*************************************************
5252* Check for starting with ^ or .* *
5253*************************************************/
5254
5255/* This is called to find out if every branch starts with ^ or .* so that
5256"first char" processing can be done to speed things up in multiline
5257matching and for non-DOTALL patterns that start with .* (which must start at
5258the beginning or after \n). As in the case of is_anchored() (see above), we
5259have to take account of back references to capturing brackets that contain .*
5260because in that case we can't make the assumption.
5261
5262Arguments:
5263 code points to start of expression (the bracket)
5264 bracket_map a bitmap of which brackets we are inside while testing; this
5265 handles up to substring 31; after that we just have to take
5266 the less precise approach
5267 backref_map the back reference bitmap
5268
5269Returns: TRUE or FALSE
5270*/
5271
5272static BOOL
5273is_startline(const uschar *code, unsigned int bracket_map,
5274 unsigned int backref_map)
5275{
5276do {
6bf342e1
PH
5277 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5278 NULL, 0, FALSE);
8ac170f3
PH
5279 register int op = *scode;
5280
6bf342e1
PH
5281 /* Non-capturing brackets */
5282
5283 if (op == OP_BRA)
5284 {
5285 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5286 }
5287
8ac170f3
PH
5288 /* Capturing brackets */
5289
6bf342e1 5290 else if (op == OP_CBRA)
8ac170f3 5291 {
6bf342e1
PH
5292 int n = GET2(scode, 1+LINK_SIZE);
5293 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8ac170f3
PH
5294 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5295 }
5296
5297 /* Other brackets */
5298
6bf342e1 5299 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
8ac170f3
PH
5300 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5301
5302 /* .* means "start at start or after \n" if it isn't in brackets that
5303 may be referenced. */
5304
6bf342e1 5305 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8ac170f3
PH
5306 {
5307 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5308 }
5309
5310 /* Check for explicit circumflex */
5311
5312 else if (op != OP_CIRC) return FALSE;
5313
5314 /* Move on to the next alternative */
5315
5316 code += GET(code, 1);
5317 }
5318while (*code == OP_ALT); /* Loop for each alternative */
5319return TRUE;
5320}
5321
5322
5323
5324/*************************************************
5325* Check for asserted fixed first char *
5326*************************************************/
5327
5328/* During compilation, the "first char" settings from forward assertions are
5329discarded, because they can cause conflicts with actual literals that follow.
5330However, if we end up without a first char setting for an unanchored pattern,
5331it is worth scanning the regex to see if there is an initial asserted first
5332char. If all branches start with the same asserted char, or with a bracket all
5333of whose alternatives start with the same asserted char (recurse ad lib), then
5334we return that char, otherwise -1.
5335
5336Arguments:
5337 code points to start of expression (the bracket)
5338 options pointer to the options (used to check casing changes)
5339 inassert TRUE if in an assertion
5340
5341Returns: -1 or the fixed first char
5342*/
5343
5344static int
5345find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5346{
5347register int c = -1;
5348do {
5349 int d;
5350 const uschar *scode =
5351 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5352 register int op = *scode;
5353
8ac170f3
PH
5354 switch(op)
5355 {
5356 default:
5357 return -1;
5358
5359 case OP_BRA:
6bf342e1 5360 case OP_CBRA:
8ac170f3
PH
5361 case OP_ASSERT:
5362 case OP_ONCE:
5363 case OP_COND:
5364 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5365 return -1;
5366 if (c < 0) c = d; else if (c != d) return -1;
5367 break;
5368
5369 case OP_EXACT: /* Fall through */
5370 scode += 2;
5371
5372 case OP_CHAR:
5373 case OP_CHARNC:
5374 case OP_PLUS:
5375 case OP_MINPLUS:
6bf342e1 5376 case OP_POSPLUS:
8ac170f3
PH
5377 if (!inassert) return -1;
5378 if (c < 0)
5379 {
5380 c = scode[1];
5381 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5382 }
5383 else if (c != scode[1]) return -1;
5384 break;
5385 }
5386
5387 code += GET(code, 1);
5388 }
5389while (*code == OP_ALT);
5390return c;
5391}
5392
5393
5394
5395/*************************************************
5396* Compile a Regular Expression *
5397*************************************************/
5398
5399/* This function takes a string and returns a pointer to a block of store
5400holding a compiled version of the expression. The original API for this
5401function had no error code return variable; it is retained for backwards
5402compatibility. The new function is given a new name.
5403
5404Arguments:
5405 pattern the regular expression
5406 options various option bits
5407 errorcodeptr pointer to error code variable (pcre_compile2() only)
5408 can be NULL if you don't want a code value
5409 errorptr pointer to pointer to error text
5410 erroroffset ptr offset in pattern where error was detected
5411 tables pointer to character tables or NULL
5412
5413Returns: pointer to compiled data block, or NULL on error,
5414 with errorptr and erroroffset set
5415*/
5416
64f2600a 5417PCRE_EXP_DEFN pcre *
8ac170f3
PH
5418pcre_compile(const char *pattern, int options, const char **errorptr,
5419 int *erroroffset, const unsigned char *tables)
5420{
5421return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5422}
5423
5424
64f2600a 5425PCRE_EXP_DEFN pcre *
8ac170f3
PH
5426pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5427 const char **errorptr, int *erroroffset, const unsigned char *tables)
5428{
5429real_pcre *re;
6bf342e1
PH
5430int length = 1; /* For final END opcode */
5431int firstbyte, reqbyte, newline;
8ac170f3
PH
5432int errorcode = 0;
5433#ifdef SUPPORT_UTF8
5434BOOL utf8;
8ac170f3 5435#endif
8ac170f3
PH
5436size_t size;
5437uschar *code;
5438const uschar *codestart;
5439const uschar *ptr;
5440compile_data compile_block;
aa41d2de 5441compile_data *cd = &compile_block;
6bf342e1
PH
5442
5443/* This space is used for "compiling" into during the first phase, when we are
5444computing the amount of memory that is needed. Compiled items are thrown away
5445as soon as possible, so that a fairly large buffer should be sufficient for
5446this purpose. The same space is used in the second phase for remembering where
5447to fill in forward references to subpatterns. */
5448
5449uschar cworkspace[COMPILE_WORK_SIZE];
5450
5451
5452/* Set this early so that early errors get offset 0. */
5453
5454ptr = (const uschar *)pattern;
8ac170f3
PH
5455
5456/* We can't pass back an error message if errorptr is NULL; I guess the best we
5457can do is just return NULL, but we can set a code value if there is a code
5458pointer. */
5459
5460if (errorptr == NULL)
5461 {
5462 if (errorcodeptr != NULL) *errorcodeptr = 99;
5463 return NULL;
5464 }
5465
5466*errorptr = NULL;
5467if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5468
5469/* However, we can give a message for this error */
5470
5471if (erroroffset == NULL)
5472 {
5473 errorcode = ERR16;
64f2600a 5474 goto PCRE_EARLY_ERROR_RETURN2;
8ac170f3
PH
5475 }
5476
5477*erroroffset = 0;
5478
5479/* Can't support UTF8 unless PCRE has been compiled to include the code. */
5480
5481#ifdef SUPPORT_UTF8
5482utf8 = (options & PCRE_UTF8) != 0;
5483if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5484 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5485 {
5486 errorcode = ERR44;
64f2600a 5487 goto PCRE_EARLY_ERROR_RETURN2;
8ac170f3
PH
5488 }
5489#else
5490if ((options & PCRE_UTF8) != 0)
5491 {
5492 errorcode = ERR32;
5493 goto PCRE_EARLY_ERROR_RETURN;
5494 }
5495#endif
5496
5497if ((options & ~PUBLIC_OPTIONS) != 0)
5498 {
5499 errorcode = ERR17;
5500 goto PCRE_EARLY_ERROR_RETURN;
5501 }
5502
5503/* Set up pointers to the individual character tables */
5504
5505if (tables == NULL) tables = _pcre_default_tables;
aa41d2de
PH
5506cd->lcc = tables + lcc_offset;
5507cd->fcc = tables + fcc_offset;
5508cd->cbits = tables + cbits_offset;
5509cd->ctypes = tables + ctypes_offset;
5510
6bf342e1 5511/* Handle different types of newline. The three bits give seven cases. The
64f2600a
PH
5512current code allows for fixed one- or two-byte sequences, plus "any" and
5513"anycrlf". */
aa41d2de 5514
6bf342e1 5515switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
aa41d2de 5516 {
6bf342e1 5517 case 0: newline = NEWLINE; break; /* Compile-time default */
aa41d2de
PH
5518 case PCRE_NEWLINE_CR: newline = '\r'; break;
5519 case PCRE_NEWLINE_LF: newline = '\n'; break;
5520 case PCRE_NEWLINE_CR+
5521 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6bf342e1 5522 case PCRE_NEWLINE_ANY: newline = -1; break;
64f2600a 5523 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6bf342e1 5524 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
aa41d2de
PH
5525 }
5526
64f2600a
PH
5527if (newline == -2)
5528 {
5529 cd->nltype = NLTYPE_ANYCRLF;
5530 }
5531else if (newline < 0)
aa41d2de 5532 {
6bf342e1 5533 cd->nltype = NLTYPE_ANY;
aa41d2de
PH
5534 }
5535else
5536 {
6bf342e1
PH
5537 cd->nltype = NLTYPE_FIXED;
5538 if (newline > 255)
5539 {
5540 cd->nllen = 2;
5541 cd->nl[0] = (newline >> 8) & 255;
5542 cd->nl[1] = newline & 255;
5543 }
5544 else
5545 {
5546 cd->nllen = 1;
5547 cd->nl[0] = newline;
5548 }
aa41d2de 5549 }
8ac170f3 5550
6bf342e1
PH
5551/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5552references to help in deciding whether (.*) can be treated as anchored or not.
5553*/
8ac170f3 5554
aa41d2de
PH
5555cd->top_backref = 0;
5556cd->backref_map = 0;
8ac170f3
PH
5557
5558/* Reflect pattern for debugging output */
5559
5560DPRINTF(("------------------------------------------------------------------\n"));
5561DPRINTF(("%s\n", pattern));
5562
6bf342e1
PH
5563/* Pretend to compile the pattern while actually just accumulating the length
5564of memory required. This behaviour is triggered by passing a non-NULL final
5565argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5566to compile parts of the pattern into; the compiled code is discarded when it is
5567no longer needed, so hopefully this workspace will never overflow, though there
5568is a test for its doing so. */
8ac170f3 5569
6bf342e1
PH
5570cd->bracount = 0;
5571cd->names_found = 0;
5572cd->name_entry_size = 0;
5573cd->name_table = NULL;
5574cd->start_workspace = cworkspace;
5575cd->start_code = cworkspace;
5576cd->hwm = cworkspace;
5577cd->start_pattern = (const uschar *)pattern;
5578cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5579cd->req_varyopt = 0;
5580cd->nopartial = FALSE;
5581cd->external_options = options;
8ac170f3 5582
6bf342e1
PH
5583/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5584don't need to look at the result of the function here. The initial options have
5585been put into the cd block so that they can be changed if an option setting is
5586found within the regex right at the beginning. Bringing initial option settings
5587outside can help speed up starting point checks. */
8ac170f3 5588
6bf342e1
PH
5589code = cworkspace;
5590*code = OP_BRA;
5591(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
64f2600a
PH
5592 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5593 &length);
6bf342e1 5594if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
8ac170f3 5595
6bf342e1
PH
5596DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5597 cd->hwm - cworkspace));
8ac170f3 5598
6bf342e1
PH
5599if (length > MAX_PATTERN_SIZE)
5600 {
5601 errorcode = ERR20;
5602 goto PCRE_EARLY_ERROR_RETURN;
5603 }
8ac170f3 5604
6bf342e1
PH
5605/* Compute the size of data block needed and get it, either from malloc or
5606externally provided function. Integer overflow should no longer be possible
5607because nowadays we limit the maximum value of cd->names_found and
5608cd->name_entry_size. */
8ac170f3 5609
6bf342e1
PH
5610size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5611re = (real_pcre *)(pcre_malloc)(size);
8ac170f3 5612
6bf342e1
PH
5613if (re == NULL)
5614 {
5615 errorcode = ERR21;
5616 goto PCRE_EARLY_ERROR_RETURN;
5617 }
8ac170f3 5618
6bf342e1
PH
5619/* Put in the magic number, and save the sizes, initial options, and character
5620table pointer. NULL is used for the default character tables. The nullpad field
5621is at the end; it's there to help in the case when a regex compiled on a system
5622with 4-byte pointers is run on another with 8-byte pointers. */
8ac170f3 5623
6bf342e1
PH
5624re->magic_number = MAGIC_NUMBER;
5625re->size = size;
5626re->options = cd->external_options;
5627re->dummy1 = 0;
5628re->first_byte = 0;
5629re->req_byte = 0;
5630re->name_table_offset = sizeof(real_pcre);
5631re->name_entry_size = cd->name_entry_size;
5632re->name_count = cd->names_found;
5633re->ref_count = 0;
5634re->tables = (tables == _pcre_default_tables)? NULL : tables;
5635re->nullpad = NULL;
8ac170f3 5636
6bf342e1
PH
5637/* The starting points of the name/number translation table and of the code are
5638passed around in the compile data block. The start/end pattern and initial
5639options are already set from the pre-compile phase, as is the name_entry_size
5640field. Reset the bracket count and the names_found field. Also reset the hwm
5641field; this time it's used for remembering forward references to subpatterns.
5642*/
8ac170f3 5643
6bf342e1
PH
5644cd->bracount = 0;
5645cd->names_found = 0;
5646cd->name_table = (uschar *)re + re->name_table_offset;
5647codestart = cd->name_table + re->name_entry_size * re->name_count;
5648cd->start_code = codestart;
5649cd->hwm = cworkspace;
5650cd->req_varyopt = 0;
5651cd->nopartial = FALSE;
8ac170f3 5652
6bf342e1
PH
5653/* Set up a starting, non-extracting bracket, then compile the expression. On
5654error, errorcode will be set non-zero, so we don't need to look at the result
5655of the function here. */
8ac170f3 5656
6bf342e1
PH
5657ptr = (const uschar *)pattern;
5658code = (uschar *)codestart;
5659*code = OP_BRA;
5660(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
64f2600a 5661 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6bf342e1
PH
5662re->top_bracket = cd->bracount;
5663re->top_backref = cd->top_backref;
8ac170f3 5664
6bf342e1 5665if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
8ac170f3 5666
6bf342e1 5667/* If not reached end of pattern on success, there's an excess bracket. */
8ac170f3 5668
6bf342e1 5669if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
8ac170f3 5670
6bf342e1
PH
5671/* Fill in the terminating state and check for disastrous overflow, but
5672if debugging, leave the test till after things are printed out. */
8ac170f3 5673
6bf342e1 5674*code++ = OP_END;
8ac170f3 5675
6bf342e1
PH
5676#ifndef DEBUG
5677if (code - codestart > length) errorcode = ERR23;
8ac170f3
PH
5678#endif
5679
6bf342e1 5680/* Fill in any forward references that are required. */
8ac170f3 5681
6bf342e1 5682while (errorcode == 0 && cd->hwm > cworkspace)
8ac170f3 5683 {
6bf342e1
PH
5684 int offset, recno;
5685 const uschar *groupptr;
5686 cd->hwm -= LINK_SIZE;
5687 offset = GET(cd->hwm, 0);
5688 recno = GET(codestart, offset);
5689 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5690 if (groupptr == NULL) errorcode = ERR53;
5691 else PUT(((uschar *)codestart), offset, groupptr - codestart);
8ac170f3
PH
5692 }
5693
8ac170f3
PH
5694/* Give an error if there's back reference to a non-existent capturing
5695subpattern. */
5696
6bf342e1 5697if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
8ac170f3
PH
5698
5699/* Failed to compile, or error while post-processing */
5700
5701if (errorcode != 0)
5702 {
5703 (pcre_free)(re);
8ac170f3 5704 PCRE_EARLY_ERROR_RETURN:
6bf342e1 5705 *erroroffset = ptr - (const uschar *)pattern;
64f2600a 5706 PCRE_EARLY_ERROR_RETURN2:
8ac170f3
PH
5707 *errorptr = error_texts[errorcode];
5708 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5709 return NULL;
5710 }
5711
5712/* If the anchored option was not passed, set the flag if we can determine that
5713the pattern is anchored by virtue of ^ characters or \A or anything else (such
5714as starting with .* when DOTALL is set).
5715
6bf342e1 5716Otherwise, if we know what the first byte has to be, save it, because that
8ac170f3
PH
5717speeds up unanchored matches no end. If not, see if we can set the
5718PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5719start with ^. and also when all branches start with .* for non-DOTALL matches.
5720*/
5721
6bf342e1 5722if ((re->options & PCRE_ANCHORED) == 0)
8ac170f3 5723 {
6bf342e1 5724 int temp_options = re->options; /* May get changed during these scans */
aa41d2de 5725 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
8ac170f3
PH
5726 re->options |= PCRE_ANCHORED;
5727 else
5728 {
5729 if (firstbyte < 0)
5730 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5731 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5732 {
5733 int ch = firstbyte & 255;
5734 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
aa41d2de 5735 cd->fcc[ch] == ch)? ch : firstbyte;
8ac170f3
PH
5736 re->options |= PCRE_FIRSTSET;
5737 }
aa41d2de 5738 else if (is_startline(codestart, 0, cd->backref_map))
8ac170f3
PH
5739 re->options |= PCRE_STARTLINE;
5740 }
5741 }
5742
5743/* For an anchored pattern, we use the "required byte" only if it follows a
5744variable length item in the regex. Remove the caseless flag for non-caseable
5745bytes. */
5746
5747if (reqbyte >= 0 &&
5748 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5749 {
5750 int ch = reqbyte & 255;
5751 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
aa41d2de 5752 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
8ac170f3
PH
5753 re->options |= PCRE_REQCHSET;
5754 }
5755
aa41d2de
PH
5756/* Print out the compiled data if debugging is enabled. This is never the
5757case when building a production library. */
8ac170f3
PH
5758
5759#ifdef DEBUG
5760
5761printf("Length = %d top_bracket = %d top_backref = %d\n",
5762 length, re->top_bracket, re->top_backref);
5763
5764if (re->options != 0)
5765 {
aa41d2de 5766 printf("%s%s%s%s%s%s%s%s%s\n",
8ac170f3
PH
5767 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5768 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5769 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
8ac170f3
PH
5770 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5771 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5772 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5773 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5774 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5775 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5776 }
5777
5778if ((re->options & PCRE_FIRSTSET) != 0)
5779 {
5780 int ch = re->first_byte & 255;
aa41d2de
PH
5781 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5782 "" : " (caseless)";
8ac170f3
PH
5783 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5784 else printf("First char = \\x%02x%s\n", ch, caseless);
5785 }
5786
5787if ((re->options & PCRE_REQCHSET) != 0)
5788 {
5789 int ch = re->req_byte & 255;
aa41d2de
PH
5790 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5791 "" : " (caseless)";
8ac170f3
PH
5792 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5793 else printf("Req char = \\x%02x%s\n", ch, caseless);
5794 }
5795
64f2600a 5796pcre_printint(re, stdout, TRUE);
8ac170f3
PH
5797
5798/* This check is done here in the debugging case so that the code that
5799was compiled can be seen. */
5800
5801if (code - codestart > length)
5802 {
5803 (pcre_free)(re);
5804 *errorptr = error_texts[ERR23];
5805 *erroroffset = ptr - (uschar *)pattern;
5806 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5807 return NULL;
5808 }
6bf342e1 5809#endif /* DEBUG */
8ac170f3
PH
5810
5811return (pcre *)re;
5812}
5813
5814/* End of pcre_compile.c */