Updated embedded PCRE to version 7.4 to avoid 2 CVE issues:-
[exim.git] / src / src / pcre / pcre_compile.c
CommitLineData
47db1125 1/* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.6 2007/11/12 13:02:19 nm4 Exp $ */
8ac170f3
PH
2
3/*************************************************
4* Perl-Compatible Regular Expressions *
5*************************************************/
6
7/* PCRE is a library of functions to support regular expressions whose syntax
8and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
64f2600a 11 Copyright (c) 1997-2007 University of Cambridge
8ac170f3
PH
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42
43/* This module contains the external function pcre_compile(), along with
44supporting internal functions that are not used by other modules. */
45
46
47db1125
NM
47#ifdef HAVE_CONFIG_H
48#include "config.h"
49#endif
50
6bf342e1
PH
51#define NLBLOCK cd /* Block containing newline information */
52#define PSSTART start_pattern /* Field containing processed string start */
53#define PSEND end_pattern /* Field containing processed string end */
54
8ac170f3
PH
55#include "pcre_internal.h"
56
57
aa41d2de
PH
58/* When DEBUG is defined, we need the pcre_printint() function, which is also
59used by pcretest. DEBUG is not defined when building a production library. */
60
61#ifdef DEBUG
62#include "pcre_printint.src"
63#endif
64
65
64f2600a
PH
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number. Both arguments and the whole
expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) expand correctly. Note that "b" is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
69
47db1125
NM
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
76
64f2600a 77
8ac170f3
PH
78/*************************************************
79* Code parameters and static tables *
80*************************************************/
81
6bf342e1
PH
/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)
8ac170f3
PH
95
96
97/* Table for handling escaped characters in the range '0'-'z'. Positive returns
98are simple data values; negative values are for special things like \d and so
99on. Zero means further processing is needed (for things like \x), or the escape
100is invalid. */
101
64f2600a 102#ifndef EBCDIC /* This is the "normal" table for ASCII systems */
8ac170f3
PH
103static const short int escapes[] = {
104 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
105 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
106 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
64f2600a
PH
107-ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
108-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
8ac170f3
PH
109-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
110 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
64f2600a
PH
111-ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
112-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
8ac170f3
PH
113 0, 0, -ESC_z /* x - z */
114};
115
64f2600a 116#else /* This is the "abnormal" table for EBCDIC systems */
8ac170f3
PH
117static const short int escapes[] = {
118/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
119/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
120/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
121/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
122/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
123/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
124/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
125/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
64f2600a 126/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
6bf342e1 127/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
8ac170f3 128/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
64f2600a 129/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
8ac170f3
PH
130/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
131/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
132/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
133/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
64f2600a 134/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
47db1125 135/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
6bf342e1 136/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
64f2600a 137/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
8ac170f3
PH
138/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
139/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
140/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
141};
142#endif
143
144
47db1125
NM
145/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
146searched linearly. Put all the names into a single string, in order to reduce
147the number of relocations when a shared library is dynamically linked. */
148
149typedef struct verbitem {
150 int len;
151 int op;
152} verbitem;
153
154static const char verbnames[] =
155 "ACCEPT\0"
156 "COMMIT\0"
157 "F\0"
158 "FAIL\0"
159 "PRUNE\0"
160 "SKIP\0"
161 "THEN";
162
163static verbitem verbs[] = {
164 { 6, OP_ACCEPT },
165 { 6, OP_COMMIT },
166 { 1, OP_FAIL },
167 { 4, OP_FAIL },
168 { 5, OP_PRUNE },
169 { 4, OP_SKIP },
170 { 4, OP_THEN }
171};
172
173static int verbcount = sizeof(verbs)/sizeof(verbitem);
174
8ac170f3 175
47db1125
NM
/* Tables of names of POSIX character classes and their lengths. The names are
now all in a single string, to reduce the number of relocations when a shared
library is dynamically loaded. The list of lengths is terminated by a zero
length entry. The first three must be alpha, lower, upper, as this is assumed
for handling case independence. */

static const char posix_names[] =
  "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
  "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
  "word\0"   "xdigit";
8ac170f3
PH
186
187static const uschar posix_name_lengths[] = {
188 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
189
aa41d2de
PH
190/* Table of class bit maps for each POSIX class. Each class is formed from a
191base map, with an optional addition or removal of another map. Then, for some
192classes, there is some additional tweaking: for [:blank:] the vertical space
193characters are removed, and for [:alpha:] and [:alnum:] the underscore
194character is removed. The triples in the table consist of the base map offset,
195second map offset or -1 if no second map, and a non-negative value for map
196addition or a negative value for map subtraction (if there are two maps). The
197absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
198remove vertical space characters, 2 => remove underscore. */
8ac170f3
PH
199
200static const int posix_class_maps[] = {
aa41d2de
PH
201 cbit_word, cbit_digit, -2, /* alpha */
202 cbit_lower, -1, 0, /* lower */
203 cbit_upper, -1, 0, /* upper */
204 cbit_word, -1, 2, /* alnum - word without underscore */
205 cbit_print, cbit_cntrl, 0, /* ascii */
206 cbit_space, -1, 1, /* blank - a GNU extension */
207 cbit_cntrl, -1, 0, /* cntrl */
208 cbit_digit, -1, 0, /* digit */
209 cbit_graph, -1, 0, /* graph */
210 cbit_print, -1, 0, /* print */
211 cbit_punct, -1, 0, /* punct */
212 cbit_space, -1, 0, /* space */
213 cbit_word, -1, 0, /* word - a Perl extension */
214 cbit_xdigit,-1, 0 /* xdigit */
8ac170f3
PH
215};
216
217
6bf342e1
PH
/* Two-level stringification: XSTRING(X) yields the macro-expanded value of X
as a string literal. It is used to insert numeric limits into error texts. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. This used to be a table of strings, but in order to reduce
the number of relocations needed when a shared library is loaded dynamically,
it is now one long string. We cannot use a table of offsets, because the
lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
simply count through to the one we want - this isn't a performance issue
because these strings are used only when there is a compilation error. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (?\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{...} sequence is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced name or an optionally braced non-zero number\0"
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
  "(*VERB) with an argument is not supported\0"
  /* 60 */
  "(*VERB) not recognized\0"
  "number is too big";
8ac170f3
PH
306
307
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif
430
431
432/* Definition to allow mutual recursion */
433
434static BOOL
64f2600a
PH
435 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 int *, int *, branch_chain *, compile_data *, int *);
8ac170f3
PH
437
438
439
47db1125
NM
440/*************************************************
441* Find an error text *
442*************************************************/
443
444/* The error texts are now all in one long string, to save on relocations. As
445some of the text is of unknown length, we can't use a table of offsets.
446Instead, just count through the strings. This is not a performance issue
447because it happens only when there has been a compilation error.
448
449Argument: the error number
450Returns: pointer to the error string
451*/
452
453static const char *
454find_error_text(int n)
455{
456const char *s = error_texts;
457for (; n > 0; n--) while (*s++ != 0);
458return s;
459}
460
461
8ac170f3
PH
462/*************************************************
463* Handle escapes *
464*************************************************/
465
466/* This function is called when a \ has been encountered. It either returns a
467positive value for a simple escape such as \n, or a negative value which
6bf342e1
PH
468encodes one of the more complicated things such as \d. A backreference to group
469n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471ptr is pointing at the \. On exit, it is on the final character of the escape
472sequence.
8ac170f3
PH
473
474Arguments:
475 ptrptr points to the pattern position pointer
476 errorcodeptr points to the errorcode variable
477 bracount number of previous extracting brackets
478 options the options bits
479 isclass TRUE if inside a character class
480
481Returns: zero or positive => a data character
482 negative => a special escape sequence
47db1125 483 on error, errorcodeptr is set
8ac170f3
PH
484*/
485
486static int
487check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488 int options, BOOL isclass)
489{
aa41d2de
PH
490BOOL utf8 = (options & PCRE_UTF8) != 0;
491const uschar *ptr = *ptrptr + 1;
8ac170f3
PH
492int c, i;
493
aa41d2de
PH
494GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495ptr--; /* Set pointer back to the last byte */
496
8ac170f3
PH
497/* If backslash is at the end of the pattern, it's an error. */
498
8ac170f3
PH
499if (c == 0) *errorcodeptr = ERR1;
500
501/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
502a table. A non-zero result is something that can be returned immediately.
503Otherwise further processing may be required. */
504
64f2600a 505#ifndef EBCDIC /* ASCII coding */
8ac170f3
PH
506else if (c < '0' || c > 'z') {} /* Not alphameric */
507else if ((i = escapes[c - '0']) != 0) c = i;
508
64f2600a 509#else /* EBCDIC coding */
8ac170f3
PH
510else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
511else if ((i = escapes[c - 0x48]) != 0) c = i;
512#endif
513
514/* Escapes that need further processing, or are illegal. */
515
516else
517 {
518 const uschar *oldptr;
6bf342e1
PH
519 BOOL braced, negated;
520
8ac170f3
PH
521 switch (c)
522 {
523 /* A number of Perl escapes are not handled by PCRE. We give an explicit
524 error. */
525
526 case 'l':
527 case 'L':
528 case 'N':
529 case 'u':
530 case 'U':
531 *errorcodeptr = ERR37;
532 break;
533
6bf342e1
PH
534 /* \g must be followed by a number, either plain or braced. If positive, it
535 is an absolute backreference. If negative, it is a relative backreference.
64f2600a
PH
536 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537 reference to a named group. This is part of Perl's movement towards a
538 unified syntax for back references. As this is synonymous with \k{name}, we
539 fudge it up by pretending it really was \k. */
6bf342e1
PH
540
541 case 'g':
542 if (ptr[1] == '{')
543 {
64f2600a
PH
544 const uschar *p;
545 for (p = ptr+2; *p != 0 && *p != '}'; p++)
546 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 if (*p != 0 && *p != '}')
548 {
549 c = -ESC_k;
550 break;
551 }
6bf342e1
PH
552 braced = TRUE;
553 ptr++;
554 }
555 else braced = FALSE;
556
557 if (ptr[1] == '-')
558 {
559 negated = TRUE;
560 ptr++;
561 }
562 else negated = FALSE;
563
564 c = 0;
565 while ((digitab[ptr[1]] & ctype_digit) != 0)
566 c = c * 10 + *(++ptr) - '0';
567
47db1125
NM
568 if (c < 0)
569 {
570 *errorcodeptr = ERR61;
571 break;
572 }
573
6bf342e1
PH
574 if (c == 0 || (braced && *(++ptr) != '}'))
575 {
576 *errorcodeptr = ERR57;
47db1125 577 break;
6bf342e1
PH
578 }
579
580 if (negated)
581 {
582 if (c > bracount)
583 {
584 *errorcodeptr = ERR15;
47db1125 585 break;
6bf342e1
PH
586 }
587 c = bracount - (c - 1);
588 }
589
590 c = -(ESC_REF + c);
591 break;
592
8ac170f3
PH
593 /* The handling of escape sequences consisting of a string of digits
594 starting with one that is not zero is not straightforward. By experiment,
595 the way Perl works seems to be as follows:
596
597 Outside a character class, the digits are read as a decimal number. If the
598 number is less than 10, or if there are that many previous extracting
599 left brackets, then it is a back reference. Otherwise, up to three octal
600 digits are read to form an escaped byte. Thus \123 is likely to be octal
601 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602 value is greater than 377, the least significant 8 bits are taken. Inside a
603 character class, \ followed by a digit is always an octal number. */
604
605 case '1': case '2': case '3': case '4': case '5':
606 case '6': case '7': case '8': case '9':
607
608 if (!isclass)
609 {
610 oldptr = ptr;
611 c -= '0';
612 while ((digitab[ptr[1]] & ctype_digit) != 0)
613 c = c * 10 + *(++ptr) - '0';
47db1125
NM
614 if (c < 0)
615 {
616 *errorcodeptr = ERR61;
617 break;
618 }
8ac170f3
PH
619 if (c < 10 || c <= bracount)
620 {
621 c = -(ESC_REF + c);
622 break;
623 }
624 ptr = oldptr; /* Put the pointer back and fall through */
625 }
626
627 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628 generates a binary zero byte and treats the digit as a following literal.
629 Thus we have to pull back the pointer by one. */
630
631 if ((c = *ptr) >= '8')
632 {
633 ptr--;
634 c = 0;
635 break;
636 }
637
638 /* \0 always starts an octal number, but we may drop through to here with a
aa41d2de
PH
639 larger first octal digit. The original code used just to take the least
640 significant 8 bits of octal numbers (I think this is what early Perls used
641 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642 than 3 octal digits. */
8ac170f3
PH
643
644 case '0':
645 c -= '0';
646 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647 c = c * 8 + *(++ptr) - '0';
aa41d2de 648 if (!utf8 && c > 255) *errorcodeptr = ERR51;
8ac170f3
PH
649 break;
650
aa41d2de
PH
651 /* \x is complicated. \x{ddd} is a character number which can be greater
652 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653 treated as a data character. */
8ac170f3
PH
654
655 case 'x':
aa41d2de 656 if (ptr[1] == '{')
8ac170f3
PH
657 {
658 const uschar *pt = ptr + 2;
aa41d2de
PH
659 int count = 0;
660
8ac170f3
PH
661 c = 0;
662 while ((digitab[*pt] & ctype_xdigit) != 0)
663 {
aa41d2de
PH
664 register int cc = *pt++;
665 if (c == 0 && cc == '0') continue; /* Leading zeroes */
8ac170f3 666 count++;
aa41d2de 667
64f2600a 668#ifndef EBCDIC /* ASCII coding */
8ac170f3 669 if (cc >= 'a') cc -= 32; /* Convert to upper case */
aa41d2de 670 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
64f2600a 671#else /* EBCDIC coding */
8ac170f3 672 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
aa41d2de 673 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
8ac170f3
PH
674#endif
675 }
aa41d2de 676
8ac170f3
PH
677 if (*pt == '}')
678 {
aa41d2de 679 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
8ac170f3
PH
680 ptr = pt;
681 break;
682 }
aa41d2de 683
8ac170f3
PH
684 /* If the sequence of hex digits does not end with '}', then we don't
685 recognize this construct; fall through to the normal \x handling. */
686 }
8ac170f3 687
aa41d2de 688 /* Read just a single-byte hex-defined char */
8ac170f3
PH
689
690 c = 0;
691 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692 {
693 int cc; /* Some compilers don't like ++ */
694 cc = *(++ptr); /* in initializers */
64f2600a 695#ifndef EBCDIC /* ASCII coding */
8ac170f3
PH
696 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
64f2600a 698#else /* EBCDIC coding */
8ac170f3
PH
699 if (cc <= 'z') cc += 64; /* Convert to upper case */
700 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701#endif
702 }
703 break;
704
6bf342e1
PH
705 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706 This coding is ASCII-specific, but then the whole concept of \cx is
707 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
8ac170f3
PH
708
709 case 'c':
710 c = *(++ptr);
711 if (c == 0)
712 {
713 *errorcodeptr = ERR2;
47db1125 714 break;
8ac170f3
PH
715 }
716
64f2600a 717#ifndef EBCDIC /* ASCII coding */
8ac170f3
PH
718 if (c >= 'a' && c <= 'z') c -= 32;
719 c ^= 0x40;
64f2600a 720#else /* EBCDIC coding */
8ac170f3
PH
721 if (c >= 'a' && c <= 'z') c += 64;
722 c ^= 0xC0;
723#endif
724 break;
725
726 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
728 for Perl compatibility, it is a literal. This code looks a bit odd, but
729 there used to be some cases other than the default, and there may be again
730 in future, so I haven't "optimized" it. */
731
732 default:
733 if ((options & PCRE_EXTRA) != 0) switch(c)
734 {
735 default:
736 *errorcodeptr = ERR3;
737 break;
738 }
739 break;
740 }
741 }
742
743*ptrptr = ptr;
744return c;
745}
746
747
748
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either the single character that follows, or a name in
braces (optionally preceded by ^ for negation) of at most sizeof(name)-1
characters. It is looked up in the _pcre_utt table by binary chop.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
                 (ERR46 is set for a malformed sequence, ERR47 for an unknown
                 property name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Set the negation flag first so that it has a defined value on every return
path, including the error returns. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }
  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c !='}') goto ERROR_RETURN;     /* Name too long for the buffer */
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

/* The name was not found; *ptrptr has already been updated above. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
838
839
840
841
842/*************************************************
843* Check for counted repeat *
844*************************************************/
845
846/* This function is called when a '{' is encountered in a place where it might
847start a quantifier. It looks ahead to see if it really is a quantifier or not.
848It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849where the ddds are digits.
850
851Arguments:
852 p pointer to the first char after '{'
853
854Returns: TRUE or FALSE
855*/
856
857static BOOL
858is_counted_repeat(const uschar *p)
859{
860if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861while ((digitab[*p] & ctype_digit) != 0) p++;
862if (*p == '}') return TRUE;
863
864if (*p++ != ',') return FALSE;
865if (*p == '}') return TRUE;
866
867if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868while ((digitab[*p] & ctype_digit) != 0) p++;
869
870return (*p == '}');
871}
872
873
874
875/*************************************************
876* Read repeat counts *
877*************************************************/
878
879/* Read an item of the form {n,m} and return the values. This is called only
880after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881so the syntax is guaranteed to be correct, but we need to check the values.
882
883Arguments:
884 p pointer to first char after '{'
885 minp pointer to int for min
886 maxp pointer to int for max
887 returned as -1 if no max
888 errorcodeptr points to error code variable
889
890Returns: pointer to '}' on success;
891 current ptr on error, with errorcodeptr set non-zero
892*/
893
894static const uschar *
895read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896{
897int min = 0;
898int max = -1;
899
92e772ff
PH
900/* Read the minimum value and do a paranoid check: a negative value indicates
901an integer overflow. */
902
8ac170f3 903while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
92e772ff
PH
904if (min < 0 || min > 65535)
905 {
906 *errorcodeptr = ERR5;
907 return p;
908 }
909
910/* Read the maximum value if there is one, and again do a paranoid on its size.
911Also, max must not be less than min. */
8ac170f3
PH
912
913if (*p == '}') max = min; else
914 {
915 if (*(++p) != '}')
916 {
917 max = 0;
918 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
92e772ff
PH
919 if (max < 0 || max > 65535)
920 {
921 *errorcodeptr = ERR5;
922 return p;
923 }
8ac170f3
PH
924 if (max < min)
925 {
926 *errorcodeptr = ERR4;
927 return p;
928 }
929 }
930 }
931
92e772ff
PH
932/* Fill in the required variables, and pass back the pointer to the terminating
933'}'. */
8ac170f3 934
92e772ff
PH
935*minp = min;
936*maxp = max;
8ac170f3
PH
937return p;
938}
939
940
941
aa41d2de 942/*************************************************
6bf342e1 943* Find forward referenced subpattern *
aa41d2de
PH
944*************************************************/
945
6bf342e1
PH
946/* This function scans along a pattern's text looking for capturing
947subpatterns, and counting them. If it finds a named pattern that matches the
948name it is given, it returns its number. Alternatively, if the name is NULL, it
949returns when it reaches a given numbered subpattern. This is used for forward
950references to subpatterns. We know that if (?P< is encountered, the name will
951be terminated by '>' because that is checked in the first pass.
aa41d2de
PH
952
953Arguments:
6bf342e1
PH
954 ptr current position in the pattern
955 count current count of capturing parens so far encountered
956 name name to seek, or NULL if seeking a numbered subpattern
957 lorn name length, or subpattern number if name is NULL
958 xmode TRUE if we are in /x mode
aa41d2de
PH
959
960Returns: the number of the named subpattern, or -1 if not found
961*/
962
963static int
6bf342e1
PH
964find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965 BOOL xmode)
aa41d2de
PH
966{
967const uschar *thisname;
6bf342e1 968
aa41d2de
PH
969for (; *ptr != 0; ptr++)
970 {
6bf342e1
PH
971 int term;
972
973 /* Skip over backslashed characters and also entire \Q...\E */
974
975 if (*ptr == '\\')
976 {
977 if (*(++ptr) == 0) return -1;
978 if (*ptr == 'Q') for (;;)
979 {
980 while (*(++ptr) != 0 && *ptr != '\\');
981 if (*ptr == 0) return -1;
982 if (*(++ptr) == 'E') break;
983 }
984 continue;
985 }
986
987 /* Skip over character classes */
988
989 if (*ptr == '[')
990 {
991 while (*(++ptr) != ']')
992 {
47db1125 993 if (*ptr == 0) return -1;
6bf342e1
PH
994 if (*ptr == '\\')
995 {
996 if (*(++ptr) == 0) return -1;
997 if (*ptr == 'Q') for (;;)
998 {
999 while (*(++ptr) != 0 && *ptr != '\\');
1000 if (*ptr == 0) return -1;
1001 if (*(++ptr) == 'E') break;
1002 }
1003 continue;
1004 }
1005 }
1006 continue;
1007 }
1008
1009 /* Skip comments in /x mode */
1010
1011 if (xmode && *ptr == '#')
1012 {
1013 while (*(++ptr) != 0 && *ptr != '\n');
1014 if (*ptr == 0) return -1;
1015 continue;
1016 }
1017
1018 /* An opening parens must now be a real metacharacter */
1019
aa41d2de 1020 if (*ptr != '(') continue;
47db1125 1021 if (ptr[1] != '?' && ptr[1] != '*')
6bf342e1
PH
1022 {
1023 count++;
1024 if (name == NULL && count == lorn) return count;
1025 continue;
1026 }
1027
1028 ptr += 2;
1029 if (*ptr == 'P') ptr++; /* Allow optional P */
1030
1031 /* We have to disambiguate (?<! and (?<= from (?<name> */
1032
1033 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034 *ptr != '\'')
1035 continue;
1036
aa41d2de 1037 count++;
6bf342e1
PH
1038
1039 if (name == NULL && count == lorn) return count;
1040 term = *ptr++;
1041 if (term == '<') term = '>';
aa41d2de 1042 thisname = ptr;
6bf342e1
PH
1043 while (*ptr != term) ptr++;
1044 if (name != NULL && lorn == ptr - thisname &&
1045 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
aa41d2de
PH
1046 return count;
1047 }
6bf342e1 1048
aa41d2de
PH
1049return -1;
1050}
1051
1052
1053
8ac170f3
PH
1054/*************************************************
1055* Find first significant op code *
1056*************************************************/
1057
1058/* This is called by several functions that scan a compiled expression looking
1059for a fixed first character, or an anchoring op code etc. It skips over things
1060that do not influence this. For some calls, a change of option is important.
1061For some calls, it makes sense to skip negative forward and all backward
1062assertions, and also the \b assertion; for others it does not.
1063
1064Arguments:
1065 code pointer to the start of the group
1066 options pointer to external options
1067 optbit the option bit whose changing is significant, or
1068 zero if none are
1069 skipassert TRUE if certain assertions are to be skipped
1070
1071Returns: pointer to the first significant opcode
1072*/
1073
1074static const uschar*
1075first_significant_code(const uschar *code, int *options, int optbit,
1076 BOOL skipassert)
1077{
1078for (;;)
1079 {
1080 switch ((int)*code)
1081 {
1082 case OP_OPT:
1083 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084 *options = (int)code[1];
1085 code += 2;
1086 break;
1087
1088 case OP_ASSERT_NOT:
1089 case OP_ASSERTBACK:
1090 case OP_ASSERTBACK_NOT:
1091 if (!skipassert) return code;
1092 do code += GET(code, 1); while (*code == OP_ALT);
1093 code += _pcre_OP_lengths[*code];
1094 break;
1095
1096 case OP_WORD_BOUNDARY:
1097 case OP_NOT_WORD_BOUNDARY:
1098 if (!skipassert) return code;
1099 /* Fall through */
1100
1101 case OP_CALLOUT:
1102 case OP_CREF:
6bf342e1
PH
1103 case OP_RREF:
1104 case OP_DEF:
8ac170f3
PH
1105 code += _pcre_OP_lengths[*code];
1106 break;
1107
1108 default:
1109 return code;
1110 }
1111 }
1112/* Control never reaches here */
1113}
1114
1115
1116
1117
1118/*************************************************
1119* Find the fixed length of a pattern *
1120*************************************************/
1121
1122/* Scan a pattern and compute the fixed length of subject that will match it,
1123if the length is fixed. This is needed for dealing with backward assertions.
1124In UTF8 mode, the result is in characters rather than bytes.
1125
1126Arguments:
1127 code points to the start of the pattern (the bracket)
1128 options the compiling options
1129
1130Returns: the fixed length, or -1 if there is no fixed length,
1131 or -2 if \C was encountered
1132*/
1133
1134static int
1135find_fixedlength(uschar *code, int options)
1136{
1137int length = -1;
1138
1139register int branchlength = 0;
1140register uschar *cc = code + 1 + LINK_SIZE;
1141
1142/* Scan along the opcodes for this branch. If we get to the end of the
1143branch, check the length against that of the other branches. */
1144
1145for (;;)
1146 {
1147 int d;
1148 register int op = *cc;
8ac170f3
PH
1149 switch (op)
1150 {
6bf342e1 1151 case OP_CBRA:
8ac170f3
PH
1152 case OP_BRA:
1153 case OP_ONCE:
1154 case OP_COND:
6bf342e1 1155 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
8ac170f3
PH
1156 if (d < 0) return d;
1157 branchlength += d;
1158 do cc += GET(cc, 1); while (*cc == OP_ALT);
1159 cc += 1 + LINK_SIZE;
1160 break;
1161
1162 /* Reached end of a branch; if it's a ket it is the end of a nested
1163 call. If it's ALT it is an alternation in a nested call. If it is
1164 END it's the end of the outer call. All can be handled by the same code. */
1165
1166 case OP_ALT:
1167 case OP_KET:
1168 case OP_KETRMAX:
1169 case OP_KETRMIN:
1170 case OP_END:
1171 if (length < 0) length = branchlength;
1172 else if (length != branchlength) return -1;
1173 if (*cc != OP_ALT) return length;
1174 cc += 1 + LINK_SIZE;
1175 branchlength = 0;
1176 break;
1177
1178 /* Skip over assertive subpatterns */
1179
1180 case OP_ASSERT:
1181 case OP_ASSERT_NOT:
1182 case OP_ASSERTBACK:
1183 case OP_ASSERTBACK_NOT:
1184 do cc += GET(cc, 1); while (*cc == OP_ALT);
1185 /* Fall through */
1186
1187 /* Skip over things that don't match chars */
1188
1189 case OP_REVERSE:
8ac170f3 1190 case OP_CREF:
6bf342e1
PH
1191 case OP_RREF:
1192 case OP_DEF:
8ac170f3
PH
1193 case OP_OPT:
1194 case OP_CALLOUT:
1195 case OP_SOD:
1196 case OP_SOM:
1197 case OP_EOD:
1198 case OP_EODN:
1199 case OP_CIRC:
1200 case OP_DOLL:
1201 case OP_NOT_WORD_BOUNDARY:
1202 case OP_WORD_BOUNDARY:
1203 cc += _pcre_OP_lengths[*cc];
1204 break;
1205
1206 /* Handle literal characters */
1207
1208 case OP_CHAR:
1209 case OP_CHARNC:
aa41d2de 1210 case OP_NOT:
8ac170f3
PH
1211 branchlength++;
1212 cc += 2;
1213#ifdef SUPPORT_UTF8
1214 if ((options & PCRE_UTF8) != 0)
1215 {
1216 while ((*cc & 0xc0) == 0x80) cc++;
1217 }
1218#endif
1219 break;
1220
1221 /* Handle exact repetitions. The count is already in characters, but we
1222 need to skip over a multibyte character in UTF8 mode. */
1223
1224 case OP_EXACT:
1225 branchlength += GET2(cc,1);
1226 cc += 4;
1227#ifdef SUPPORT_UTF8
1228 if ((options & PCRE_UTF8) != 0)
1229 {
1230 while((*cc & 0x80) == 0x80) cc++;
1231 }
1232#endif
1233 break;
1234
1235 case OP_TYPEEXACT:
1236 branchlength += GET2(cc,1);
47db1125 1237 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
8ac170f3
PH
1238 cc += 4;
1239 break;
1240
1241 /* Handle single-char matchers */
1242
1243 case OP_PROP:
1244 case OP_NOTPROP:
aa41d2de 1245 cc += 2;
8ac170f3
PH
1246 /* Fall through */
1247
1248 case OP_NOT_DIGIT:
1249 case OP_DIGIT:
1250 case OP_NOT_WHITESPACE:
1251 case OP_WHITESPACE:
1252 case OP_NOT_WORDCHAR:
1253 case OP_WORDCHAR:
1254 case OP_ANY:
1255 branchlength++;
1256 cc++;
1257 break;
1258
1259 /* The single-byte matcher isn't allowed */
1260
1261 case OP_ANYBYTE:
1262 return -2;
1263
1264 /* Check a class for variable quantification */
1265
1266#ifdef SUPPORT_UTF8
1267 case OP_XCLASS:
1268 cc += GET(cc, 1) - 33;
1269 /* Fall through */
1270#endif
1271
1272 case OP_CLASS:
1273 case OP_NCLASS:
1274 cc += 33;
1275
1276 switch (*cc)
1277 {
1278 case OP_CRSTAR:
1279 case OP_CRMINSTAR:
1280 case OP_CRQUERY:
1281 case OP_CRMINQUERY:
1282 return -1;
1283
1284 case OP_CRRANGE:
1285 case OP_CRMINRANGE:
1286 if (GET2(cc,1) != GET2(cc,3)) return -1;
1287 branchlength += GET2(cc,1);
1288 cc += 5;
1289 break;
1290
1291 default:
1292 branchlength++;
1293 }
1294 break;
1295
1296 /* Anything else is variable length */
1297
1298 default:
1299 return -1;
1300 }
1301 }
1302/* Control never gets here */
1303}
1304
1305
1306
1307
1308/*************************************************
1309* Scan compiled regex for numbered bracket *
1310*************************************************/
1311
1312/* This little function scans through a compiled pattern until it finds a
1313capturing bracket with the given number.
1314
1315Arguments:
1316 code points to start of expression
1317 utf8 TRUE in UTF-8 mode
1318 number the required bracket number
1319
1320Returns: pointer to the opcode for the bracket, or NULL if not found
1321*/
1322
1323static const uschar *
1324find_bracket(const uschar *code, BOOL utf8, int number)
1325{
8ac170f3
PH
1326for (;;)
1327 {
1328 register int c = *code;
1329 if (c == OP_END) return NULL;
aa41d2de
PH
1330
1331 /* XCLASS is used for classes that cannot be represented just by a bit
1332 map. This includes negated single high-valued characters. The length in
1333 the table is zero; the actual length is stored in the compiled code. */
1334
1335 if (c == OP_XCLASS) code += GET(code, 1);
1336
6bf342e1 1337 /* Handle capturing bracket */
aa41d2de 1338
6bf342e1 1339 else if (c == OP_CBRA)
8ac170f3 1340 {
6bf342e1 1341 int n = GET2(code, 1+LINK_SIZE);
8ac170f3 1342 if (n == number) return (uschar *)code;
6bf342e1 1343 code += _pcre_OP_lengths[c];
8ac170f3 1344 }
aa41d2de 1345
47db1125
NM
1346 /* Otherwise, we can get the item's length from the table, except that for
1347 repeated character types, we have to test for \p and \P, which have an extra
1348 two bytes of parameters. */
aa41d2de 1349
8ac170f3
PH
1350 else
1351 {
47db1125
NM
1352 switch(c)
1353 {
1354 case OP_TYPESTAR:
1355 case OP_TYPEMINSTAR:
1356 case OP_TYPEPLUS:
1357 case OP_TYPEMINPLUS:
1358 case OP_TYPEQUERY:
1359 case OP_TYPEMINQUERY:
1360 case OP_TYPEPOSSTAR:
1361 case OP_TYPEPOSPLUS:
1362 case OP_TYPEPOSQUERY:
1363 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364 break;
1365
1366 case OP_TYPEUPTO:
1367 case OP_TYPEMINUPTO:
1368 case OP_TYPEEXACT:
1369 case OP_TYPEPOSUPTO:
1370 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371 break;
1372 }
1373
1374 /* Add in the fixed length from the table */
1375
8ac170f3 1376 code += _pcre_OP_lengths[c];
47db1125
NM
1377
1378 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379 a multi-byte character. The length in the table is a minimum, so we have to
1380 arrange to skip the extra bytes. */
1381
64f2600a 1382#ifdef SUPPORT_UTF8
8ac170f3
PH
1383 if (utf8) switch(c)
1384 {
1385 case OP_CHAR:
1386 case OP_CHARNC:
1387 case OP_EXACT:
1388 case OP_UPTO:
1389 case OP_MINUPTO:
6bf342e1 1390 case OP_POSUPTO:
8ac170f3
PH
1391 case OP_STAR:
1392 case OP_MINSTAR:
6bf342e1 1393 case OP_POSSTAR:
8ac170f3
PH
1394 case OP_PLUS:
1395 case OP_MINPLUS:
6bf342e1 1396 case OP_POSPLUS:
8ac170f3
PH
1397 case OP_QUERY:
1398 case OP_MINQUERY:
6bf342e1
PH
1399 case OP_POSQUERY:
1400 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
8ac170f3 1401 break;
8ac170f3 1402 }
64f2600a 1403#endif
8ac170f3
PH
1404 }
1405 }
1406}
1407
1408
1409
1410/*************************************************
1411* Scan compiled regex for recursion reference *
1412*************************************************/
1413
1414/* This little function scans through a compiled pattern until it finds an
1415instance of OP_RECURSE.
1416
1417Arguments:
1418 code points to start of expression
1419 utf8 TRUE in UTF-8 mode
1420
1421Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1422*/
1423
1424static const uschar *
1425find_recurse(const uschar *code, BOOL utf8)
1426{
8ac170f3
PH
1427for (;;)
1428 {
1429 register int c = *code;
1430 if (c == OP_END) return NULL;
aa41d2de
PH
1431 if (c == OP_RECURSE) return code;
1432
1433 /* XCLASS is used for classes that cannot be represented just by a bit
1434 map. This includes negated single high-valued characters. The length in
1435 the table is zero; the actual length is stored in the compiled code. */
1436
1437 if (c == OP_XCLASS) code += GET(code, 1);
1438
47db1125
NM
1439 /* Otherwise, we can get the item's length from the table, except that for
1440 repeated character types, we have to test for \p and \P, which have an extra
1441 two bytes of parameters. */
aa41d2de 1442
8ac170f3
PH
1443 else
1444 {
47db1125
NM
1445 switch(c)
1446 {
1447 case OP_TYPESTAR:
1448 case OP_TYPEMINSTAR:
1449 case OP_TYPEPLUS:
1450 case OP_TYPEMINPLUS:
1451 case OP_TYPEQUERY:
1452 case OP_TYPEMINQUERY:
1453 case OP_TYPEPOSSTAR:
1454 case OP_TYPEPOSPLUS:
1455 case OP_TYPEPOSQUERY:
1456 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457 break;
1458
1459 case OP_TYPEPOSUPTO:
1460 case OP_TYPEUPTO:
1461 case OP_TYPEMINUPTO:
1462 case OP_TYPEEXACT:
1463 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464 break;
1465 }
1466
1467 /* Add in the fixed length from the table */
1468
8ac170f3 1469 code += _pcre_OP_lengths[c];
47db1125
NM
1470
1471 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472 by a multi-byte character. The length in the table is a minimum, so we have
1473 to arrange to skip the extra bytes. */
1474
64f2600a 1475#ifdef SUPPORT_UTF8
8ac170f3
PH
1476 if (utf8) switch(c)
1477 {
1478 case OP_CHAR:
1479 case OP_CHARNC:
1480 case OP_EXACT:
1481 case OP_UPTO:
1482 case OP_MINUPTO:
6bf342e1 1483 case OP_POSUPTO:
8ac170f3
PH
1484 case OP_STAR:
1485 case OP_MINSTAR:
6bf342e1 1486 case OP_POSSTAR:
8ac170f3
PH
1487 case OP_PLUS:
1488 case OP_MINPLUS:
6bf342e1 1489 case OP_POSPLUS:
8ac170f3
PH
1490 case OP_QUERY:
1491 case OP_MINQUERY:
6bf342e1
PH
1492 case OP_POSQUERY:
1493 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
8ac170f3 1494 break;
8ac170f3 1495 }
64f2600a 1496#endif
8ac170f3
PH
1497 }
1498 }
1499}
1500
1501
1502
1503/*************************************************
1504* Scan compiled branch for non-emptiness *
1505*************************************************/
1506
1507/* This function scans through a branch of a compiled pattern to see whether it
6bf342e1
PH
1508can match the empty string or not. It is called from could_be_empty()
1509below and from compile_branch() when checking for an unlimited repeat of a
1510group that can match nothing. Note that first_significant_code() skips over
1511assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1512struck an inner bracket whose current branch will already have been scanned.
8ac170f3
PH
1513
1514Arguments:
1515 code points to start of search
1516 endcode points to where to stop
1517 utf8 TRUE if in UTF8 mode
1518
1519Returns: TRUE if what is matched could be empty
1520*/
1521
1522static BOOL
1523could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1524{
1525register int c;
6bf342e1 1526for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
8ac170f3
PH
1527 code < endcode;
1528 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1529 {
1530 const uschar *ccode;
1531
1532 c = *code;
1533
64f2600a
PH
1534 /* Groups with zero repeats can of course be empty; skip them. */
1535
1536 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1537 {
1538 code += _pcre_OP_lengths[c];
1539 do code += GET(code, 1); while (*code == OP_ALT);
1540 c = *code;
1541 continue;
1542 }
1543
1544 /* For other groups, scan the branches. */
1545
47db1125 1546 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
8ac170f3
PH
1547 {
1548 BOOL empty_branch;
1549 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1550
1551 /* Scan a closed bracket */
1552
1553 empty_branch = FALSE;
1554 do
1555 {
1556 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1557 empty_branch = TRUE;
1558 code += GET(code, 1);
1559 }
1560 while (*code == OP_ALT);
1561 if (!empty_branch) return FALSE; /* All branches are non-empty */
64f2600a 1562 c = *code;
6bf342e1 1563 continue;
8ac170f3
PH
1564 }
1565
6bf342e1
PH
1566 /* Handle the other opcodes */
1567
1568 switch (c)
8ac170f3 1569 {
47db1125
NM
1570 /* Check for quantifiers after a class. XCLASS is used for classes that
1571 cannot be represented just by a bit map. This includes negated single
1572 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1573 actual length is stored in the compiled code, so we must update "code"
1574 here. */
8ac170f3
PH
1575
1576#ifdef SUPPORT_UTF8
1577 case OP_XCLASS:
47db1125 1578 ccode = code += GET(code, 1);
8ac170f3
PH
1579 goto CHECK_CLASS_REPEAT;
1580#endif
1581
1582 case OP_CLASS:
1583 case OP_NCLASS:
1584 ccode = code + 33;
1585
1586#ifdef SUPPORT_UTF8
1587 CHECK_CLASS_REPEAT:
1588#endif
1589
1590 switch (*ccode)
1591 {
1592 case OP_CRSTAR: /* These could be empty; continue */
1593 case OP_CRMINSTAR:
1594 case OP_CRQUERY:
1595 case OP_CRMINQUERY:
1596 break;
1597
1598 default: /* Non-repeat => class must match */
1599 case OP_CRPLUS: /* These repeats aren't empty */
1600 case OP_CRMINPLUS:
1601 return FALSE;
1602
1603 case OP_CRRANGE:
1604 case OP_CRMINRANGE:
1605 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1606 break;
1607 }
1608 break;
1609
1610 /* Opcodes that must match a character */
1611
1612 case OP_PROP:
1613 case OP_NOTPROP:
1614 case OP_EXTUNI:
1615 case OP_NOT_DIGIT:
1616 case OP_DIGIT:
1617 case OP_NOT_WHITESPACE:
1618 case OP_WHITESPACE:
1619 case OP_NOT_WORDCHAR:
1620 case OP_WORDCHAR:
1621 case OP_ANY:
1622 case OP_ANYBYTE:
1623 case OP_CHAR:
1624 case OP_CHARNC:
1625 case OP_NOT:
1626 case OP_PLUS:
1627 case OP_MINPLUS:
6bf342e1 1628 case OP_POSPLUS:
8ac170f3
PH
1629 case OP_EXACT:
1630 case OP_NOTPLUS:
1631 case OP_NOTMINPLUS:
6bf342e1 1632 case OP_NOTPOSPLUS:
8ac170f3
PH
1633 case OP_NOTEXACT:
1634 case OP_TYPEPLUS:
1635 case OP_TYPEMINPLUS:
6bf342e1 1636 case OP_TYPEPOSPLUS:
8ac170f3
PH
1637 case OP_TYPEEXACT:
1638 return FALSE;
1639
47db1125
NM
1640 /* These are going to continue, as they may be empty, but we have to
1641 fudge the length for the \p and \P cases. */
1642
1643 case OP_TYPESTAR:
1644 case OP_TYPEMINSTAR:
1645 case OP_TYPEPOSSTAR:
1646 case OP_TYPEQUERY:
1647 case OP_TYPEMINQUERY:
1648 case OP_TYPEPOSQUERY:
1649 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1650 break;
1651
1652 /* Same for these */
1653
1654 case OP_TYPEUPTO:
1655 case OP_TYPEMINUPTO:
1656 case OP_TYPEPOSUPTO:
1657 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1658 break;
1659
8ac170f3
PH
1660 /* End of branch */
1661
1662 case OP_KET:
1663 case OP_KETRMAX:
1664 case OP_KETRMIN:
1665 case OP_ALT:
1666 return TRUE;
1667
6bf342e1
PH
1668 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1669 MINUPTO, and POSUPTO may be followed by a multibyte character */
8ac170f3
PH
1670
1671#ifdef SUPPORT_UTF8
1672 case OP_STAR:
1673 case OP_MINSTAR:
6bf342e1 1674 case OP_POSSTAR:
8ac170f3
PH
1675 case OP_QUERY:
1676 case OP_MINQUERY:
6bf342e1 1677 case OP_POSQUERY:
8ac170f3
PH
1678 case OP_UPTO:
1679 case OP_MINUPTO:
6bf342e1 1680 case OP_POSUPTO:
8ac170f3
PH
1681 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1682 break;
1683#endif
1684 }
1685 }
1686
1687return TRUE;
1688}
1689
1690
1691
1692/*************************************************
1693* Scan compiled regex for non-emptiness *
1694*************************************************/
1695
1696/* This function is called to check for left recursive calls. We want to check
1697the current branch of the current pattern to see if it could match the empty
1698string. If it could, we must look outwards for branches at other levels,
1699stopping when we pass beyond the bracket which is the subject of the recursion.
1700
1701Arguments:
1702 code points to start of the recursion
1703 endcode points to where to stop (current RECURSE item)
1704 bcptr points to the chain of current (unclosed) branch starts
1705 utf8 TRUE if in UTF-8 mode
1706
1707Returns: TRUE if what is matched could be empty
1708*/
1709
1710static BOOL
1711could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1712 BOOL utf8)
1713{
1714while (bcptr != NULL && bcptr->current >= code)
1715 {
1716 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1717 bcptr = bcptr->outer;
1718 }
1719return TRUE;
1720}
1721
1722
1723
1724/*************************************************
1725* Check for POSIX class syntax *
1726*************************************************/
1727
1728/* This function is called when the sequence "[:" or "[." or "[=" is
1729encountered in a character class. It checks whether this is followed by an
1730optional ^ and then a sequence of letters, terminated by a matching ":]" or
1731".]" or "=]".
1732
1733Argument:
1734 ptr pointer to the initial [
1735 endptr where to return the end pointer
1736 cd pointer to compile data
1737
1738Returns: TRUE or FALSE
1739*/
1740
1741static BOOL
1742check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1743{
1744int terminator; /* Don't combine these lines; the Solaris cc */
1745terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1746if (*(++ptr) == '^') ptr++;
1747while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1748if (*ptr == terminator && ptr[1] == ']')
1749 {
1750 *endptr = ptr;
1751 return TRUE;
1752 }
1753return FALSE;
1754}
1755
1756
1757
1758
1759/*************************************************
1760* Check POSIX class name *
1761*************************************************/
1762
1763/* This function is called to check the name given in a POSIX-style class entry
1764such as [:alnum:].
1765
1766Arguments:
1767 ptr points to the first letter
1768 len the length of the name
1769
1770Returns: a value representing the name, or -1 if unknown
1771*/
1772
1773static int
1774check_posix_name(const uschar *ptr, int len)
1775{
47db1125 1776const char *pn = posix_names;
8ac170f3
PH
1777register int yield = 0;
1778while (posix_name_lengths[yield] != 0)
1779 {
1780 if (len == posix_name_lengths[yield] &&
47db1125
NM
1781 strncmp((const char *)ptr, pn, len) == 0) return yield;
1782 pn += posix_name_lengths[yield] + 1;
8ac170f3
PH
1783 yield++;
1784 }
1785return -1;
1786}
1787
1788
1789/*************************************************
1790* Adjust OP_RECURSE items in repeated group *
1791*************************************************/
1792
1793/* OP_RECURSE items contain an offset from the start of the regex to the group
1794that is referenced. This means that groups can be replicated for fixed
1795repetition simply by copying (because the recursion is allowed to refer to
1796earlier groups that are outside the current group). However, when a group is
1797optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1798it, after it has been compiled. This means that any OP_RECURSE items within it
1799that refer to the group itself or any contained groups have to have their
6bf342e1
PH
1800offsets adjusted. That one of the jobs of this function. Before it is called,
1801the partially compiled regex must be temporarily terminated with OP_END.
1802
1803This function has been extended with the possibility of forward references for
1804recursions and subroutine calls. It must also check the list of such references
1805for the group we are dealing with. If it finds that one of the recursions in
1806the current group is on this list, it adjusts the offset in the list, not the
1807value in the reference (which is a group number).
8ac170f3
PH
1808
1809Arguments:
1810 group points to the start of the group
1811 adjust the amount by which the group is to be moved
1812 utf8 TRUE in UTF-8 mode
1813 cd contains pointers to tables etc.
6bf342e1 1814 save_hwm the hwm forward reference pointer at the start of the group
8ac170f3
PH
1815
1816Returns: nothing
1817*/
1818
1819static void
6bf342e1
PH
1820adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1821 uschar *save_hwm)
8ac170f3
PH
1822{
1823uschar *ptr = group;
47db1125 1824
8ac170f3
PH
1825while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1826 {
6bf342e1
PH
1827 int offset;
1828 uschar *hc;
1829
1830 /* See if this recursion is on the forward reference list. If so, adjust the
1831 reference. */
1832
1833 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1834 {
1835 offset = GET(hc, 0);
1836 if (cd->start_code + offset == ptr + 1)
1837 {
1838 PUT(hc, 0, offset + adjust);
1839 break;
1840 }
1841 }
1842
1843 /* Otherwise, adjust the recursion offset if it's after the start of this
1844 group. */
1845
1846 if (hc >= cd->hwm)
1847 {
1848 offset = GET(ptr, 1);
1849 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1850 }
1851
8ac170f3
PH
1852 ptr += 1 + LINK_SIZE;
1853 }
1854}
1855
1856
1857
1858/*************************************************
1859* Insert an automatic callout point *
1860*************************************************/
1861
1862/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1863callout points before each pattern item.
1864
1865Arguments:
1866 code current code pointer
1867 ptr current pattern pointer
1868 cd pointers to tables etc
1869
1870Returns: new code pointer
1871*/
1872
1873static uschar *
1874auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1875{
1876*code++ = OP_CALLOUT;
1877*code++ = 255;
1878PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1879PUT(code, LINK_SIZE, 0); /* Default length */
1880return code + 2*LINK_SIZE;
1881}
1882
1883
1884
1885/*************************************************
1886* Complete a callout item *
1887*************************************************/
1888
1889/* A callout item contains the length of the next item in the pattern, which
1890we can't fill in till after we have reached the relevant point. This is used
1891for both automatic and manual callouts.
1892
1893Arguments:
1894 previous_callout points to previous callout item
1895 ptr current pattern pointer
1896 cd pointers to tables etc
1897
1898Returns: nothing
1899*/
1900
1901static void
1902complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1903{
1904int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1905PUT(previous_callout, 2 + LINK_SIZE, length);
1906}
1907
1908
1909
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range, this function looks upwards from
the start for the next run of characters that have an "other" case, where the
other-case values of successive characters are themselves consecutive. Each
call hands back one such run and moves the start on, so that repeated calls
work through the whole range.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more (in which case
                nothing is written through the pointers)
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch, other, expect;

/* Find the first character that has an other case */

ch = *cptr;
while (ch <= d && (other = _pcre_ucp_othercase(ch)) == NOTACHAR) ch++;

if (ch > d) return FALSE;

*ocptr = other;
expect = other + 1;

/* Carry on for as long as the other-case values keep going up by one */

ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expect)
  {
  ch++;
  expect++;
  }

*odptr = expect - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1955
1956
6bf342e1
PH
1957
1958/*************************************************
1959* Check if auto-possessifying is possible *
1960*************************************************/
1961
1962/* This function is called for unlimited repeats of certain items, to see
1963whether the next thing could possibly match the repeated item. If not, it makes
1964sense to automatically possessify the repeated item.
1965
1966Arguments:
1967 op_code the repeated op code
1968 this data for this item, depends on the opcode
1969 utf8 TRUE in UTF-8 mode
1970 utf8_char used for utf8 character bytes, NULL if not relevant
1971 ptr next character in pattern
1972 options options bits
1973 cd contains pointers to tables etc.
1974
1975Returns: TRUE if possessifying is wanted
1976*/
1977
1978static BOOL
1979check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1980 const uschar *ptr, int options, compile_data *cd)
1981{
1982int next;
1983
1984/* Skip whitespace and comments in extended mode */
1985
1986if ((options & PCRE_EXTENDED) != 0)
1987 {
1988 for (;;)
1989 {
1990 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1991 if (*ptr == '#')
1992 {
1993 while (*(++ptr) != 0)
1994 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1995 }
1996 else break;
1997 }
1998 }
1999
2000/* If the next item is one that we can handle, get its value. A non-negative
2001value is a character, a negative value is an escape value. */
2002
2003if (*ptr == '\\')
2004 {
2005 int temperrorcode = 0;
2006 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2007 if (temperrorcode != 0) return FALSE;
2008 ptr++; /* Point after the escape sequence */
2009 }
2010
2011else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2012 {
2013#ifdef SUPPORT_UTF8
2014 if (utf8) { GETCHARINC(next, ptr); } else
2015#endif
2016 next = *ptr++;
2017 }
2018
2019else return FALSE;
2020
2021/* Skip whitespace and comments in extended mode */
2022
2023if ((options & PCRE_EXTENDED) != 0)
2024 {
2025 for (;;)
2026 {
2027 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2028 if (*ptr == '#')
2029 {
2030 while (*(++ptr) != 0)
2031 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2032 }
2033 else break;
2034 }
2035 }
2036
2037/* If the next thing is itself optional, we have to give up. */
2038
2039if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2040 return FALSE;
2041
2042/* Now compare the next item with the previous opcode. If the previous is a
2043positive single character match, "item" either contains the character or, if
2044"item" is greater than 127 in utf8 mode, the character's bytes are in
2045utf8_char. */
2046
2047
2048/* Handle cases when the next item is a character. */
2049
2050if (next >= 0) switch(op_code)
2051 {
2052 case OP_CHAR:
2053#ifdef SUPPORT_UTF8
2054 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2055#endif
2056 return item != next;
2057
2058 /* For CHARNC (caseless character) we must check the other case. If we have
2059 Unicode property support, we can use it to test the other case of
2060 high-valued characters. */
2061
2062 case OP_CHARNC:
2063#ifdef SUPPORT_UTF8
2064 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2065#endif
2066 if (item == next) return FALSE;
2067#ifdef SUPPORT_UTF8
2068 if (utf8)
2069 {
2070 unsigned int othercase;
2071 if (next < 128) othercase = cd->fcc[next]; else
2072#ifdef SUPPORT_UCP
2073 othercase = _pcre_ucp_othercase((unsigned int)next);
2074#else
2075 othercase = NOTACHAR;
2076#endif
2077 return (unsigned int)item != othercase;
2078 }
2079 else
2080#endif /* SUPPORT_UTF8 */
2081 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2082
2083 /* For OP_NOT, "item" must be a single-byte character. */
2084
2085 case OP_NOT:
2086 if (next < 0) return FALSE; /* Not a character */
2087 if (item == next) return TRUE;
2088 if ((options & PCRE_CASELESS) == 0) return FALSE;
2089#ifdef SUPPORT_UTF8
2090 if (utf8)
2091 {
2092 unsigned int othercase;
2093 if (next < 128) othercase = cd->fcc[next]; else
2094#ifdef SUPPORT_UCP
2095 othercase = _pcre_ucp_othercase(next);
2096#else
2097 othercase = NOTACHAR;
2098#endif
2099 return (unsigned int)item == othercase;
2100 }
2101 else
2102#endif /* SUPPORT_UTF8 */
2103 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2104
2105 case OP_DIGIT:
2106 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2107
2108 case OP_NOT_DIGIT:
2109 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2110
2111 case OP_WHITESPACE:
2112 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2113
2114 case OP_NOT_WHITESPACE:
2115 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2116
2117 case OP_WORDCHAR:
2118 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2119
2120 case OP_NOT_WORDCHAR:
2121 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2122
64f2600a
PH
2123 case OP_HSPACE:
2124 case OP_NOT_HSPACE:
2125 switch(next)
2126 {
2127 case 0x09:
2128 case 0x20:
2129 case 0xa0:
2130 case 0x1680:
2131 case 0x180e:
2132 case 0x2000:
2133 case 0x2001:
2134 case 0x2002:
2135 case 0x2003:
2136 case 0x2004:
2137 case 0x2005:
2138 case 0x2006:
2139 case 0x2007:
2140 case 0x2008:
2141 case 0x2009:
2142 case 0x200A:
2143 case 0x202f:
2144 case 0x205f:
2145 case 0x3000:
2146 return op_code != OP_HSPACE;
2147 default:
2148 return op_code == OP_HSPACE;
2149 }
2150
2151 case OP_VSPACE:
2152 case OP_NOT_VSPACE:
2153 switch(next)
2154 {
2155 case 0x0a:
2156 case 0x0b:
2157 case 0x0c:
2158 case 0x0d:
2159 case 0x85:
2160 case 0x2028:
2161 case 0x2029:
2162 return op_code != OP_VSPACE;
2163 default:
2164 return op_code == OP_VSPACE;
2165 }
2166
6bf342e1
PH
2167 default:
2168 return FALSE;
2169 }
2170
2171
2172/* Handle the case when the next item is \d, \s, etc. */
2173
2174switch(op_code)
2175 {
2176 case OP_CHAR:
2177 case OP_CHARNC:
2178#ifdef SUPPORT_UTF8
2179 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2180#endif
2181 switch(-next)
2182 {
2183 case ESC_d:
2184 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2185
2186 case ESC_D:
2187 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2188
2189 case ESC_s:
2190 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2191
2192 case ESC_S:
2193 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2194
2195 case ESC_w:
2196 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2197
2198 case ESC_W:
2199 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2200
64f2600a
PH
2201 case ESC_h:
2202 case ESC_H:
2203 switch(item)
2204 {
2205 case 0x09:
2206 case 0x20:
2207 case 0xa0:
2208 case 0x1680:
2209 case 0x180e:
2210 case 0x2000:
2211 case 0x2001:
2212 case 0x2002:
2213 case 0x2003:
2214 case 0x2004:
2215 case 0x2005:
2216 case 0x2006:
2217 case 0x2007:
2218 case 0x2008:
2219 case 0x2009:
2220 case 0x200A:
2221 case 0x202f:
2222 case 0x205f:
2223 case 0x3000:
2224 return -next != ESC_h;
2225 default:
2226 return -next == ESC_h;
2227 }
2228
2229 case ESC_v:
2230 case ESC_V:
2231 switch(item)
2232 {
2233 case 0x0a:
2234 case 0x0b:
2235 case 0x0c:
2236 case 0x0d:
2237 case 0x85:
2238 case 0x2028:
2239 case 0x2029:
2240 return -next != ESC_v;
2241 default:
2242 return -next == ESC_v;
2243 }
2244
6bf342e1
PH
2245 default:
2246 return FALSE;
2247 }
2248
2249 case OP_DIGIT:
64f2600a
PH
2250 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2251 next == -ESC_h || next == -ESC_v;
6bf342e1
PH
2252
2253 case OP_NOT_DIGIT:
2254 return next == -ESC_d;
2255
2256 case OP_WHITESPACE:
2257 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2258
2259 case OP_NOT_WHITESPACE:
64f2600a
PH
2260 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2261
2262 case OP_HSPACE:
2263 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2264
2265 case OP_NOT_HSPACE:
2266 return next == -ESC_h;
2267
2268 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2269 case OP_VSPACE:
2270 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2271
2272 case OP_NOT_VSPACE:
2273 return next == -ESC_v;
6bf342e1
PH
2274
2275 case OP_WORDCHAR:
64f2600a 2276 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
6bf342e1
PH
2277
2278 case OP_NOT_WORDCHAR:
2279 return next == -ESC_w || next == -ESC_d;
2280
2281 default:
2282 return FALSE;
2283 }
2284
2285/* Control does not reach here */
2286}
2287
2288
2289
8ac170f3
PH
2290/*************************************************
2291* Compile one branch *
2292*************************************************/
2293
6bf342e1 2294/* Scan the pattern, compiling it into a vector. If the options are
8ac170f3 2295changed during the branch, the pointer is used to change the external options
6bf342e1
PH
2296bits. This function is used during the pre-compile phase when we are trying
2297to find out the amount of memory needed, as well as during the real compile
2298phase. The value of lengthptr distinguishes the two phases.
8ac170f3
PH
2299
2300Arguments:
2301 optionsptr pointer to the option bits
8ac170f3
PH
2302 codeptr points to the pointer to the current code point
2303 ptrptr points to the current pattern pointer
2304 errorcodeptr points to error code variable
2305 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2306 reqbyteptr set to the last literal character required, else < 0
2307 bcptr points to current branch chain
2308 cd contains pointers to tables etc.
6bf342e1
PH
2309 lengthptr NULL during the real compile phase
2310 points to length accumulator during pre-compile phase
8ac170f3
PH
2311
2312Returns: TRUE on success
2313 FALSE, with *errorcodeptr set non-zero on error
2314*/
2315
2316static BOOL
6bf342e1
PH
2317compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2318 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2319 compile_data *cd, int *lengthptr)
8ac170f3
PH
2320{
2321int repeat_type, op_type;
2322int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2323int bravalue = 0;
2324int greedy_default, greedy_non_default;
2325int firstbyte, reqbyte;
2326int zeroreqbyte, zerofirstbyte;
2327int req_caseopt, reqvary, tempreqvary;
8ac170f3
PH
2328int options = *optionsptr;
2329int after_manual_callout = 0;
6bf342e1 2330int length_prevgroup = 0;
8ac170f3
PH
2331register int c;
2332register uschar *code = *codeptr;
6bf342e1
PH
2333uschar *last_code = code;
2334uschar *orig_code = code;
8ac170f3
PH
2335uschar *tempcode;
2336BOOL inescq = FALSE;
2337BOOL groupsetfirstbyte = FALSE;
2338const uschar *ptr = *ptrptr;
2339const uschar *tempptr;
2340uschar *previous = NULL;
2341uschar *previous_callout = NULL;
6bf342e1 2342uschar *save_hwm = NULL;
8ac170f3
PH
2343uschar classbits[32];
2344
2345#ifdef SUPPORT_UTF8
2346BOOL class_utf8;
2347BOOL utf8 = (options & PCRE_UTF8) != 0;
2348uschar *class_utf8data;
2349uschar utf8_char[6];
2350#else
2351BOOL utf8 = FALSE;
6bf342e1
PH
2352uschar *utf8_char = NULL;
2353#endif
2354
2355#ifdef DEBUG
2356if (lengthptr != NULL) DPRINTF((">> start branch\n"));
8ac170f3
PH
2357#endif
2358
2359/* Set up the default and non-default settings for greediness */
2360
2361greedy_default = ((options & PCRE_UNGREEDY) != 0);
2362greedy_non_default = greedy_default ^ 1;
2363
2364/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2365matching encountered yet". It gets changed to REQ_NONE if we hit something that
2366matches a non-fixed char first char; reqbyte just remains unset if we never
2367find one.
2368
2369When we hit a repeat whose minimum is zero, we may have to adjust these values
2370to take the zero repeat into account. This is implemented by setting them to
2371zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2372item types that can be repeated set these backoff variables appropriately. */
2373
2374firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2375
2376/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2377according to the current setting of the caseless flag. REQ_CASELESS is a bit
2378value > 255. It is added into the firstbyte or reqbyte variables to record the
2379case status of the value. This is used only for ASCII characters. */
2380
2381req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2382
2383/* Switch on next character until the end of the branch */
2384
2385for (;; ptr++)
2386 {
2387 BOOL negate_class;
2388 BOOL possessive_quantifier;
2389 BOOL is_quantifier;
6bf342e1 2390 BOOL is_recurse;
64f2600a 2391 BOOL reset_bracount;
8ac170f3
PH
2392 int class_charcount;
2393 int class_lastchar;
2394 int newoptions;
2395 int recno;
64f2600a 2396 int refsign;
8ac170f3
PH
2397 int skipbytes;
2398 int subreqbyte;
2399 int subfirstbyte;
6bf342e1 2400 int terminator;
8ac170f3
PH
2401 int mclength;
2402 uschar mcbuffer[8];
2403
6bf342e1 2404 /* Get next byte in the pattern */
8ac170f3
PH
2405
2406 c = *ptr;
2407
6bf342e1
PH
2408 /* If we are in the pre-compile phase, accumulate the length used for the
2409 previous cycle of this loop. */
2410
2411 if (lengthptr != NULL)
2412 {
2413#ifdef DEBUG
2414 if (code > cd->hwm) cd->hwm = code; /* High water info */
2415#endif
2416 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2417 {
2418 *errorcodeptr = ERR52;
2419 goto FAILED;
2420 }
2421
2422 /* There is at least one situation where code goes backwards: this is the
2423 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2424 the class is simply eliminated. However, it is created first, so we have to
2425 allow memory for it. Therefore, don't ever reduce the length at this point.
2426 */
2427
2428 if (code < last_code) code = last_code;
47db1125
NM
2429
2430 /* Paranoid check for integer overflow */
2431
2432 if (OFLOW_MAX - *lengthptr < code - last_code)
2433 {
2434 *errorcodeptr = ERR20;
2435 goto FAILED;
2436 }
2437
6bf342e1
PH
2438 *lengthptr += code - last_code;
2439 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2440
2441 /* If "previous" is set and it is not at the start of the work space, move
2442 it back to there, in order to avoid filling up the work space. Otherwise,
2443 if "previous" is NULL, reset the current code pointer to the start. */
2444
2445 if (previous != NULL)
2446 {
2447 if (previous > orig_code)
2448 {
2449 memmove(orig_code, previous, code - previous);
2450 code -= previous - orig_code;
2451 previous = orig_code;
2452 }
2453 }
2454 else code = orig_code;
2455
2456 /* Remember where this code item starts so we can pick up the length
2457 next time round. */
2458
2459 last_code = code;
2460 }
2461
2462 /* In the real compile phase, just check the workspace used by the forward
2463 reference list. */
2464
2465 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2466 {
2467 *errorcodeptr = ERR52;
2468 goto FAILED;
2469 }
2470
8ac170f3
PH
2471 /* If in \Q...\E, check for the end; if not, we have a literal */
2472
2473 if (inescq && c != 0)
2474 {
2475 if (c == '\\' && ptr[1] == 'E')
2476 {
2477 inescq = FALSE;
2478 ptr++;
2479 continue;
2480 }
2481 else
2482 {
2483 if (previous_callout != NULL)
2484 {
6bf342e1
PH
2485 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2486 complete_callout(previous_callout, ptr, cd);
8ac170f3
PH
2487 previous_callout = NULL;
2488 }
2489 if ((options & PCRE_AUTO_CALLOUT) != 0)
2490 {
2491 previous_callout = code;
2492 code = auto_callout(code, ptr, cd);
2493 }
2494 goto NORMAL_CHAR;
2495 }
2496 }
2497
2498 /* Fill in length of a previous callout, except when the next thing is
2499 a quantifier. */
2500
2501 is_quantifier = c == '*' || c == '+' || c == '?' ||
2502 (c == '{' && is_counted_repeat(ptr+1));
2503
2504 if (!is_quantifier && previous_callout != NULL &&
2505 after_manual_callout-- <= 0)
2506 {
6bf342e1
PH
2507 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2508 complete_callout(previous_callout, ptr, cd);
8ac170f3
PH
2509 previous_callout = NULL;
2510 }
2511
2512 /* In extended mode, skip white space and comments */
2513
2514 if ((options & PCRE_EXTENDED) != 0)
2515 {
2516 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2517 if (c == '#')
2518 {
6bf342e1 2519 while (*(++ptr) != 0)
aa41d2de 2520 {
6bf342e1 2521 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
aa41d2de 2522 }
6bf342e1
PH
2523 if (*ptr != 0) continue;
2524
aa41d2de
PH
2525 /* Else fall through to handle end of string */
2526 c = 0;
8ac170f3
PH
2527 }
2528 }
2529
2530 /* No auto callout for quantifiers. */
2531
2532 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2533 {
2534 previous_callout = code;
2535 code = auto_callout(code, ptr, cd);
2536 }
2537
2538 switch(c)
2539 {
6bf342e1
PH
2540 /* ===================================================================*/
2541 case 0: /* The branch terminates at string end */
2542 case '|': /* or | or ) */
8ac170f3
PH
2543 case ')':
2544 *firstbyteptr = firstbyte;
2545 *reqbyteptr = reqbyte;
2546 *codeptr = code;
2547 *ptrptr = ptr;
6bf342e1
PH
2548 if (lengthptr != NULL)
2549 {
47db1125
NM
2550 if (OFLOW_MAX - *lengthptr < code - last_code)
2551 {
2552 *errorcodeptr = ERR20;
2553 goto FAILED;
2554 }
6bf342e1
PH
2555 *lengthptr += code - last_code; /* To include callout length */
2556 DPRINTF((">> end branch\n"));
2557 }
8ac170f3
PH
2558 return TRUE;
2559
6bf342e1
PH
2560
2561 /* ===================================================================*/
8ac170f3
PH
2562 /* Handle single-character metacharacters. In multiline mode, ^ disables
2563 the setting of any following char as a first character. */
2564
2565 case '^':
2566 if ((options & PCRE_MULTILINE) != 0)
2567 {
2568 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2569 }
2570 previous = NULL;
2571 *code++ = OP_CIRC;
2572 break;
2573
2574 case '$':
2575 previous = NULL;
2576 *code++ = OP_DOLL;
2577 break;
2578
2579 /* There can never be a first char if '.' is first, whatever happens about
2580 repeats. The value of reqbyte doesn't change either. */
2581
2582 case '.':
2583 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2584 zerofirstbyte = firstbyte;
2585 zeroreqbyte = reqbyte;
2586 previous = code;
2587 *code++ = OP_ANY;
2588 break;
2589
6bf342e1
PH
2590
2591 /* ===================================================================*/
aa41d2de
PH
2592 /* Character classes. If the included characters are all < 256, we build a
2593 32-byte bitmap of the permitted characters, except in the special case
2594 where there is only one such character. For negated classes, we build the
2595 map as usual, then invert it at the end. However, we use a different opcode
2596 so that data characters > 255 can be handled correctly.
8ac170f3
PH
2597
2598 If the class contains characters outside the 0-255 range, a different
2599 opcode is compiled. It may optionally have a bit map for characters < 256,
2600 but those above are explicitly listed afterwards. A flag byte tells
2601 whether the bitmap is present, and whether this is a negated class or not.
2602 */
2603
2604 case '[':
2605 previous = code;
2606
2607 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2608 they are encountered at the top level, so we'll do that too. */
2609
2610 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2611 check_posix_syntax(ptr, &tempptr, cd))
2612 {
2613 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2614 goto FAILED;
2615 }
2616
47db1125
NM
2617 /* If the first character is '^', set the negation flag and skip it. Also,
2618 if the first few characters (either before or after ^) are \Q\E or \E we
2619 skip them too. This makes for compatibility with Perl. */
8ac170f3 2620
47db1125
NM
2621 negate_class = FALSE;
2622 for (;;)
8ac170f3 2623 {
8ac170f3 2624 c = *(++ptr);
47db1125
NM
2625 if (c == '\\')
2626 {
2627 if (ptr[1] == 'E') ptr++;
2628 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2629 else break;
2630 }
2631 else if (!negate_class && c == '^')
2632 negate_class = TRUE;
2633 else break;
8ac170f3
PH
2634 }
2635
2636 /* Keep a count of chars with values < 256 so that we can optimize the case
6bf342e1
PH
2637 of just a single character (as long as it's < 256). However, for higher
2638 valued UTF-8 characters, we don't yet do any optimization. */
8ac170f3
PH
2639
2640 class_charcount = 0;
2641 class_lastchar = -1;
2642
6bf342e1
PH
2643 /* Initialize the 32-char bit map to all zeros. We build the map in a
2644 temporary bit of memory, in case the class contains only 1 character (less
2645 than 256), because in that case the compiled code doesn't use the bit map.
2646 */
2647
2648 memset(classbits, 0, 32 * sizeof(uschar));
2649
8ac170f3
PH
2650#ifdef SUPPORT_UTF8
2651 class_utf8 = FALSE; /* No chars >= 256 */
6bf342e1 2652 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
8ac170f3
PH
2653#endif
2654
8ac170f3 2655 /* Process characters until ] is reached. By writing this as a "do" it
6bf342e1
PH
2656 means that an initial ] is taken as a data character. At the start of the
2657 loop, c contains the first byte of the character. */
8ac170f3 2658
6bf342e1 2659 if (c != 0) do
8ac170f3 2660 {
6bf342e1
PH
2661 const uschar *oldptr;
2662
8ac170f3
PH
2663#ifdef SUPPORT_UTF8
2664 if (utf8 && c > 127)
2665 { /* Braces are required because the */
2666 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2667 }
2668#endif
2669
2670 /* Inside \Q...\E everything is literal except \E */
2671
2672 if (inescq)
2673 {
6bf342e1 2674 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
8ac170f3 2675 {
6bf342e1
PH
2676 inescq = FALSE; /* Reset literal state */
2677 ptr++; /* Skip the 'E' */
2678 continue; /* Carry on with next */
8ac170f3 2679 }
6bf342e1 2680 goto CHECK_RANGE; /* Could be range if \E follows */
8ac170f3
PH
2681 }
2682
2683 /* Handle POSIX class names. Perl allows a negation extension of the
2684 form [:^name:]. A square bracket that doesn't match the syntax is
2685 treated as a literal. We also recognize the POSIX constructions
2686 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2687 5.6 and 5.8 do. */
2688
2689 if (c == '[' &&
2690 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2691 check_posix_syntax(ptr, &tempptr, cd))
2692 {
2693 BOOL local_negate = FALSE;
aa41d2de 2694 int posix_class, taboffset, tabopt;
8ac170f3 2695 register const uschar *cbits = cd->cbits;
aa41d2de 2696 uschar pbits[32];
8ac170f3
PH
2697
2698 if (ptr[1] != ':')
2699 {
2700 *errorcodeptr = ERR31;
2701 goto FAILED;
2702 }
2703
2704 ptr += 2;
2705 if (*ptr == '^')
2706 {
2707 local_negate = TRUE;
2708 ptr++;
2709 }
2710
2711 posix_class = check_posix_name(ptr, tempptr - ptr);
2712 if (posix_class < 0)
2713 {
2714 *errorcodeptr = ERR30;
2715 goto FAILED;
2716 }
2717
2718 /* If matching is caseless, upper and lower are converted to
2719 alpha. This relies on the fact that the class table starts with
2720 alpha, lower, upper as the first 3 entries. */
2721
2722 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2723 posix_class = 0;
2724
aa41d2de
PH
2725 /* We build the bit map for the POSIX class in a chunk of local store
2726 because we may be adding and subtracting from it, and we don't want to
2727 subtract bits that may be in the main map already. At the end we or the
2728 result into the bit map that is being built. */
8ac170f3
PH
2729
2730 posix_class *= 3;
aa41d2de
PH
2731
2732 /* Copy in the first table (always present) */
2733
2734 memcpy(pbits, cbits + posix_class_maps[posix_class],
2735 32 * sizeof(uschar));
2736
2737 /* If there is a second table, add or remove it as required. */
2738
2739 taboffset = posix_class_maps[posix_class + 1];
2740 tabopt = posix_class_maps[posix_class + 2];
2741
2742 if (taboffset >= 0)
8ac170f3 2743 {
aa41d2de
PH
2744 if (tabopt >= 0)
2745 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
8ac170f3 2746 else
aa41d2de 2747 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
8ac170f3
PH
2748 }
2749
aa41d2de
PH
2750 /* Not see if we need to remove any special characters. An option
2751 value of 1 removes vertical space and 2 removes underscore. */
2752
2753 if (tabopt < 0) tabopt = -tabopt;
2754 if (tabopt == 1) pbits[1] &= ~0x3c;
2755 else if (tabopt == 2) pbits[11] &= 0x7f;
2756
2757 /* Add the POSIX table or its complement into the main table that is
2758 being built and we are done. */
2759
2760 if (local_negate)
2761 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2762 else
2763 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2764
8ac170f3
PH
2765 ptr = tempptr + 1;
2766 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2767 continue; /* End of POSIX syntax handling */
2768 }
2769
2770 /* Backslash may introduce a single character, or it may introduce one
6bf342e1
PH
2771 of the specials, which just set a flag. The sequence \b is a special
2772 case. Inside a class (and only there) it is treated as backspace.
2773 Elsewhere it marks a word boundary. Other escapes have preset maps ready
47db1125 2774 to 'or' into the one we are building. We assume they have more than one
8ac170f3
PH
2775 character in them, so set class_charcount bigger than one. */
2776
2777 if (c == '\\')
2778 {
6bf342e1
PH
2779 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2780 if (*errorcodeptr != 0) goto FAILED;
8ac170f3
PH
2781
2782 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2783 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
6bf342e1 2784 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
8ac170f3
PH
2785 else if (-c == ESC_Q) /* Handle start of quoted string */
2786 {
2787 if (ptr[1] == '\\' && ptr[2] == 'E')
2788 {
2789 ptr += 2; /* avoid empty string */
2790 }
2791 else inescq = TRUE;
2792 continue;
2793 }
47db1125 2794 else if (-c == ESC_E) continue; /* Ignore orphan \E */
8ac170f3
PH
2795
2796 if (c < 0)
2797 {
2798 register const uschar *cbits = cd->cbits;
2799 class_charcount += 2; /* Greater than 1 is what matters */
6bf342e1
PH
2800
2801 /* Save time by not doing this in the pre-compile phase. */
2802
2803 if (lengthptr == NULL) switch (-c)
8ac170f3
PH
2804 {
2805 case ESC_d:
2806 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2807 continue;
2808
2809 case ESC_D:
2810 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2811 continue;
2812
2813 case ESC_w:
2814 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2815 continue;
2816
2817 case ESC_W:
2818 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2819 continue;
2820
2821 case ESC_s:
2822 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2823 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2824 continue;
2825
2826 case ESC_S:
2827 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2828 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2829 continue;
2830
6bf342e1 2831 case ESC_E: /* Perl ignores an orphan \E */
8ac170f3 2832 continue;
8ac170f3 2833
6bf342e1
PH
2834 default: /* Not recognized; fall through */
2835 break; /* Need "default" setting to stop compiler warning. */
8ac170f3 2836 }
8ac170f3 2837
6bf342e1 2838 /* In the pre-compile phase, just do the recognition. */
8ac170f3 2839
6bf342e1
PH
2840 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2841 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2842
64f2600a
PH
2843 /* We need to deal with \H, \h, \V, and \v in both phases because
2844 they use extra memory. */
2845
2846 if (-c == ESC_h)
2847 {
2848 SETBIT(classbits, 0x09); /* VT */
2849 SETBIT(classbits, 0x20); /* SPACE */
2850 SETBIT(classbits, 0xa0); /* NSBP */
2851#ifdef SUPPORT_UTF8
2852 if (utf8)
2853 {
2854 class_utf8 = TRUE;
2855 *class_utf8data++ = XCL_SINGLE;
2856 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2857 *class_utf8data++ = XCL_SINGLE;
2858 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2859 *class_utf8data++ = XCL_RANGE;
2860 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2861 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2862 *class_utf8data++ = XCL_SINGLE;
2863 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2864 *class_utf8data++ = XCL_SINGLE;
2865 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2866 *class_utf8data++ = XCL_SINGLE;
2867 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2868 }
2869#endif
2870 continue;
2871 }
2872
2873 if (-c == ESC_H)
2874 {
2875 for (c = 0; c < 32; c++)
2876 {
2877 int x = 0xff;
2878 switch (c)
2879 {
2880 case 0x09/8: x ^= 1 << (0x09%8); break;
2881 case 0x20/8: x ^= 1 << (0x20%8); break;
2882 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2883 default: break;
2884 }
2885 classbits[c] |= x;
2886 }
2887
2888#ifdef SUPPORT_UTF8
2889 if (utf8)
2890 {
2891 class_utf8 = TRUE;
2892 *class_utf8data++ = XCL_RANGE;
2893 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2894 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2895 *class_utf8data++ = XCL_RANGE;
2896 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2897 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2898 *class_utf8data++ = XCL_RANGE;
2899 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2900 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2901 *class_utf8data++ = XCL_RANGE;
2902 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2903 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2904 *class_utf8data++ = XCL_RANGE;
2905 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2906 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2907 *class_utf8data++ = XCL_RANGE;
2908 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2909 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2910 *class_utf8data++ = XCL_RANGE;
2911 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2912 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2913 }
2914#endif
2915 continue;
2916 }
2917
2918 if (-c == ESC_v)
2919 {
2920 SETBIT(classbits, 0x0a); /* LF */
2921 SETBIT(classbits, 0x0b); /* VT */
2922 SETBIT(classbits, 0x0c); /* FF */
2923 SETBIT(classbits, 0x0d); /* CR */
2924 SETBIT(classbits, 0x85); /* NEL */
2925#ifdef SUPPORT_UTF8
2926 if (utf8)
2927 {
2928 class_utf8 = TRUE;
2929 *class_utf8data++ = XCL_RANGE;
2930 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2931 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2932 }
2933#endif
2934 continue;
2935 }
2936
2937 if (-c == ESC_V)
2938 {
2939 for (c = 0; c < 32; c++)
2940 {
2941 int x = 0xff;
2942 switch (c)
2943 {
2944 case 0x0a/8: x ^= 1 << (0x0a%8);
2945 x ^= 1 << (0x0b%8);
2946 x ^= 1 << (0x0c%8);
2947 x ^= 1 << (0x0d%8);
2948 break;
2949 case 0x85/8: x ^= 1 << (0x85%8); break;
2950 default: break;
2951 }
2952 classbits[c] |= x;
2953 }
2954
2955#ifdef SUPPORT_UTF8
2956 if (utf8)
2957 {
2958 class_utf8 = TRUE;
2959 *class_utf8data++ = XCL_RANGE;
2960 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2961 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2962 *class_utf8data++ = XCL_RANGE;
2963 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2964 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2965 }
2966#endif
2967 continue;
2968 }
2969
6bf342e1
PH
2970 /* We need to deal with \P and \p in both phases. */
2971
2972#ifdef SUPPORT_UCP
2973 if (-c == ESC_p || -c == ESC_P)
2974 {
2975 BOOL negated;
2976 int pdata;
2977 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2978 if (ptype < 0) goto FAILED;
2979 class_utf8 = TRUE;
2980 *class_utf8data++ = ((-c == ESC_p) != negated)?
2981 XCL_PROP : XCL_NOTPROP;
2982 *class_utf8data++ = ptype;
2983 *class_utf8data++ = pdata;
2984 class_charcount -= 2; /* Not a < 256 character */
2985 continue;
2986 }
2987#endif
2988 /* Unrecognized escapes are faulted if PCRE is running in its
2989 strict mode. By default, for compatibility with Perl, they are
2990 treated as literals. */
2991
2992 if ((options & PCRE_EXTRA) != 0)
2993 {
2994 *errorcodeptr = ERR7;
2995 goto FAILED;
2996 }
2997
2998 class_charcount -= 2; /* Undo the default count from above */
2999 c = *ptr; /* Get the final character and fall through */
3000 }
3001
3002 /* Fall through if we have a single character (c >= 0). This may be
3003 greater than 256 in UTF-8 mode. */
3004
3005 } /* End of backslash handling */
8ac170f3
PH
3006
3007 /* A single character may be followed by '-' to form a range. However,
3008 Perl does not permit ']' to be the end of the range. A '-' character
6bf342e1
PH
3009 at the end is treated as a literal. Perl ignores orphaned \E sequences
3010 entirely. The code for handling \Q and \E is messy. */
3011
3012 CHECK_RANGE:
3013 while (ptr[1] == '\\' && ptr[2] == 'E')
3014 {
3015 inescq = FALSE;
3016 ptr += 2;
3017 }
3018
3019 oldptr = ptr;
8ac170f3 3020
47db1125
NM
3021 /* Remember \r or \n */
3022
3023 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3024
3025 /* Check for range */
3026
6bf342e1 3027 if (!inescq && ptr[1] == '-')
8ac170f3
PH
3028 {
3029 int d;
3030 ptr += 2;
6bf342e1
PH
3031 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3032
3033 /* If we hit \Q (not followed by \E) at this point, go into escaped
3034 mode. */
3035
3036 while (*ptr == '\\' && ptr[1] == 'Q')
3037 {
3038 ptr += 2;
3039 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3040 inescq = TRUE;
3041 break;
3042 }
3043
3044 if (*ptr == 0 || (!inescq && *ptr == ']'))
3045 {
3046 ptr = oldptr;
3047 goto LONE_SINGLE_CHARACTER;
3048 }
8ac170f3
PH
3049
3050#ifdef SUPPORT_UTF8
3051 if (utf8)
3052 { /* Braces are required because the */
3053 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3054 }
3055 else
3056#endif
3057 d = *ptr; /* Not UTF-8 mode */
3058
3059 /* The second part of a range can be a single-character escape, but
3060 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3061 in such circumstances. */
3062
6bf342e1 3063 if (!inescq && d == '\\')
8ac170f3 3064 {
6bf342e1
PH
3065 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3066 if (*errorcodeptr != 0) goto FAILED;
8ac170f3 3067
6bf342e1
PH
3068 /* \b is backspace; \X is literal X; \R is literal R; any other
3069 special means the '-' was literal */
8ac170f3
PH
3070
3071 if (d < 0)
3072 {
3073 if (d == -ESC_b) d = '\b';
6bf342e1
PH
3074 else if (d == -ESC_X) d = 'X';
3075 else if (d == -ESC_R) d = 'R'; else
8ac170f3 3076 {
6bf342e1 3077 ptr = oldptr;
8ac170f3
PH
3078 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3079 }
3080 }
3081 }
3082
6bf342e1
PH
3083 /* Check that the two values are in the correct order. Optimize
3084 one-character ranges */
3085
3086 if (d < c)
3087 {
3088 *errorcodeptr = ERR8;
3089 goto FAILED;
3090 }
8ac170f3
PH
3091
3092 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3093
47db1125
NM
3094 /* Remember \r or \n */
3095
3096 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3097
8ac170f3
PH
3098 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3099 matching, we have to use an XCLASS with extra data items. Caseless
3100 matching for characters > 127 is available only if UCP support is
3101 available. */
3102
3103#ifdef SUPPORT_UTF8
3104 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3105 {
3106 class_utf8 = TRUE;
3107
3108 /* With UCP support, we can find the other case equivalents of
3109 the relevant characters. There may be several ranges. Optimize how
3110 they fit with the basic range. */
3111
3112#ifdef SUPPORT_UCP
3113 if ((options & PCRE_CASELESS) != 0)
3114 {
6bf342e1
PH
3115 unsigned int occ, ocd;
3116 unsigned int cc = c;
3117 unsigned int origd = d;
8ac170f3
PH
3118 while (get_othercase_range(&cc, origd, &occ, &ocd))
3119 {
64f2600a
PH
3120 if (occ >= (unsigned int)c &&
3121 ocd <= (unsigned int)d)
3122 continue; /* Skip embedded ranges */
8ac170f3 3123
64f2600a
PH
3124 if (occ < (unsigned int)c &&
3125 ocd >= (unsigned int)c - 1) /* Extend the basic range */
8ac170f3
PH
3126 { /* if there is overlap, */
3127 c = occ; /* noting that if occ < c */
3128 continue; /* we can't have ocd > d */
3129 } /* because a subrange is */
64f2600a
PH
3130 if (ocd > (unsigned int)d &&
3131 occ <= (unsigned int)d + 1) /* always shorter than */
8ac170f3
PH
3132 { /* the basic range. */
3133 d = ocd;
3134 continue;
3135 }
3136
3137 if (occ == ocd)
3138 {
3139 *class_utf8data++ = XCL_SINGLE;
3140 }
3141 else
3142 {
3143 *class_utf8data++ = XCL_RANGE;
3144 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3145 }
3146 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3147 }
3148 }
3149#endif /* SUPPORT_UCP */
3150
3151 /* Now record the original range, possibly modified for UCP caseless
3152 overlapping ranges. */
3153
3154 *class_utf8data++ = XCL_RANGE;
3155 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3156 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3157
3158 /* With UCP support, we are done. Without UCP support, there is no
3159 caseless matching for UTF-8 characters > 127; we can use the bit map
3160 for the smaller ones. */
3161
3162#ifdef SUPPORT_UCP
3163 continue; /* With next character in the class */
3164#else
3165 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3166
3167 /* Adjust upper limit and fall through to set up the map */
3168
3169 d = 127;
3170
3171#endif /* SUPPORT_UCP */
3172 }
3173#endif /* SUPPORT_UTF8 */
3174
3175 /* We use the bit map for all cases when not in UTF-8 mode; else
3176 ranges that lie entirely within 0-127 when there is UCP support; else
3177 for partial ranges without UCP support. */
3178
6bf342e1
PH
3179 class_charcount += d - c + 1;
3180 class_lastchar = d;
3181
3182 /* We can save a bit of time by skipping this in the pre-compile. */
3183
3184 if (lengthptr == NULL) for (; c <= d; c++)
8ac170f3
PH
3185 {
3186 classbits[c/8] |= (1 << (c&7));
3187 if ((options & PCRE_CASELESS) != 0)
3188 {
3189 int uc = cd->fcc[c]; /* flip case */
3190 classbits[uc/8] |= (1 << (uc&7));
3191 }
8ac170f3
PH
3192 }
3193
3194 continue; /* Go get the next char in the class */
3195 }
3196
3197 /* Handle a lone single character - we can get here for a normal
3198 non-escape char, or after \ that introduces a single character or for an
3199 apparent range that isn't. */
3200
3201 LONE_SINGLE_CHARACTER:
3202
3203 /* Handle a character that cannot go in the bit map */
3204
3205#ifdef SUPPORT_UTF8
3206 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3207 {
3208 class_utf8 = TRUE;
3209 *class_utf8data++ = XCL_SINGLE;
3210 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3211
3212#ifdef SUPPORT_UCP
3213 if ((options & PCRE_CASELESS) != 0)
3214 {
6bf342e1
PH
3215 unsigned int othercase;
3216 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
8ac170f3
PH
3217 {
3218 *class_utf8data++ = XCL_SINGLE;
3219 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3220 }
3221 }
3222#endif /* SUPPORT_UCP */
3223
3224 }
3225 else
3226#endif /* SUPPORT_UTF8 */
3227
3228 /* Handle a single-byte character */
3229 {
3230 classbits[c/8] |= (1 << (c&7));
3231 if ((options & PCRE_CASELESS) != 0)
3232 {
3233 c = cd->fcc[c]; /* flip case */
3234 classbits[c/8] |= (1 << (c&7));
3235 }
3236 class_charcount++;
3237 class_lastchar = c;
3238 }
3239 }
3240
6bf342e1 3241 /* Loop until ']' reached. This "while" is the end of the "do" above. */
8ac170f3 3242
6bf342e1
PH
3243 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3244
3245 if (c == 0) /* Missing terminating ']' */
3246 {
3247 *errorcodeptr = ERR6;
3248 goto FAILED;
3249 }
8ac170f3 3250
47db1125
NM
3251
3252/* This code has been disabled because it would mean that \s counts as
3253an explicit \r or \n reference, and that's not really what is wanted. Now
3254we set the flag only if there is a literal "\r" or "\n" in the class. */
3255
3256#if 0
3257 /* Remember whether \r or \n are in this class */
3258
3259 if (negate_class)
3260 {
3261 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3262 }
3263 else
3264 {
3265 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3266 }
3267#endif
3268
3269
8ac170f3 3270 /* If class_charcount is 1, we saw precisely one character whose value is
47db1125
NM
3271 less than 256. As long as there were no characters >= 128 and there was no
3272 use of \p or \P, in other words, no use of any XCLASS features, we can
3273 optimize.
3274
3275 In UTF-8 mode, we can optimize the negative case only if there were no
3276 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3277 operate on single-bytes only. This is an historical hangover. Maybe one day
3278 we can tidy these opcodes to handle multi-byte characters.
8ac170f3
PH
3279
3280 The optimization throws away the bit map. We turn the item into a
3281 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3282 that OP_NOT does not support multibyte characters. In the positive case, it
3283 can cause firstbyte to be set. Otherwise, there can be no first char if
3284 this item is first, whatever repeat count may follow. In the case of
3285 reqbyte, save the previous value for reinstating. */
3286
3287#ifdef SUPPORT_UTF8
47db1125
NM
3288 if (class_charcount == 1 && !class_utf8 &&
3289 (!utf8 || !negate_class || class_lastchar < 128))
8ac170f3
PH
3290#else
3291 if (class_charcount == 1)
3292#endif
3293 {
3294 zeroreqbyte = reqbyte;
3295
3296 /* The OP_NOT opcode works on one-byte characters only. */
3297
3298 if (negate_class)
3299 {
3300 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3301 zerofirstbyte = firstbyte;
3302 *code++ = OP_NOT;
3303 *code++ = class_lastchar;
3304 break;
3305 }
3306
3307 /* For a single, positive character, get the value into mcbuffer, and
3308 then we can handle this with the normal one-character code. */
3309
3310#ifdef SUPPORT_UTF8
3311 if (utf8 && class_lastchar > 127)
3312 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3313 else
3314#endif
3315 {
3316 mcbuffer[0] = class_lastchar;
3317 mclength = 1;
3318 }
3319 goto ONE_CHAR;
3320 } /* End of 1-char optimization */
3321
3322 /* The general case - not the one-char optimization. If this is the first
3323 thing in the branch, there can be no first char setting, whatever the
3324 repeat count. Any reqbyte setting must remain unchanged after any kind of
3325 repeat. */
3326
3327 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3328 zerofirstbyte = firstbyte;
3329 zeroreqbyte = reqbyte;
3330
3331 /* If there are characters with values > 255, we have to compile an
3332 extended class, with its own opcode. If there are no characters < 256,
6bf342e1 3333 we can omit the bitmap in the actual compiled code. */
8ac170f3
PH
3334
3335#ifdef SUPPORT_UTF8
3336 if (class_utf8)
3337 {
3338 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3339 *code++ = OP_XCLASS;
3340 code += LINK_SIZE;
3341 *code = negate_class? XCL_NOT : 0;
3342
6bf342e1
PH
3343 /* If the map is required, move up the extra data to make room for it;
3344 otherwise just move the code pointer to the end of the extra data. */
8ac170f3
PH
3345
3346 if (class_charcount > 0)
3347 {
3348 *code++ |= XCL_MAP;
6bf342e1 3349 memmove(code + 32, code, class_utf8data - code);
8ac170f3 3350 memcpy(code, classbits, 32);
6bf342e1 3351 code = class_utf8data + 32;
8ac170f3 3352 }
6bf342e1 3353 else code = class_utf8data;
8ac170f3
PH
3354
3355 /* Now fill in the complete length of the item */
3356
3357 PUT(previous, 1, code - previous);
3358 break; /* End of class handling */
3359 }
3360#endif
3361
3362 /* If there are no characters > 255, negate the 32-byte map if necessary,
3363 and copy it into the code vector. If this is the first thing in the branch,
3364 there can be no first char setting, whatever the repeat count. Any reqbyte
3365 setting must remain unchanged after any kind of repeat. */
3366
3367 if (negate_class)
3368 {
3369 *code++ = OP_NCLASS;
6bf342e1
PH
3370 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3371 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
8ac170f3
PH
3372 }
3373 else
3374 {
3375 *code++ = OP_CLASS;
3376 memcpy(code, classbits, 32);
3377 }
3378 code += 32;
3379 break;
3380
6bf342e1
PH
3381
3382 /* ===================================================================*/
8ac170f3
PH
3383 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3384 has been tested above. */
3385
3386 case '{':
3387 if (!is_quantifier) goto NORMAL_CHAR;
3388 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3389 if (*errorcodeptr != 0) goto FAILED;
3390 goto REPEAT;
3391
3392 case '*':
3393 repeat_min = 0;
3394 repeat_max = -1;
3395 goto REPEAT;
3396
3397 case '+':
3398 repeat_min = 1;
3399 repeat_max = -1;
3400 goto REPEAT;
3401
3402 case '?':
3403 repeat_min = 0;
3404 repeat_max = 1;
3405
3406 REPEAT:
3407 if (previous == NULL)
3408 {
3409 *errorcodeptr = ERR9;
3410 goto FAILED;
3411 }
3412
3413 if (repeat_min == 0)
3414 {
3415 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3416 reqbyte = zeroreqbyte; /* Ditto */
3417 }
3418
3419 /* Remember whether this is a variable length repeat */
3420
3421 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3422
3423 op_type = 0; /* Default single-char op codes */
3424 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3425
3426 /* Save start of previous item, in case we have to move it up to make space
3427 for an inserted OP_ONCE for the additional '+' extension. */
3428
3429 tempcode = previous;
3430
3431 /* If the next character is '+', we have a possessive quantifier. This
3432 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3433 If the next character is '?' this is a minimizing repeat, by default,
3434 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3435 repeat type to the non-default. */
3436
3437 if (ptr[1] == '+')
3438 {
3439 repeat_type = 0; /* Force greedy */
3440 possessive_quantifier = TRUE;
3441 ptr++;
3442 }
3443 else if (ptr[1] == '?')
3444 {
3445 repeat_type = greedy_non_default;
3446 ptr++;
3447 }
3448 else repeat_type = greedy_default;
3449
8ac170f3
PH
3450 /* If previous was a character match, abolish the item and generate a
3451 repeat item instead. If a char item has a minimum of more than one, ensure
3452 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3453 the first thing in a branch because the x will have gone into firstbyte
3454 instead. */
3455
3456 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3457 {
3458 /* Deal with UTF-8 characters that take up more than one byte. It's
3459 easier to write this out separately than try to macrify it. Use c to
3460 hold the length of the character in bytes, plus 0x80 to flag that it's a
3461 length rather than a small character. */
3462
3463#ifdef SUPPORT_UTF8
3464 if (utf8 && (code[-1] & 0x80) != 0)
3465 {
3466 uschar *lastchar = code - 1;
3467 while((*lastchar & 0xc0) == 0x80) lastchar--;
3468 c = code - lastchar; /* Length of UTF-8 character */
3469 memcpy(utf8_char, lastchar, c); /* Save the char */
3470 c |= 0x80; /* Flag c as a length */
3471 }
3472 else
3473#endif
3474
3475 /* Handle the case of a single byte - either with no UTF8 support, or
3476 with UTF-8 disabled, or for a UTF-8 character < 128. */
3477
3478 {
3479 c = code[-1];
3480 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3481 }
3482
6bf342e1
PH
3483 /* If the repetition is unlimited, it pays to see if the next thing on
3484 the line is something that cannot possibly match this character. If so,
3485 automatically possessifying this item gains some performance in the case
3486 where the match fails. */
3487
3488 if (!possessive_quantifier &&
3489 repeat_max < 0 &&
3490 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3491 options, cd))
3492 {
3493 repeat_type = 0; /* Force greedy */
3494 possessive_quantifier = TRUE;
3495 }
3496
8ac170f3
PH
3497 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3498 }
3499
3500 /* If previous was a single negated character ([^a] or similar), we use
3501 one of the special opcodes, replacing it. The code is shared with single-
3502 character repeats by setting op_type to add a suitable offset into
6bf342e1
PH
3503 repeat_type. We can also test for auto-possessification. OP_NOT is
3504 currently used only for single-byte chars. */
8ac170f3
PH
3505
3506 else if (*previous == OP_NOT)
3507 {
3508 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3509 c = previous[1];
6bf342e1
PH
3510 if (!possessive_quantifier &&
3511 repeat_max < 0 &&
3512 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3513 {
3514 repeat_type = 0; /* Force greedy */
3515 possessive_quantifier = TRUE;
3516 }
8ac170f3
PH
3517 goto OUTPUT_SINGLE_REPEAT;
3518 }
3519
3520 /* If previous was a character type match (\d or similar), abolish it and
3521 create a suitable repeat item. The code is shared with single-character
3522 repeats by setting op_type to add a suitable offset into repeat_type. Note
3523 that the Unicode property types will be present only when SUPPORT_UCP is
3524 defined, but we don't wrap the little bits of code here because it just
3525 makes it horribly messy. */
3526
3527 else if (*previous < OP_EODN)
3528 {
3529 uschar *oldcode;
aa41d2de 3530 int prop_type, prop_value;
8ac170f3
PH
3531 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3532 c = *previous;
3533
6bf342e1
PH
3534 if (!possessive_quantifier &&
3535 repeat_max < 0 &&
3536 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3537 {
3538 repeat_type = 0; /* Force greedy */
3539 possessive_quantifier = TRUE;
3540 }
3541
8ac170f3 3542 OUTPUT_SINGLE_REPEAT:
aa41d2de
PH
3543 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3544 {
3545 prop_type = previous[1];
3546 prop_value = previous[2];
3547 }
3548 else prop_type = prop_value = -1;
8ac170f3
PH
3549
3550 oldcode = code;
3551 code = previous; /* Usually overwrite previous item */
3552
3553 /* If the maximum is zero then the minimum must also be zero; Perl allows
3554 this case, so we do too - by simply omitting the item altogether. */
3555
3556 if (repeat_max == 0) goto END_REPEAT;
3557
3558 /* All real repeats make it impossible to handle partial matching (maybe
3559 one day we will be able to remove this restriction). */
3560
47db1125 3561 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
8ac170f3
PH
3562
3563 /* Combine the op_type with the repeat_type */
3564
3565 repeat_type += op_type;
3566
3567 /* A minimum of zero is handled either as the special case * or ?, or as
3568 an UPTO, with the maximum given. */
3569
3570 if (repeat_min == 0)
3571 {
3572 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3573 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3574 else
3575 {
3576 *code++ = OP_UPTO + repeat_type;
3577 PUT2INC(code, 0, repeat_max);
3578 }
3579 }
3580
3581 /* A repeat minimum of 1 is optimized into some special cases. If the
6bf342e1 3582 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
8ac170f3
PH
3583 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3584 one less than the maximum. */
3585
3586 else if (repeat_min == 1)
3587 {
3588 if (repeat_max == -1)
3589 *code++ = OP_PLUS + repeat_type;
3590 else
3591 {
3592 code = oldcode; /* leave previous item in place */
3593 if (repeat_max == 1) goto END_REPEAT;
3594 *code++ = OP_UPTO + repeat_type;
3595 PUT2INC(code, 0, repeat_max - 1);
3596 }
3597 }
3598
3599 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3600 handled as an EXACT followed by an UPTO. */
3601
3602 else
3603 {
3604 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3605 PUT2INC(code, 0, repeat_min);
3606
3607 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3608 we have to insert the character for the previous code. For a repeated
aa41d2de 3609 Unicode property match, there are two extra bytes that define the
8ac170f3
PH
3610 required property. In UTF-8 mode, long characters have their length in
3611 c, with the 0x80 bit as a flag. */
3612
3613 if (repeat_max < 0)
3614 {
3615#ifdef SUPPORT_UTF8
3616 if (utf8 && c >= 128)
3617 {
3618 memcpy(code, utf8_char, c & 7);
3619 code += c & 7;
3620 }
3621 else
3622#endif
3623 {
3624 *code++ = c;
aa41d2de
PH
3625 if (prop_type >= 0)
3626 {
3627 *code++ = prop_type;
3628 *code++ = prop_value;
3629 }
8ac170f3
PH
3630 }
3631 *code++ = OP_STAR + repeat_type;
3632 }
3633
3634 /* Else insert an UPTO if the max is greater than the min, again
6bf342e1
PH
3635 preceded by the character, for the previously inserted code. If the
3636 UPTO is just for 1 instance, we can use QUERY instead. */
8ac170f3
PH
3637
3638 else if (repeat_max != repeat_min)
3639 {
3640#ifdef SUPPORT_UTF8
3641 if (utf8 && c >= 128)
3642 {
3643 memcpy(code, utf8_char, c & 7);
3644 code += c & 7;
3645 }
3646 else
3647#endif
3648 *code++ = c;
aa41d2de
PH
3649 if (prop_type >= 0)
3650 {
3651 *code++ = prop_type;
3652 *code++ = prop_value;
3653 }
8ac170f3 3654 repeat_max -= repeat_min;
6bf342e1
PH
3655
3656 if (repeat_max == 1)
3657 {
3658 *code++ = OP_QUERY + repeat_type;
3659 }
3660 else
3661 {
3662 *code++ = OP_UPTO + repeat_type;
3663 PUT2INC(code, 0, repeat_max);
3664 }
8ac170f3
PH
3665 }
3666 }
3667
3668 /* The character or character type itself comes last in all cases. */
3669
3670#ifdef SUPPORT_UTF8
3671 if (utf8 && c >= 128)
3672 {
3673 memcpy(code, utf8_char, c & 7);
3674 code += c & 7;
3675 }
3676 else
3677#endif
3678 *code++ = c;
3679
aa41d2de
PH
3680 /* For a repeated Unicode property match, there are two extra bytes that
3681 define the required property. */
8ac170f3
PH
3682
3683#ifdef SUPPORT_UCP
aa41d2de
PH
3684 if (prop_type >= 0)
3685 {
3686 *code++ = prop_type;
3687 *code++ = prop_value;
3688 }
8ac170f3
PH
3689#endif
3690 }
3691
3692 /* If previous was a character class or a back reference, we put the repeat
3693 stuff after it, but just skip the item if the repeat was {0,0}. */
3694
3695 else if (*previous == OP_CLASS ||
3696 *previous == OP_NCLASS ||
3697#ifdef SUPPORT_UTF8
3698 *previous == OP_XCLASS ||
3699#endif
3700 *previous == OP_REF)
3701 {
3702 if (repeat_max == 0)
3703 {
3704 code = previous;
3705 goto END_REPEAT;
3706 }
3707
3708 /* All real repeats make it impossible to handle partial matching (maybe
3709 one day we will be able to remove this restriction). */
3710
47db1125 3711 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
8ac170f3
PH
3712
3713 if (repeat_min == 0 && repeat_max == -1)
3714 *code++ = OP_CRSTAR + repeat_type;
3715 else if (repeat_min == 1 && repeat_max == -1)
3716 *code++ = OP_CRPLUS + repeat_type;
3717 else if (repeat_min == 0 && repeat_max == 1)
3718 *code++ = OP_CRQUERY + repeat_type;
3719 else
3720 {
3721 *code++ = OP_CRRANGE + repeat_type;
3722 PUT2INC(code, 0, repeat_min);
3723 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3724 PUT2INC(code, 0, repeat_max);
3725 }
3726 }
3727
3728 /* If previous was a bracket group, we may have to replicate it in certain
3729 cases. */
3730
6bf342e1
PH
3731 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3732 *previous == OP_ONCE || *previous == OP_COND)
8ac170f3
PH
3733 {
3734 register int i;
3735 int ketoffset = 0;
3736 int len = code - previous;
3737 uschar *bralink = NULL;
3738
6bf342e1
PH
3739 /* Repeating a DEFINE group is pointless */
3740
3741 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3742 {
3743 *errorcodeptr = ERR55;
3744 goto FAILED;
3745 }
3746
8ac170f3
PH
3747 /* If the maximum repeat count is unlimited, find the end of the bracket
3748 by scanning through from the start, and compute the offset back to it
3749 from the current code pointer. There may be an OP_OPT setting following
3750 the final KET, so we can't find the end just by going back from the code
3751 pointer. */
3752
3753 if (repeat_max == -1)
3754 {
3755 register uschar *ket = previous;
3756 do ket += GET(ket, 1); while (*ket != OP_KET);
3757 ketoffset = code - ket;
3758 }
3759
3760 /* The case of a zero minimum is special because of the need to stick
3761 OP_BRAZERO in front of it, and because the group appears once in the
3762 data, whereas in other cases it appears the minimum number of times. For
3763 this reason, it is simplest to treat this case separately, as otherwise
3764 the code gets far too messy. There are several special subcases when the
3765 minimum is zero. */
3766
3767 if (repeat_min == 0)
3768 {
3769 /* If the maximum is also zero, we just omit the group from the output
3770 altogether. */
3771
3772 if (repeat_max == 0)
3773 {
3774 code = previous;
3775 goto END_REPEAT;
3776 }
3777
3778 /* If the maximum is 1 or unlimited, we just have to stick in the
3779 BRAZERO and do no more at this point. However, we do need to adjust
3780 any OP_RECURSE calls inside the group that refer to the group itself or
6bf342e1
PH
3781 any internal or forward referenced group, because the offset is from
3782 the start of the whole regex. Temporarily terminate the pattern while
3783 doing this. */
8ac170f3
PH
3784
3785 if (repeat_max <= 1)
3786 {
3787 *code = OP_END;
6bf342e1 3788 adjust_recurse(previous, 1, utf8, cd, save_hwm);
8ac170f3
PH
3789 memmove(previous+1, previous, len);
3790 code++;
3791 *previous++ = OP_BRAZERO + repeat_type;
3792 }
3793
3794 /* If the maximum is greater than 1 and limited, we have to replicate
3795 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3796 The first one has to be handled carefully because it's the original
3797 copy, which has to be moved up. The remainder can be handled by code
3798 that is common with the non-zero minimum case below. We have to
3799 adjust the value of repeat_max, since one less copy is required. Once
3800 again, we may have to adjust any OP_RECURSE calls inside the group. */
3801
3802 else
3803 {
3804 int offset;
3805 *code = OP_END;
6bf342e1 3806 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
8ac170f3
PH
3807 memmove(previous + 2 + LINK_SIZE, previous, len);
3808 code += 2 + LINK_SIZE;
3809 *previous++ = OP_BRAZERO + repeat_type;
3810 *previous++ = OP_BRA;
3811
3812 /* We chain together the bracket offset fields that have to be
3813 filled in later when the ends of the brackets are reached. */
3814
3815 offset = (bralink == NULL)? 0 : previous - bralink;
3816 bralink = previous;
3817 PUTINC(previous, 0, offset);
3818 }
3819
3820 repeat_max--;
3821 }
3822
3823 /* If the minimum is greater than zero, replicate the group as many
3824 times as necessary, and adjust the maximum to the number of subsequent
3825 copies that we need. If we set a first char from the group, and didn't
6bf342e1
PH
3826 set a required char, copy the latter from the former. If there are any
3827 forward reference subroutine calls in the group, there will be entries on
3828 the workspace list; replicate these with an appropriate increment. */
8ac170f3
PH
3829
3830 else
3831 {
3832 if (repeat_min > 1)
3833 {
6bf342e1 3834 /* In the pre-compile phase, we don't actually do the replication. We
47db1125
NM
3835 just adjust the length as if we had. Do some paranoid checks for
3836 potential integer overflow. */
6bf342e1
PH
3837
3838 if (lengthptr != NULL)
47db1125
NM
3839 {
3840 int delta = (repeat_min - 1)*length_prevgroup;
3841 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3842 (double)INT_MAX ||
3843 OFLOW_MAX - *lengthptr < delta)
3844 {
3845 *errorcodeptr = ERR20;
3846 goto FAILED;
3847 }
3848 *lengthptr += delta;
3849 }
6bf342e1
PH
3850
3851 /* This is compiling for real */
3852
3853 else
8ac170f3 3854 {
6bf342e1
PH
3855 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3856 for (i = 1; i < repeat_min; i++)
3857 {
3858 uschar *hc;
3859 uschar *this_hwm = cd->hwm;
3860 memcpy(code, previous, len);
3861 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3862 {
3863 PUT(cd->hwm, 0, GET(hc, 0) + len);
3864 cd->hwm += LINK_SIZE;
3865 }
3866 save_hwm = this_hwm;
3867 code += len;
3868 }
8ac170f3
PH
3869 }
3870 }
6bf342e1 3871
8ac170f3
PH
3872 if (repeat_max > 0) repeat_max -= repeat_min;
3873 }
3874
3875 /* This code is common to both the zero and non-zero minimum cases. If
3876 the maximum is limited, it replicates the group in a nested fashion,
3877 remembering the bracket starts on a stack. In the case of a zero minimum,
3878 the first one was set up above. In all cases the repeat_max now specifies
6bf342e1
PH
3879 the number of additional copies needed. Again, we must remember to
3880 replicate entries on the forward reference list. */
8ac170f3
PH
3881
3882 if (repeat_max >= 0)
3883 {
6bf342e1
PH
3884 /* In the pre-compile phase, we don't actually do the replication. We
3885 just adjust the length as if we had. For each repetition we must add 1
3886 to the length for BRAZERO and for all but the last repetition we must
47db1125
NM
3887 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3888 paranoid checks to avoid integer overflow. */
6bf342e1
PH
3889
3890 if (lengthptr != NULL && repeat_max > 0)
47db1125
NM
3891 {
3892 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3893 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3894 if ((double)repeat_max *
3895 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3896 > (double)INT_MAX ||
3897 OFLOW_MAX - *lengthptr < delta)
3898 {
3899 *errorcodeptr = ERR20;
3900 goto FAILED;
3901 }
3902 *lengthptr += delta;
3903 }
6bf342e1
PH
3904
3905 /* This is compiling for real */
3906
3907 else for (i = repeat_max - 1; i >= 0; i--)
8ac170f3 3908 {
6bf342e1
PH
3909 uschar *hc;
3910 uschar *this_hwm = cd->hwm;
3911
8ac170f3
PH
3912 *code++ = OP_BRAZERO + repeat_type;
3913
3914 /* All but the final copy start a new nesting, maintaining the
3915 chain of brackets outstanding. */
3916
3917 if (i != 0)
3918 {
3919 int offset;
3920 *code++ = OP_BRA;
3921 offset = (bralink == NULL)? 0 : code - bralink;
3922 bralink = code;
3923 PUTINC(code, 0, offset);
3924 }
3925
3926 memcpy(code, previous, len);
6bf342e1
PH
3927 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3928 {
3929 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3930 cd->hwm += LINK_SIZE;
3931 }
3932 save_hwm = this_hwm;
8ac170f3
PH
3933 code += len;
3934 }
3935
3936 /* Now chain through the pending brackets, and fill in their length
3937 fields (which are holding the chain links pro tem). */
3938
3939 while (bralink != NULL)
3940 {
3941 int oldlinkoffset;
3942 int offset = code - bralink + 1;
3943 uschar *bra = code - offset;
3944 oldlinkoffset = GET(bra, 1);
3945 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3946 *code++ = OP_KET;
3947 PUTINC(code, 0, offset);
3948 PUT(bra, 1, offset);
3949 }
3950 }
3951
3952 /* If the maximum is unlimited, set a repeater in the final copy. We
3953 can't just offset backwards from the current code point, because we
3954 don't know if there's been an options resetting after the ket. The
6bf342e1 3955 correct offset was computed above.
8ac170f3 3956
6bf342e1
PH
3957 Then, when we are doing the actual compile phase, check to see whether
3958 this group is a non-atomic one that could match an empty string. If so,
3959 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3960 that runtime checking can be done. [This check is also applied to
3961 atomic groups at runtime, but in a different way.] */
3962
3963 else
3964 {
3965 uschar *ketcode = code - ketoffset;
3966 uschar *bracode = ketcode - GET(ketcode, 1);
3967 *ketcode = OP_KETRMAX + repeat_type;
3968 if (lengthptr == NULL && *bracode != OP_ONCE)
3969 {
3970 uschar *scode = bracode;
3971 do
3972 {
3973 if (could_be_empty_branch(scode, ketcode, utf8))
3974 {
3975 *bracode += OP_SBRA - OP_BRA;
3976 break;
3977 }
3978 scode += GET(scode, 1);
3979 }
3980 while (*scode == OP_ALT);
3981 }
3982 }
8ac170f3
PH
3983 }
3984
3985 /* Else there's some kind of shambles */
3986
3987 else
3988 {
3989 *errorcodeptr = ERR11;
3990 goto FAILED;
3991 }
3992
6bf342e1
PH
3993 /* If the character following a repeat is '+', or if certain optimization
3994 tests above succeeded, possessive_quantifier is TRUE. For some of the
3995 simpler opcodes, there is a special alternative opcode for this. For
3996 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3997 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3998 but the special opcodes can optimize it a bit. The repeated item starts at
3999 tempcode, not at previous, which might be the first part of a string whose
4000 (former) last char we repeated.
4001
4002 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4003 an 'upto' may follow. We skip over an 'exact' item, and then test the
4004 length of what remains before proceeding. */
8ac170f3
PH
4005
4006 if (possessive_quantifier)
4007 {
6bf342e1
PH
4008 int len;
4009 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4010 *tempcode == OP_NOTEXACT)
4011 tempcode += _pcre_OP_lengths[*tempcode];
4012 len = code - tempcode;
4013 if (len > 0) switch (*tempcode)
4014 {
4015 case OP_STAR: *tempcode = OP_POSSTAR; break;
4016 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4017 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4018 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4019
4020 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4021 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4022 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4023 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4024
4025 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4026 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4027 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4028 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4029
4030 default:
4031 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4032 code += 1 + LINK_SIZE;
4033 len += 1 + LINK_SIZE;
4034 tempcode[0] = OP_ONCE;
4035 *code++ = OP_KET;
4036 PUTINC(code, 0, len);
4037 PUT(tempcode, 1, len);
4038 break;
4039 }
8ac170f3
PH
4040 }
4041
4042 /* In all cases we no longer have a previous item. We also set the
4043 "follows varying string" flag for subsequently encountered reqbytes if
4044 it isn't already set and we have just passed a varying length item. */
4045
4046 END_REPEAT:
4047 previous = NULL;
4048 cd->req_varyopt |= reqvary;
4049 break;
4050
4051
6bf342e1
PH
4052 /* ===================================================================*/
4053 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4054 lookbehind or option setting or condition or all the other extended
47db1125 4055 parenthesis forms. */
8ac170f3
PH
4056
4057 case '(':
4058 newoptions = options;
4059 skipbytes = 0;
6bf342e1
PH
4060 bravalue = OP_CBRA;
4061 save_hwm = cd->hwm;
64f2600a 4062 reset_bracount = FALSE;
8ac170f3 4063
47db1125
NM
4064 /* First deal with various "verbs" that can be introduced by '*'. */
4065
4066 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4067 {
4068 int i, namelen;
4069 const char *vn = verbnames;
4070 const uschar *name = ++ptr;
4071 previous = NULL;
4072 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4073 if (*ptr == ':')
4074 {
4075 *errorcodeptr = ERR59; /* Not supported */
4076 goto FAILED;
4077 }
4078 if (*ptr != ')')
4079 {
4080 *errorcodeptr = ERR60;
4081 goto FAILED;
4082 }
4083 namelen = ptr - name;
4084 for (i = 0; i < verbcount; i++)
4085 {
4086 if (namelen == verbs[i].len &&
4087 strncmp((char *)name, vn, namelen) == 0)
4088 {
4089 *code = verbs[i].op;
4090 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4091 break;
4092 }
4093 vn += verbs[i].len + 1;
4094 }
4095 if (i < verbcount) continue;
4096 *errorcodeptr = ERR60;
4097 goto FAILED;
4098 }
4099
4100 /* Deal with the extended parentheses; all are introduced by '?', and the
4101 appearance of any of them means that this is not a capturing group. */
4102
4103 else if (*ptr == '?')
8ac170f3 4104 {
6bf342e1 4105 int i, set, unset, namelen;
8ac170f3 4106 int *optset;
6bf342e1
PH
4107 const uschar *name;
4108 uschar *slot;
8ac170f3
PH
4109
4110 switch (*(++ptr))
4111 {
4112 case '#': /* Comment; skip to ket */
4113 ptr++;
6bf342e1
PH
4114 while (*ptr != 0 && *ptr != ')') ptr++;
4115 if (*ptr == 0)
4116 {
4117 *errorcodeptr = ERR18;
4118 goto FAILED;
4119 }
8ac170f3
PH
4120 continue;
4121
6bf342e1 4122
64f2600a
PH
4123 /* ------------------------------------------------------------ */
4124 case '|': /* Reset capture count for each branch */
4125 reset_bracount = TRUE;
4126 /* Fall through */
4127
6bf342e1
PH
4128 /* ------------------------------------------------------------ */
4129 case ':': /* Non-capturing bracket */
8ac170f3
PH
4130 bravalue = OP_BRA;
4131 ptr++;
4132 break;
4133
6bf342e1
PH
4134
4135 /* ------------------------------------------------------------ */
8ac170f3
PH
4136 case '(':
4137 bravalue = OP_COND; /* Conditional group */
4138
6bf342e1
PH
4139 /* A condition can be an assertion, a number (referring to a numbered
4140 group), a name (referring to a named group), or 'R', referring to
4141 recursion. R<digits> and R&name are also permitted for recursion tests.
4142
4143 There are several syntaxes for testing a named group: (?(name)) is used
4144 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4145
4146 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4147 be the recursive thing or the name 'R' (and similarly for 'R' followed
4148 by digits), and (b) a number could be a name that consists of digits.
4149 In both cases, we look for a name first; if not found, we try the other
4150 cases. */
4151
4152 /* For conditions that are assertions, check the syntax, and then exit
4153 the switch. This will take control down to where bracketed groups,
4154 including assertions, are processed. */
4155
4156 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4157 break;
4158
4159 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4160 below), and all need to skip 3 bytes at the start of the group. */
4161
4162 code[1+LINK_SIZE] = OP_CREF;
4163 skipbytes = 3;
64f2600a 4164 refsign = -1;
6bf342e1
PH
4165
4166 /* Check for a test for recursion in a named group. */
4167
4168 if (ptr[1] == 'R' && ptr[2] == '&')
8ac170f3 4169 {
6bf342e1
PH
4170 terminator = -1;
4171 ptr += 2;
4172 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4173 }
aa41d2de 4174
6bf342e1
PH
4175 /* Check for a test for a named group's having been set, using the Perl
4176 syntax (?(<name>) or (?('name') */
aa41d2de 4177
6bf342e1
PH
4178 else if (ptr[1] == '<')
4179 {
4180 terminator = '>';
4181 ptr++;
4182 }
4183 else if (ptr[1] == '\'')
4184 {
4185 terminator = '\'';
4186 ptr++;
4187 }
64f2600a
PH
4188 else
4189 {
4190 terminator = 0;
4191 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4192 }
8ac170f3 4193
6bf342e1 4194 /* We now expect to read a name; anything else is an error */
8ac170f3 4195
6bf342e1
PH
4196 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4197 {
4198 ptr += 1; /* To get the right offset */
4199 *errorcodeptr = ERR28;
4200 goto FAILED;
4201 }
4202
4203 /* Read the name, but also get it as a number if it's all digits */
4204
4205 recno = 0;
4206 name = ++ptr;
4207 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4208 {
4209 if (recno >= 0)
4210 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4211 recno * 10 + *ptr - '0' : -1;
8ac170f3 4212 ptr++;
6bf342e1
PH
4213 }
4214 namelen = ptr - name;
aa41d2de 4215
6bf342e1
PH
4216 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4217 {
4218 ptr--; /* Error offset */
4219 *errorcodeptr = ERR26;
4220 goto FAILED;
4221 }
aa41d2de 4222
6bf342e1 4223 /* Do no further checking in the pre-compile phase. */
aa41d2de 4224
6bf342e1 4225 if (lengthptr != NULL) break;
aa41d2de 4226
6bf342e1 4227 /* In the real compile we do the work of looking for the actual
64f2600a
PH
4228 reference. If the string started with "+" or "-" we require the rest to
4229 be digits, in which case recno will be set. */
4230
4231 if (refsign > 0)
4232 {
4233 if (recno <= 0)
4234 {
4235 *errorcodeptr = ERR58;
4236 goto FAILED;
4237 }
4238 if (refsign == '-')
4239 {
4240 recno = cd->bracount - recno + 1;
4241 if (recno <= 0)
4242 {
4243 *errorcodeptr = ERR15;
4244 goto FAILED;
4245 }
4246 }
4247 else recno += cd->bracount;
4248 PUT2(code, 2+LINK_SIZE, recno);
4249 break;
4250 }
4251
4252 /* Otherwise (did not start with "+" or "-"), start by looking for the
4253 name. */
aa41d2de 4254
6bf342e1
PH
4255 slot = cd->name_table;
4256 for (i = 0; i < cd->names_found; i++)
4257 {
4258 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4259 slot += cd->name_entry_size;
4260 }
aa41d2de 4261
6bf342e1 4262 /* Found a previous named subpattern */
aa41d2de 4263
6bf342e1
PH
4264 if (i < cd->names_found)
4265 {
4266 recno = GET2(slot, 0);
4267 PUT2(code, 2+LINK_SIZE, recno);
4268 }
aa41d2de 4269
6bf342e1 4270 /* Search the pattern for a forward reference */
aa41d2de 4271
6bf342e1
PH
4272 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4273 (options & PCRE_EXTENDED) != 0)) > 0)
4274 {
4275 PUT2(code, 2+LINK_SIZE, i);
4276 }
aa41d2de 4277
6bf342e1
PH
4278 /* If terminator == 0 it means that the name followed directly after
4279 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4280 some further alternatives to try. For the cases where terminator != 0
4281 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4282 now checked all the possibilities, so give an error. */
aa41d2de 4283
6bf342e1
PH
4284 else if (terminator != 0)
4285 {
4286 *errorcodeptr = ERR15;
4287 goto FAILED;
4288 }
4289
4290 /* Check for (?(R) for recursion. Allow digits after R to specify a
4291 specific group number. */
4292
4293 else if (*name == 'R')
4294 {
4295 recno = 0;
4296 for (i = 1; i < namelen; i++)
aa41d2de 4297 {
6bf342e1
PH
4298 if ((digitab[name[i]] & ctype_digit) == 0)
4299 {
4300 *errorcodeptr = ERR15;
4301 goto FAILED;
4302 }
4303 recno = recno * 10 + name[i] - '0';
aa41d2de 4304 }
6bf342e1
PH
4305 if (recno == 0) recno = RREF_ANY;
4306 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4307 PUT2(code, 2+LINK_SIZE, recno);
4308 }
4309
4310 /* Similarly, check for the (?(DEFINE) "condition", which is always
4311 false. */
4312
4313 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4314 {
4315 code[1+LINK_SIZE] = OP_DEF;
4316 skipbytes = 1;
4317 }
4318
4319 /* Check for the "name" actually being a subpattern number. */
4320
4321 else if (recno > 0)
4322 {
4323 PUT2(code, 2+LINK_SIZE, recno);
8ac170f3 4324 }
aa41d2de 4325
6bf342e1 4326 /* Either an unidentified subpattern, or a reference to (?(0) */
aa41d2de 4327
6bf342e1
PH
4328 else
4329 {
4330 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4331 goto FAILED;
4332 }
8ac170f3
PH
4333 break;
4334
6bf342e1
PH
4335
4336 /* ------------------------------------------------------------ */
8ac170f3
PH
4337 case '=': /* Positive lookahead */
4338 bravalue = OP_ASSERT;
4339 ptr++;
4340 break;
4341
6bf342e1
PH
4342
4343 /* ------------------------------------------------------------ */
8ac170f3 4344 case '!': /* Negative lookahead */
8ac170f3 4345 ptr++;
47db1125
NM
4346 if (*ptr == ')') /* Optimize (?!) */
4347 {
4348 *code++ = OP_FAIL;
4349 previous = NULL;
4350 continue;
4351 }
4352 bravalue = OP_ASSERT_NOT;
8ac170f3
PH
4353 break;
4354
6bf342e1
PH
4355
4356 /* ------------------------------------------------------------ */
4357 case '<': /* Lookbehind or named define */
4358 switch (ptr[1])
8ac170f3
PH
4359 {
4360 case '=': /* Positive lookbehind */
4361 bravalue = OP_ASSERTBACK;
6bf342e1 4362 ptr += 2;
8ac170f3
PH
4363 break;
4364
4365 case '!': /* Negative lookbehind */
4366 bravalue = OP_ASSERTBACK_NOT;
6bf342e1 4367 ptr += 2;
8ac170f3 4368 break;
6bf342e1
PH
4369
4370 default: /* Could be name define, else bad */
4371 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4372 ptr++; /* Correct offset for error */
4373 *errorcodeptr = ERR24;
4374 goto FAILED;
8ac170f3
PH
4375 }
4376 break;
4377
6bf342e1
PH
4378
4379 /* ------------------------------------------------------------ */
8ac170f3
PH
4380 case '>': /* One-time brackets */
4381 bravalue = OP_ONCE;
4382 ptr++;
4383 break;
4384
6bf342e1
PH
4385
4386 /* ------------------------------------------------------------ */
8ac170f3
PH
4387 case 'C': /* Callout - may be followed by digits; */
4388 previous_callout = code; /* Save for later completion */
4389 after_manual_callout = 1; /* Skip one item before completing */
6bf342e1
PH
4390 *code++ = OP_CALLOUT;
4391 {
8ac170f3
PH
4392 int n = 0;
4393 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4394 n = n * 10 + *ptr - '0';
6bf342e1
PH
4395 if (*ptr != ')')
4396 {
4397 *errorcodeptr = ERR39;
4398 goto FAILED;
4399 }
8ac170f3
PH
4400 if (n > 255)
4401 {
4402 *errorcodeptr = ERR38;
4403 goto FAILED;
4404 }
4405 *code++ = n;
4406 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4407 PUT(code, LINK_SIZE, 0); /* Default length */
4408 code += 2 * LINK_SIZE;
4409 }
4410 previous = NULL;
4411 continue;
4412
6bf342e1
PH
4413
4414 /* ------------------------------------------------------------ */
4415 case 'P': /* Python-style named subpattern handling */
4416 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4417 {
4418 is_recurse = *ptr == '>';
4419 terminator = ')';
4420 goto NAMED_REF_OR_RECURSE;
4421 }
4422 else if (*ptr != '<') /* Test for Python-style definition */
8ac170f3 4423 {
6bf342e1
PH
4424 *errorcodeptr = ERR41;
4425 goto FAILED;
4426 }
4427 /* Fall through to handle (?P< as (?< is handled */
8ac170f3 4428
8ac170f3 4429
6bf342e1
PH
4430 /* ------------------------------------------------------------ */
4431 DEFINE_NAME: /* Come here from (?< handling */
4432 case '\'':
4433 {
4434 terminator = (*ptr == '<')? '>' : '\'';
4435 name = ++ptr;
4436
4437 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4438 namelen = ptr - name;
4439
4440 /* In the pre-compile phase, just do a syntax check. */
4441
4442 if (lengthptr != NULL)
8ac170f3 4443 {
6bf342e1
PH
4444 if (*ptr != terminator)
4445 {
4446 *errorcodeptr = ERR42;
4447 goto FAILED;
4448 }
4449 if (cd->names_found >= MAX_NAME_COUNT)
4450 {
4451 *errorcodeptr = ERR49;
4452 goto FAILED;
4453 }
4454 if (namelen + 3 > cd->name_entry_size)
8ac170f3 4455 {
6bf342e1
PH
4456 cd->name_entry_size = namelen + 3;
4457 if (namelen > MAX_NAME_SIZE)
8ac170f3 4458 {
6bf342e1
PH
4459 *errorcodeptr = ERR48;
4460 goto FAILED;
8ac170f3 4461 }
8ac170f3 4462 }
6bf342e1
PH
4463 }
4464
4465 /* In the real compile, create the entry in the table */
4466
4467 else
4468 {
4469 slot = cd->name_table;
4470 for (i = 0; i < cd->names_found; i++)
8ac170f3 4471 {
6bf342e1
PH
4472 int crc = memcmp(name, slot+2, namelen);
4473 if (crc == 0)
4474 {
4475 if (slot[2+namelen] == 0)
4476 {
4477 if ((options & PCRE_DUPNAMES) == 0)
4478 {
4479 *errorcodeptr = ERR43;
4480 goto FAILED;
4481 }
4482 }
4483 else crc = -1; /* Current name is substring */
4484 }
4485 if (crc < 0)
4486 {
4487 memmove(slot + cd->name_entry_size, slot,
4488 (cd->names_found - i) * cd->name_entry_size);
4489 break;
4490 }
4491 slot += cd->name_entry_size;
8ac170f3 4492 }
8ac170f3 4493
6bf342e1
PH
4494 PUT2(slot, 0, cd->bracount + 1);
4495 memcpy(slot + 2, name, namelen);
4496 slot[2+namelen] = 0;
4497 }
8ac170f3
PH
4498 }
4499
6bf342e1 4500 /* In both cases, count the number of names we've encountered. */
8ac170f3 4501
6bf342e1
PH
4502 ptr++; /* Move past > or ' */
4503 cd->names_found++;
4504 goto NUMBERED_GROUP;
8ac170f3 4505
6bf342e1
PH
4506
4507 /* ------------------------------------------------------------ */
4508 case '&': /* Perl recursion/subroutine syntax */
4509 terminator = ')';
4510 is_recurse = TRUE;
4511 /* Fall through */
4512
4513 /* We come here from the Python syntax above that handles both
4514 references (?P=name) and recursion (?P>name), as well as falling
4515 through from the Perl recursion syntax (?&name). */
4516
4517 NAMED_REF_OR_RECURSE:
4518 name = ++ptr;
4519 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4520 namelen = ptr - name;
4521
4522 /* In the pre-compile phase, do a syntax check and set a dummy
4523 reference number. */
4524
4525 if (lengthptr != NULL)
4526 {
4527 if (*ptr != terminator)
4528 {
4529 *errorcodeptr = ERR42;
4530 goto FAILED;
4531 }
4532 if (namelen > MAX_NAME_SIZE)
4533 {
4534 *errorcodeptr = ERR48;
4535 goto FAILED;
4536 }
4537 recno = 0;
4538 }
4539
4540 /* In the real compile, seek the name in the table */
4541
4542 else
4543 {
4544 slot = cd->name_table;
4545 for (i = 0; i < cd->names_found; i++)
4546 {
4547 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4548 slot += cd->name_entry_size;
4549 }
aa41d2de
PH
4550
4551 if (i < cd->names_found) /* Back reference */
4552 {
4553 recno = GET2(slot, 0);
4554 }
4555 else if ((recno = /* Forward back reference */
6bf342e1
PH
4556 find_parens(ptr, cd->bracount, name, namelen,
4557 (options & PCRE_EXTENDED) != 0)) <= 0)
8ac170f3
PH
4558 {
4559 *errorcodeptr = ERR15;
4560 goto FAILED;
4561 }
6bf342e1 4562 }
8ac170f3 4563
6bf342e1
PH
4564 /* In both phases, we can now go to the code that handles numerical
4565 recursion or backreferences. */
8ac170f3 4566
6bf342e1
PH
4567 if (is_recurse) goto HANDLE_RECURSION;
4568 else goto HANDLE_REFERENCE;
8ac170f3 4569
8ac170f3 4570
6bf342e1
PH
4571 /* ------------------------------------------------------------ */
4572 case 'R': /* Recursion */
8ac170f3
PH
4573 ptr++; /* Same as (?0) */
4574 /* Fall through */
4575
8ac170f3 4576
6bf342e1 4577 /* ------------------------------------------------------------ */
64f2600a 4578 case '-': case '+':
6bf342e1
PH
4579 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4580 case '5': case '6': case '7': case '8': case '9': /* subroutine */
8ac170f3
PH
4581 {
4582 const uschar *called;
64f2600a
PH
4583
4584 if ((refsign = *ptr) == '+') ptr++;
4585 else if (refsign == '-')
4586 {
4587 if ((digitab[ptr[1]] & ctype_digit) == 0)
4588 goto OTHER_CHAR_AFTER_QUERY;
4589 ptr++;
4590 }
4591
8ac170f3
PH
4592 recno = 0;
4593 while((digitab[*ptr] & ctype_digit) != 0)
4594 recno = recno * 10 + *ptr++ - '0';
64f2600a 4595
6bf342e1
PH
4596 if (*ptr != ')')
4597 {
4598 *errorcodeptr = ERR29;
4599 goto FAILED;
4600 }
8ac170f3 4601
64f2600a
PH
4602 if (refsign == '-')
4603 {
4604 if (recno == 0)
4605 {
4606 *errorcodeptr = ERR58;
4607 goto FAILED;
4608 }
4609 recno = cd->bracount - recno + 1;
4610 if (recno <= 0)
4611 {
4612 *errorcodeptr = ERR15;
4613 goto FAILED;
4614 }
4615 }
4616 else if (refsign == '+')
4617 {
4618 if (recno == 0)
4619 {
4620 *errorcodeptr = ERR58;
4621 goto FAILED;
4622 }
4623 recno += cd->bracount;
4624 }
4625
8ac170f3
PH
4626 /* Come here from code above that handles a named recursion */
4627
4628 HANDLE_RECURSION:
4629
4630 previous = code;
6bf342e1 4631 called = cd->start_code;
8ac170f3 4632
6bf342e1
PH
4633 /* When we are actually compiling, find the bracket that is being
4634 referenced. Temporarily end the regex in case it doesn't exist before
4635 this point. If we end up with a forward reference, first check that
4636 the bracket does occur later so we can give the error (and position)
4637 now. Then remember this forward reference in the workspace so it can
4638 be filled in at the end. */
8ac170f3 4639
6bf342e1 4640 if (lengthptr == NULL)
8ac170f3 4641 {
6bf342e1
PH
4642 *code = OP_END;
4643 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
8ac170f3 4644
6bf342e1 4645 /* Forward reference */
8ac170f3 4646
6bf342e1
PH
4647 if (called == NULL)
4648 {
4649 if (find_parens(ptr, cd->bracount, NULL, recno,
4650 (options & PCRE_EXTENDED) != 0) < 0)
4651 {
4652 *errorcodeptr = ERR15;
4653 goto FAILED;
4654 }
4655 called = cd->start_code + recno;
4656 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4657 }
4658
4659 /* If not a forward reference, and the subpattern is still open,
4660 this is a recursive call. We check to see if this is a left
4661 recursion that could loop for ever, and diagnose that case. */
4662
4663 else if (GET(called, 1) == 0 &&
4664 could_be_empty(called, code, bcptr, utf8))
4665 {
4666 *errorcodeptr = ERR40;
4667 goto FAILED;
4668 }
8ac170f3
PH
4669 }
4670
aa41d2de 4671 /* Insert the recursion/subroutine item, automatically wrapped inside
6bf342e1
PH
4672 "once" brackets. Set up a "previous group" length so that a
4673 subsequent quantifier will work. */
aa41d2de
PH
4674
4675 *code = OP_ONCE;
4676 PUT(code, 1, 2 + 2*LINK_SIZE);
4677 code += 1 + LINK_SIZE;
8ac170f3
PH
4678
4679 *code = OP_RECURSE;
4680 PUT(code, 1, called - cd->start_code);
4681 code += 1 + LINK_SIZE;
aa41d2de
PH
4682
4683 *code = OP_KET;
4684 PUT(code, 1, 2 + 2*LINK_SIZE);
4685 code += 1 + LINK_SIZE;
6bf342e1
PH
4686
4687 length_prevgroup = 3 + 3*LINK_SIZE;
8ac170f3 4688 }
6bf342e1
PH
4689
4690 /* Can't determine a first byte now */
4691
4692 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
8ac170f3
PH
4693 continue;
4694
8ac170f3 4695
6bf342e1
PH
4696 /* ------------------------------------------------------------ */
4697 default: /* Other characters: check option setting */
64f2600a 4698 OTHER_CHAR_AFTER_QUERY:
8ac170f3
PH
4699 set = unset = 0;
4700 optset = &set;
4701
4702 while (*ptr != ')' && *ptr != ':')
4703 {
4704 switch (*ptr++)
4705 {
4706 case '-': optset = &unset; break;
4707
6bf342e1
PH
4708 case 'J': /* Record that it changed in the external options */
4709 *optset |= PCRE_DUPNAMES;
47db1125 4710 cd->external_flags |= PCRE_JCHANGED;
6bf342e1
PH
4711 break;
4712
8ac170f3
PH
4713 case 'i': *optset |= PCRE_CASELESS; break;
4714 case 'm': *optset |= PCRE_MULTILINE; break;
4715 case 's': *optset |= PCRE_DOTALL; break;
4716 case 'x': *optset |= PCRE_EXTENDED; break;
4717 case 'U': *optset |= PCRE_UNGREEDY; break;
4718 case 'X': *optset |= PCRE_EXTRA; break;
6bf342e1
PH
4719
4720 default: *errorcodeptr = ERR12;
4721 ptr--; /* Correct the offset */
4722 goto FAILED;
8ac170f3
PH
4723 }
4724 }
4725
4726 /* Set up the changed option bits, but don't change anything yet. */
4727
4728 newoptions = (options | set) & (~unset);
4729
4730 /* If the options ended with ')' this is not the start of a nested
6bf342e1
PH
4731 group with option changes, so the options change at this level. If this
4732 item is right at the start of the pattern, the options can be
4733 abstracted and made external in the pre-compile phase, and ignored in
4734 the compile phase. This can be helpful when matching -- for instance in
4735 caseless checking of required bytes.
4736
4737 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4738 definitely *not* at the start of the pattern because something has been
4739 compiled. In the pre-compile phase, however, the code pointer can have
4740 that value after the start, because it gets reset as code is discarded
4741 during the pre-compile. However, this can happen only at top level - if
4742 we are within parentheses, the starting BRA will still be present. At
4743 any parenthesis level, the length value can be used to test if anything
4744 has been compiled at that level. Thus, a test for both these conditions
4745 is necessary to ensure we correctly detect the start of the pattern in
4746 both phases.
4747
4748 If we are not at the pattern start, compile code to change the ims
4749 options if this setting actually changes any of them. We also pass the
4750 new setting back so that it can be put at the start of any following
4751 branches, and when this group ends (if we are in a group), a resetting
4752 item can be compiled. */
8ac170f3
PH
4753
4754 if (*ptr == ')')
4755 {
6bf342e1
PH
4756 if (code == cd->start_code + 1 + LINK_SIZE &&
4757 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
8ac170f3 4758 {
6bf342e1
PH
4759 cd->external_options = newoptions;
4760 options = newoptions;
8ac170f3 4761 }
6bf342e1
PH
4762 else
4763 {
4764 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4765 {
4766 *code++ = OP_OPT;
4767 *code++ = newoptions & PCRE_IMS;
4768 }
8ac170f3 4769
6bf342e1
PH
4770 /* Change options at this level, and pass them back for use
4771 in subsequent branches. Reset the greedy defaults and the case
4772 value for firstbyte and reqbyte. */
8ac170f3 4773
6bf342e1
PH
4774 *optionsptr = options = newoptions;
4775 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4776 greedy_non_default = greedy_default ^ 1;
4777 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4778 }
8ac170f3
PH
4779
4780 previous = NULL; /* This item can't be repeated */
4781 continue; /* It is complete */
4782 }
4783
4784 /* If the options ended with ':' we are heading into a nested group
4785 with possible change of options. Such groups are non-capturing and are
4786 not assertions of any kind. All we need to do is skip over the ':';
4787 the newoptions value is handled below. */
4788
4789 bravalue = OP_BRA;
4790 ptr++;
6bf342e1
PH
4791 } /* End of switch for character following (? */
4792 } /* End of (? handling */
8ac170f3 4793
6bf342e1
PH
4794 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4795 all unadorned brackets become non-capturing and behave like (?:...)
4796 brackets. */
8ac170f3
PH
4797
4798 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4799 {
4800 bravalue = OP_BRA;
4801 }
4802
6bf342e1 4803 /* Else we have a capturing group. */
8ac170f3
PH
4804
4805 else
4806 {
4807 NUMBERED_GROUP:
6bf342e1
PH
4808 cd->bracount += 1;
4809 PUT2(code, 1+LINK_SIZE, cd->bracount);
4810 skipbytes = 2;
8ac170f3
PH
4811 }
4812
6bf342e1
PH
4813 /* Process nested bracketed regex. Assertions may not be repeated, but
4814 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4815 non-register variable in order to be able to pass its address because some
4816 compilers complain otherwise. Pass in a new setting for the ims options if
4817 they have changed. */
8ac170f3
PH
4818
4819 previous = (bravalue >= OP_ONCE)? code : NULL;
4820 *code = bravalue;
4821 tempcode = code;
4822 tempreqvary = cd->req_varyopt; /* Save value before bracket */
6bf342e1 4823 length_prevgroup = 0; /* Initialize for pre-compile phase */
8ac170f3
PH
4824
4825 if (!compile_regex(
4826 newoptions, /* The complete new option state */
4827 options & PCRE_IMS, /* The previous ims option state */
8ac170f3
PH
4828 &tempcode, /* Where to put code (updated) */
4829 &ptr, /* Input pointer (updated) */
4830 errorcodeptr, /* Where to put an error message */
4831 (bravalue == OP_ASSERTBACK ||
4832 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
64f2600a 4833 reset_bracount, /* True if (?| group */
6bf342e1 4834 skipbytes, /* Skip over bracket number */
8ac170f3
PH
4835 &subfirstbyte, /* For possible first char */
4836 &subreqbyte, /* For possible last char */
4837 bcptr, /* Current branch chain */
6bf342e1
PH
4838 cd, /* Tables block */
4839 (lengthptr == NULL)? NULL : /* Actual compile phase */
4840 &length_prevgroup /* Pre-compile phase */
4841 ))
8ac170f3
PH
4842 goto FAILED;
4843
4844 /* At the end of compiling, code is still pointing to the start of the
4845 group, while tempcode has been updated to point past the end of the group
4846 and any option resetting that may follow it. The pattern pointer (ptr)
4847 is on the bracket. */
4848
4849 /* If this is a conditional bracket, check that there are no more than
64f2600a
PH
4850 two branches in the group, or just one if it's a DEFINE group. We do this
4851 in the real compile phase, not in the pre-pass, where the whole group may
4852 not be available. */
8ac170f3 4853
64f2600a 4854 if (bravalue == OP_COND && lengthptr == NULL)
8ac170f3
PH
4855 {
4856 uschar *tc = code;
aa41d2de 4857 int condcount = 0;
8ac170f3
PH
4858
4859 do {
4860 condcount++;
4861 tc += GET(tc,1);
4862 }
4863 while (*tc != OP_KET);
4864
6bf342e1
PH
4865 /* A DEFINE group is never obeyed inline (the "condition" is always
4866 false). It must have only one branch. */
4867
4868 if (code[LINK_SIZE+1] == OP_DEF)
8ac170f3 4869 {
6bf342e1
PH
4870 if (condcount > 1)
4871 {
4872 *errorcodeptr = ERR54;
4873 goto FAILED;
4874 }
4875 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4876 }
4877
4878 /* A "normal" conditional group. If there is just one branch, we must not
4879 make use of its firstbyte or reqbyte, because this is equivalent to an
4880 empty second branch. */
4881
4882 else
4883 {
4884 if (condcount > 2)
4885 {
4886 *errorcodeptr = ERR27;
4887 goto FAILED;
4888 }
4889 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
8ac170f3 4890 }
6bf342e1 4891 }
8ac170f3 4892
6bf342e1
PH
4893 /* Error if hit end of pattern */
4894
4895 if (*ptr != ')')
4896 {
4897 *errorcodeptr = ERR14;
4898 goto FAILED;
4899 }
4900
47db1125
NM
4901 /* In the pre-compile phase, update the length by the length of the group,
4902 less the brackets at either end. Then reduce the compiled code to just a
4903 set of non-capturing brackets so that it doesn't use much memory if it is
4904 duplicated by a quantifier. */
8ac170f3 4905
6bf342e1
PH
4906 if (lengthptr != NULL)
4907 {
47db1125
NM
4908 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4909 {
4910 *errorcodeptr = ERR20;
4911 goto FAILED;
4912 }
6bf342e1 4913 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
47db1125 4914 *code++ = OP_BRA;
6bf342e1
PH
4915 PUTINC(code, 0, 1 + LINK_SIZE);
4916 *code++ = OP_KET;
4917 PUTINC(code, 0, 1 + LINK_SIZE);
47db1125 4918 break; /* No need to waste time with special character handling */
8ac170f3
PH
4919 }
4920
6bf342e1
PH
4921 /* Otherwise update the main code pointer to the end of the group. */
4922
47db1125 4923 code = tempcode;
6bf342e1
PH
4924
4925 /* For a DEFINE group, required and first character settings are not
4926 relevant. */
4927
4928 if (bravalue == OP_DEF) break;
4929
4930 /* Handle updating of the required and first characters for other types of
4931 group. Update for normal brackets of all kinds, and conditions with two
4932 branches (see code above). If the bracket is followed by a quantifier with
4933 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4934 zerofirstbyte outside the main loop so that they can be accessed for the
4935 back off. */
8ac170f3
PH
4936
4937 zeroreqbyte = reqbyte;
4938 zerofirstbyte = firstbyte;
4939 groupsetfirstbyte = FALSE;
4940
6bf342e1 4941 if (bravalue >= OP_ONCE)
8ac170f3
PH
4942 {
4943 /* If we have not yet set a firstbyte in this branch, take it from the
4944 subpattern, remembering that it was set here so that a repeat of more
4945 than one can replicate it as reqbyte if necessary. If the subpattern has
4946 no firstbyte, set "none" for the whole branch. In both cases, a zero
4947 repeat forces firstbyte to "none". */
4948
4949 if (firstbyte == REQ_UNSET)
4950 {
4951 if (subfirstbyte >= 0)
4952 {
4953 firstbyte = subfirstbyte;
4954 groupsetfirstbyte = TRUE;
4955 }
4956 else firstbyte = REQ_NONE;
4957 zerofirstbyte = REQ_NONE;
4958 }
4959
4960 /* If firstbyte was previously set, convert the subpattern's firstbyte
4961 into reqbyte if there wasn't one, using the vary flag that was in
4962 existence beforehand. */
4963
4964 else if (subfirstbyte >= 0 && subreqbyte < 0)
4965 subreqbyte = subfirstbyte | tempreqvary;
4966
4967 /* If the subpattern set a required byte (or set a first byte that isn't
4968 really the first byte - see above), set it. */
4969
4970 if (subreqbyte >= 0) reqbyte = subreqbyte;
4971 }
4972
4973 /* For a forward assertion, we take the reqbyte, if set. This can be
4974 helpful if the pattern that follows the assertion doesn't set a different
4975 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4976 for an assertion, however, because it leads to incorrect results for patterns
4977 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4978 of a firstbyte. This is overcome by a scan at the end if there's no
4979 firstbyte, looking for an asserted first char. */
4980
4981 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
6bf342e1 4982 break; /* End of processing '(' */
8ac170f3 4983
8ac170f3 4984
6bf342e1
PH
4985 /* ===================================================================*/
4986 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
8ac170f3
PH
4987 are arranged to be the negation of the corresponding OP_values. For the
4988 back references, the values are ESC_REF plus the reference number. Only
4989 back references and those types that consume a character may be repeated.
4990 We can test for values between ESC_b and ESC_Z for the latter; this may
4991 have to change if any new ones are ever created. */
4992
6bf342e1
PH
4993 case '\\':
4994 tempptr = ptr;
4995 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4996 if (*errorcodeptr != 0) goto FAILED;
4997
8ac170f3
PH
4998 if (c < 0)
4999 {
5000 if (-c == ESC_Q) /* Handle start of quoted string */
5001 {
5002 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5003 else inescq = TRUE;
5004 continue;
5005 }
5006
6bf342e1
PH
5007 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5008
8ac170f3
PH
5009 /* For metasequences that actually match a character, we disable the
5010 setting of a first character if it hasn't already been set. */
5011
5012 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5013 firstbyte = REQ_NONE;
5014
5015 /* Set values to reset to if this is followed by a zero repeat. */
5016
5017 zerofirstbyte = firstbyte;
5018 zeroreqbyte = reqbyte;
5019
64f2600a
PH
5020 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5021 We also support \k{name} (.NET syntax) */
6bf342e1 5022
64f2600a 5023 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
6bf342e1
PH
5024 {
5025 is_recurse = FALSE;
64f2600a 5026 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
6bf342e1
PH
5027 goto NAMED_REF_OR_RECURSE;
5028 }
5029
5030 /* Back references are handled specially; must disable firstbyte if
5031 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5032 ':' later. */
8ac170f3
PH
5033
5034 if (-c >= ESC_REF)
5035 {
6bf342e1
PH
5036 recno = -c - ESC_REF;
5037
5038 HANDLE_REFERENCE: /* Come here from named backref handling */
5039 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
8ac170f3
PH
5040 previous = code;
5041 *code++ = OP_REF;
6bf342e1
PH
5042 PUT2INC(code, 0, recno);
5043 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5044 if (recno > cd->top_backref) cd->top_backref = recno;
8ac170f3
PH
5045 }
5046
6bf342e1 5047 /* So are Unicode property matches, if supported. */
8ac170f3
PH
5048
5049#ifdef SUPPORT_UCP
5050 else if (-c == ESC_P || -c == ESC_p)
5051 {
5052 BOOL negated;
aa41d2de
PH
5053 int pdata;
5054 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6bf342e1 5055 if (ptype < 0) goto FAILED;
8ac170f3
PH
5056 previous = code;
5057 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
aa41d2de
PH
5058 *code++ = ptype;
5059 *code++ = pdata;
8ac170f3 5060 }
6bf342e1
PH
5061#else
5062
5063 /* If Unicode properties are not supported, \X, \P, and \p are not
5064 allowed. */
5065
5066 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5067 {
5068 *errorcodeptr = ERR45;
5069 goto FAILED;
5070 }
8ac170f3
PH
5071#endif
5072
6bf342e1
PH
5073 /* For the rest (including \X when Unicode properties are supported), we
5074 can obtain the OP value by negating the escape value. */
8ac170f3
PH
5075
5076 else
5077 {
5078 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5079 *code++ = -c;
5080 }
5081 continue;
5082 }
5083
5084 /* We have a data character whose value is in c. In UTF-8 mode it may have
5085 a value > 127. We set its representation in the length/buffer, and then
5086 handle it as a data character. */
5087
5088#ifdef SUPPORT_UTF8
5089 if (utf8 && c > 127)
5090 mclength = _pcre_ord2utf8(c, mcbuffer);
5091 else
5092#endif
5093
5094 {
5095 mcbuffer[0] = c;
5096 mclength = 1;
5097 }
8ac170f3
PH
5098 goto ONE_CHAR;
5099
6bf342e1
PH
5100
5101 /* ===================================================================*/
8ac170f3
PH
5102 /* Handle a literal character. It is guaranteed not to be whitespace or #
5103 when the extended flag is set. If we are in UTF-8 mode, it may be a
5104 multi-byte literal character. */
5105
5106 default:
5107 NORMAL_CHAR:
5108 mclength = 1;
5109 mcbuffer[0] = c;
5110
5111#ifdef SUPPORT_UTF8
6bf342e1 5112 if (utf8 && c >= 0xc0)
8ac170f3
PH
5113 {
5114 while ((ptr[1] & 0xc0) == 0x80)
5115 mcbuffer[mclength++] = *(++ptr);
5116 }
5117#endif
5118
5119 /* At this point we have the character's bytes in mcbuffer, and the length
5120 in mclength. When not in UTF-8 mode, the length is always 1. */
5121
5122 ONE_CHAR:
5123 previous = code;
5124 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5125 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5126
47db1125
NM
5127 /* Remember if \r or \n were seen */
5128
5129 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5130 cd->external_flags |= PCRE_HASCRORLF;
5131
8ac170f3
PH
5132 /* Set the first and required bytes appropriately. If no previous first
5133 byte, set it from this character, but revert to none on a zero repeat.
5134 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5135 repeat. */
5136
5137 if (firstbyte == REQ_UNSET)
5138 {
5139 zerofirstbyte = REQ_NONE;
5140 zeroreqbyte = reqbyte;
5141
5142 /* If the character is more than one byte long, we can set firstbyte
5143 only if it is not to be matched caselessly. */
5144
5145 if (mclength == 1 || req_caseopt == 0)
5146 {
5147 firstbyte = mcbuffer[0] | req_caseopt;
5148 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5149 }
5150 else firstbyte = reqbyte = REQ_NONE;
5151 }
5152
5153 /* firstbyte was previously set; we can set reqbyte only if the length is
5154 1 or the matching is caseful. */
5155
5156 else
5157 {
5158 zerofirstbyte = firstbyte;
5159 zeroreqbyte = reqbyte;
5160 if (mclength == 1 || req_caseopt == 0)
5161 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5162 }
5163
5164 break; /* End of literal character handling */
5165 }
5166 } /* end of big loop */
5167
6bf342e1 5168
8ac170f3
PH
5169/* Control never reaches here by falling through, only by a goto for all the
5170error states. Pass back the position in the pattern so that it can be displayed
5171to the user for diagnosing the error. */
5172
5173FAILED:
5174*ptrptr = ptr;
5175return FALSE;
5176}
5177
5178
5179
5180
5181/*************************************************
5182* Compile sequence of alternatives *
5183*************************************************/
5184
6bf342e1
PH
5185/* On entry, ptr is pointing past the bracket character, but on return it
5186points to the closing bracket, or vertical bar, or end of string. The code
5187variable is pointing at the byte into which the BRA operator has been stored.
5188If the ims options are changed at the start (for a (?ims: group) or during any
5189branch, we need to insert an OP_OPT item at the start of every following branch
5190to ensure they get set correctly at run time, and also pass the new options
5191into every subsequent branch compile.
5192
5193This function is used during the pre-compile phase when we are trying to find
5194out the amount of memory needed, as well as during the real compile phase. The
5195value of lengthptr distinguishes the two phases.
8ac170f3 5196
64f2600a 5197Arguments:
8ac170f3
PH
5198 options option bits, including any changes for this subpattern
5199 oldims previous settings of ims option bits
8ac170f3
PH
5200 codeptr -> the address of the current code pointer
5201 ptrptr -> the address of the current pattern pointer
5202 errorcodeptr -> pointer to error code variable
5203 lookbehind TRUE if this is a lookbehind assertion
64f2600a 5204 reset_bracount TRUE to reset the count for each branch
6bf342e1 5205 skipbytes skip this many bytes at start (for brackets and OP_COND)
8ac170f3
PH
5206 firstbyteptr place to put the first required character, or a negative number
5207 reqbyteptr place to put the last required character, or a negative number
5208 bcptr pointer to the chain of currently open branches
5209 cd points to the data block with tables pointers etc.
6bf342e1
PH
5210 lengthptr NULL during the real compile phase
5211 points to length accumulator during pre-compile phase
8ac170f3 5212
6bf342e1 5213Returns: TRUE on success
8ac170f3
PH
5214*/
5215
5216static BOOL
6bf342e1 5217compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
64f2600a
PH
5218 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5219 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5220 int *lengthptr)
8ac170f3
PH
5221{
5222const uschar *ptr = *ptrptr;
5223uschar *code = *codeptr;
5224uschar *last_branch = code;
5225uschar *start_bracket = code;
5226uschar *reverse_count = NULL;
5227int firstbyte, reqbyte;
5228int branchfirstbyte, branchreqbyte;
6bf342e1 5229int length;
64f2600a
PH
5230int orig_bracount;
5231int max_bracount;
8ac170f3
PH
5232branch_chain bc;
5233
5234bc.outer = bcptr;
5235bc.current = code;
5236
5237firstbyte = reqbyte = REQ_UNSET;
5238
6bf342e1
PH
5239/* Accumulate the length for use in the pre-compile phase. Start with the
5240length of the BRA and KET and any extra bytes that are required at the
5241beginning. We accumulate in a local variable to save frequent testing of
5242lenthptr for NULL. We cannot do this by looking at the value of code at the
5243start and end of each alternative, because compiled items are discarded during
5244the pre-compile phase so that the work space is not exceeded. */
5245
5246length = 2 + 2*LINK_SIZE + skipbytes;
5247
5248/* WARNING: If the above line is changed for any reason, you must also change
5249the code that abstracts option settings at the start of the pattern and makes
5250them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5251pre-compile phase to find out whether anything has yet been compiled or not. */
5252
8ac170f3
PH
5253/* Offset is set zero to mark that this bracket is still open */
5254
5255PUT(code, 1, 0);
5256code += 1 + LINK_SIZE + skipbytes;
5257
5258/* Loop for each alternative branch */
5259
64f2600a 5260orig_bracount = max_bracount = cd->bracount;
8ac170f3
PH
5261for (;;)
5262 {
64f2600a
PH
5263 /* For a (?| group, reset the capturing bracket count so that each branch
5264 uses the same numbers. */
5265
5266 if (reset_bracount) cd->bracount = orig_bracount;
5267
8ac170f3
PH
5268 /* Handle a change of ims options at the start of the branch */
5269
5270 if ((options & PCRE_IMS) != oldims)
5271 {
5272 *code++ = OP_OPT;
5273 *code++ = options & PCRE_IMS;
6bf342e1 5274 length += 2;
8ac170f3
PH
5275 }
5276
5277 /* Set up dummy OP_REVERSE if lookbehind assertion */
5278
5279 if (lookbehind)
5280 {
5281 *code++ = OP_REVERSE;
5282 reverse_count = code;
5283 PUTINC(code, 0, 0);
6bf342e1 5284 length += 1 + LINK_SIZE;
8ac170f3
PH
5285 }
5286
6bf342e1
PH
5287 /* Now compile the branch; in the pre-compile phase its length gets added
5288 into the length. */
8ac170f3 5289
6bf342e1
PH
5290 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5291 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
8ac170f3
PH
5292 {
5293 *ptrptr = ptr;
5294 return FALSE;
5295 }
5296
64f2600a
PH
5297 /* Keep the highest bracket count in case (?| was used and some branch
5298 has fewer than the rest. */
5299
5300 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5301
6bf342e1 5302 /* In the real compile phase, there is some post-processing to be done. */
8ac170f3 5303
6bf342e1 5304 if (lengthptr == NULL)
8ac170f3 5305 {
6bf342e1
PH
5306 /* If this is the first branch, the firstbyte and reqbyte values for the
5307 branch become the values for the regex. */
8ac170f3 5308
6bf342e1
PH
5309 if (*last_branch != OP_ALT)
5310 {
5311 firstbyte = branchfirstbyte;
5312 reqbyte = branchreqbyte;
5313 }
8ac170f3 5314
6bf342e1
PH
5315 /* If this is not the first branch, the first char and reqbyte have to
5316 match the values from all the previous branches, except that if the
5317 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5318 and we set REQ_VARY for the regex. */
8ac170f3 5319
6bf342e1 5320 else
8ac170f3 5321 {
6bf342e1
PH
5322 /* If we previously had a firstbyte, but it doesn't match the new branch,
5323 we have to abandon the firstbyte for the regex, but if there was
5324 previously no reqbyte, it takes on the value of the old firstbyte. */
5325
5326 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5327 {
5328 if (reqbyte < 0) reqbyte = firstbyte;
5329 firstbyte = REQ_NONE;
5330 }
8ac170f3 5331
6bf342e1
PH
5332 /* If we (now or from before) have no firstbyte, a firstbyte from the
5333 branch becomes a reqbyte if there isn't a branch reqbyte. */
8ac170f3 5334
6bf342e1
PH
5335 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5336 branchreqbyte = branchfirstbyte;
8ac170f3 5337
6bf342e1 5338 /* Now ensure that the reqbytes match */
8ac170f3 5339
6bf342e1
PH
5340 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5341 reqbyte = REQ_NONE;
5342 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5343 }
8ac170f3 5344
6bf342e1
PH
5345 /* If lookbehind, check that this branch matches a fixed-length string, and
5346 put the length into the OP_REVERSE item. Temporarily mark the end of the
5347 branch with OP_END. */
8ac170f3 5348
6bf342e1 5349 if (lookbehind)
8ac170f3 5350 {
6bf342e1
PH
5351 int fixed_length;
5352 *code = OP_END;
5353 fixed_length = find_fixedlength(last_branch, options);
5354 DPRINTF(("fixed length = %d\n", fixed_length));
5355 if (fixed_length < 0)
5356 {
5357 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5358 *ptrptr = ptr;
5359 return FALSE;
5360 }
5361 PUT(reverse_count, 0, fixed_length);
8ac170f3 5362 }
8ac170f3
PH
5363 }
5364
64f2600a
PH
5365 /* Reached end of expression, either ')' or end of pattern. In the real
5366 compile phase, go back through the alternative branches and reverse the chain
5367 of offsets, with the field in the BRA item now becoming an offset to the
5368 first alternative. If there are no alternatives, it points to the end of the
5369 group. The length in the terminating ket is always the length of the whole
5370 bracketed item. If any of the ims options were changed inside the group,
5371 compile a resetting op-code following, except at the very end of the pattern.
5372 Return leaving the pointer at the terminating char. */
8ac170f3
PH
5373
5374 if (*ptr != '|')
5375 {
64f2600a 5376 if (lengthptr == NULL)
8ac170f3 5377 {
64f2600a
PH
5378 int branch_length = code - last_branch;
5379 do
5380 {
5381 int prev_length = GET(last_branch, 1);
5382 PUT(last_branch, 1, branch_length);
5383 branch_length = prev_length;
5384 last_branch -= branch_length;
5385 }
5386 while (branch_length > 0);
8ac170f3 5387 }
8ac170f3
PH
5388
5389 /* Fill in the ket */
5390
5391 *code = OP_KET;
5392 PUT(code, 1, code - start_bracket);
5393 code += 1 + LINK_SIZE;
5394
5395 /* Resetting option if needed */
5396
5397 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5398 {
5399 *code++ = OP_OPT;
5400 *code++ = oldims;
6bf342e1 5401 length += 2;
8ac170f3
PH
5402 }
5403
64f2600a
PH
5404 /* Retain the highest bracket number, in case resetting was used. */
5405
5406 cd->bracount = max_bracount;
5407
8ac170f3
PH
5408 /* Set values to pass back */
5409
5410 *codeptr = code;
5411 *ptrptr = ptr;
5412 *firstbyteptr = firstbyte;
5413 *reqbyteptr = reqbyte;
47db1125
NM
5414 if (lengthptr != NULL)
5415 {
5416 if (OFLOW_MAX - *lengthptr < length)
5417 {
5418 *errorcodeptr = ERR20;
5419 return FALSE;
5420 }
5421 *lengthptr += length;
5422 }
8ac170f3
PH
5423 return TRUE;
5424 }
5425
64f2600a
PH
5426 /* Another branch follows. In the pre-compile phase, we can move the code
5427 pointer back to where it was for the start of the first branch. (That is,
5428 pretend that each branch is the only one.)
5429
5430 In the real compile phase, insert an ALT node. Its length field points back
8ac170f3
PH
5431 to the previous branch while the bracket remains open. At the end the chain
5432 is reversed. It's done like this so that the start of the bracket has a
5433 zero offset until it is closed, making it possible to detect recursion. */
5434
64f2600a
PH
5435 if (lengthptr != NULL)
5436 {
5437 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5438 length += 1 + LINK_SIZE;
5439 }
5440 else
5441 {
5442 *code = OP_ALT;
5443 PUT(code, 1, code - last_branch);
5444 bc.current = last_branch = code;
5445 code += 1 + LINK_SIZE;
5446 }
5447
8ac170f3
PH
5448 ptr++;
5449 }
5450/* Control never reaches here */
5451}
5452
5453
5454
5455
5456/*************************************************
5457* Check for anchored expression *
5458*************************************************/
5459
5460/* Try to find out if this is an anchored regular expression. Consider each
5461alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5462all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5463it's anchored. However, if this is a multiline pattern, then only OP_SOD
5464counts, since OP_CIRC can match in the middle.
5465
5466We can also consider a regex to be anchored if OP_SOM starts all its branches.
5467This is the code for \G, which means "match at start of match position, taking
5468into account the match offset".
5469
5470A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5471because that will try the rest of the pattern at all possible matching points,
5472so there is no point trying again.... er ....
5473
5474.... except when the .* appears inside capturing parentheses, and there is a
5475subsequent back reference to those parentheses. We haven't enough information
5476to catch that case precisely.
5477
5478At first, the best we could do was to detect when .* was in capturing brackets
5479and the highest back reference was greater than or equal to that level.
5480However, by keeping a bitmap of the first 31 back references, we can catch some
5481of the more common cases more precisely.
5482
5483Arguments:
5484 code points to start of expression (the bracket)
5485 options points to the options setting
5486 bracket_map a bitmap of which brackets we are inside while testing; this
5487 handles up to substring 31; after that we just have to take
5488 the less precise approach
5489 backref_map the back reference bitmap
5490
5491Returns: TRUE or FALSE
5492*/
5493
5494static BOOL
5495is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5496 unsigned int backref_map)
5497{
5498do {
6bf342e1
PH
5499 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5500 options, PCRE_MULTILINE, FALSE);
8ac170f3
PH
5501 register int op = *scode;
5502
6bf342e1
PH
5503 /* Non-capturing brackets */
5504
5505 if (op == OP_BRA)
5506 {
5507 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5508 }
5509
8ac170f3
PH
5510 /* Capturing brackets */
5511
6bf342e1 5512 else if (op == OP_CBRA)
8ac170f3 5513 {
6bf342e1
PH
5514 int n = GET2(scode, 1+LINK_SIZE);
5515 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8ac170f3
PH
5516 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5517 }
5518
5519 /* Other brackets */
5520
6bf342e1 5521 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
8ac170f3
PH
5522 {
5523 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5524 }
5525
5526 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5527 are or may be referenced. */
5528
6bf342e1
PH
5529 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5530 op == OP_TYPEPOSSTAR) &&
8ac170f3
PH
5531 (*options & PCRE_DOTALL) != 0)
5532 {
5533 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5534 }
5535
5536 /* Check for explicit anchoring */
5537
5538 else if (op != OP_SOD && op != OP_SOM &&
5539 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5540 return FALSE;
5541 code += GET(code, 1);
5542 }
5543while (*code == OP_ALT); /* Loop for each alternative */
5544return TRUE;
5545}
5546
5547
5548
5549/*************************************************
5550* Check for starting with ^ or .* *
5551*************************************************/
5552
5553/* This is called to find out if every branch starts with ^ or .* so that
5554"first char" processing can be done to speed things up in multiline
5555matching and for non-DOTALL patterns that start with .* (which must start at
5556the beginning or after \n). As in the case of is_anchored() (see above), we
5557have to take account of back references to capturing brackets that contain .*
5558because in that case we can't make the assumption.
5559
5560Arguments:
5561 code points to start of expression (the bracket)
5562 bracket_map a bitmap of which brackets we are inside while testing; this
5563 handles up to substring 31; after that we just have to take
5564 the less precise approach
5565 backref_map the back reference bitmap
5566
5567Returns: TRUE or FALSE
5568*/
5569
5570static BOOL
5571is_startline(const uschar *code, unsigned int bracket_map,
5572 unsigned int backref_map)
5573{
5574do {
6bf342e1
PH
5575 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5576 NULL, 0, FALSE);
8ac170f3
PH
5577 register int op = *scode;
5578
6bf342e1
PH
5579 /* Non-capturing brackets */
5580
5581 if (op == OP_BRA)
5582 {
5583 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5584 }
5585
8ac170f3
PH
5586 /* Capturing brackets */
5587
6bf342e1 5588 else if (op == OP_CBRA)
8ac170f3 5589 {
6bf342e1
PH
5590 int n = GET2(scode, 1+LINK_SIZE);
5591 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8ac170f3
PH
5592 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5593 }
5594
5595 /* Other brackets */
5596
6bf342e1 5597 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
8ac170f3
PH
5598 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5599
5600 /* .* means "start at start or after \n" if it isn't in brackets that
5601 may be referenced. */
5602
6bf342e1 5603 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8ac170f3
PH
5604 {
5605 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5606 }
5607
5608 /* Check for explicit circumflex */
5609
5610 else if (op != OP_CIRC) return FALSE;
5611
5612 /* Move on to the next alternative */
5613
5614 code += GET(code, 1);
5615 }
5616while (*code == OP_ALT); /* Loop for each alternative */
5617return TRUE;
5618}
5619
5620
5621
5622/*************************************************
5623* Check for asserted fixed first char *
5624*************************************************/
5625
5626/* During compilation, the "first char" settings from forward assertions are
5627discarded, because they can cause conflicts with actual literals that follow.
5628However, if we end up without a first char setting for an unanchored pattern,
5629it is worth scanning the regex to see if there is an initial asserted first
5630char. If all branches start with the same asserted char, or with a bracket all
5631of whose alternatives start with the same asserted char (recurse ad lib), then
5632we return that char, otherwise -1.
5633
5634Arguments:
5635 code points to start of expression (the bracket)
5636 options pointer to the options (used to check casing changes)
5637 inassert TRUE if in an assertion
5638
5639Returns: -1 or the fixed first char
5640*/
5641
5642static int
5643find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5644{
5645register int c = -1;
5646do {
5647 int d;
5648 const uschar *scode =
5649 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5650 register int op = *scode;
5651
8ac170f3
PH
5652 switch(op)
5653 {
5654 default:
5655 return -1;
5656
5657 case OP_BRA:
6bf342e1 5658 case OP_CBRA:
8ac170f3
PH
5659 case OP_ASSERT:
5660 case OP_ONCE:
5661 case OP_COND:
5662 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5663 return -1;
5664 if (c < 0) c = d; else if (c != d) return -1;
5665 break;
5666
5667 case OP_EXACT: /* Fall through */
5668 scode += 2;
5669
5670 case OP_CHAR:
5671 case OP_CHARNC:
5672 case OP_PLUS:
5673 case OP_MINPLUS:
6bf342e1 5674 case OP_POSPLUS:
8ac170f3
PH
5675 if (!inassert) return -1;
5676 if (c < 0)
5677 {
5678 c = scode[1];
5679 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5680 }
5681 else if (c != scode[1]) return -1;
5682 break;
5683 }
5684
5685 code += GET(code, 1);
5686 }
5687while (*code == OP_ALT);
5688return c;
5689}
5690
5691
5692
5693/*************************************************
5694* Compile a Regular Expression *
5695*************************************************/
5696
5697/* This function takes a string and returns a pointer to a block of store
5698holding a compiled version of the expression. The original API for this
5699function had no error code return variable; it is retained for backwards
5700compatibility. The new function is given a new name.
5701
5702Arguments:
5703 pattern the regular expression
5704 options various option bits
5705 errorcodeptr pointer to error code variable (pcre_compile2() only)
5706 can be NULL if you don't want a code value
5707 errorptr pointer to pointer to error text
5708 erroroffset ptr offset in pattern where error was detected
5709 tables pointer to character tables or NULL
5710
5711Returns: pointer to compiled data block, or NULL on error,
5712 with errorptr and erroroffset set
5713*/
5714
64f2600a 5715PCRE_EXP_DEFN pcre *
8ac170f3
PH
5716pcre_compile(const char *pattern, int options, const char **errorptr,
5717 int *erroroffset, const unsigned char *tables)
5718{
5719return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5720}
5721
5722
64f2600a 5723PCRE_EXP_DEFN pcre *
8ac170f3
PH
5724pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5725 const char **errorptr, int *erroroffset, const unsigned char *tables)
5726{
5727real_pcre *re;
6bf342e1
PH
5728int length = 1; /* For final END opcode */
5729int firstbyte, reqbyte, newline;
8ac170f3 5730int errorcode = 0;
47db1125 5731int skipatstart = 0;
8ac170f3
PH
5732#ifdef SUPPORT_UTF8
5733BOOL utf8;
8ac170f3 5734#endif
8ac170f3
PH
5735size_t size;
5736uschar *code;
5737const uschar *codestart;
5738const uschar *ptr;
5739compile_data compile_block;
aa41d2de 5740compile_data *cd = &compile_block;
6bf342e1
PH
5741
5742/* This space is used for "compiling" into during the first phase, when we are
5743computing the amount of memory that is needed. Compiled items are thrown away
5744as soon as possible, so that a fairly large buffer should be sufficient for
5745this purpose. The same space is used in the second phase for remembering where
5746to fill in forward references to subpatterns. */
5747
5748uschar cworkspace[COMPILE_WORK_SIZE];
5749
5750
5751/* Set this early so that early errors get offset 0. */
5752
5753ptr = (const uschar *)pattern;
8ac170f3
PH
5754
5755/* We can't pass back an error message if errorptr is NULL; I guess the best we
5756can do is just return NULL, but we can set a code value if there is a code
5757pointer. */
5758
5759if (errorptr == NULL)
5760 {
5761 if (errorcodeptr != NULL) *errorcodeptr = 99;
5762 return NULL;
5763 }
5764
5765*errorptr = NULL;
5766if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5767
5768/* However, we can give a message for this error */
5769
5770if (erroroffset == NULL)
5771 {
5772 errorcode = ERR16;
64f2600a 5773 goto PCRE_EARLY_ERROR_RETURN2;
8ac170f3
PH
5774 }
5775
5776*erroroffset = 0;
5777
5778/* Can't support UTF8 unless PCRE has been compiled to include the code. */
5779
5780#ifdef SUPPORT_UTF8
5781utf8 = (options & PCRE_UTF8) != 0;
5782if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5783 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5784 {
5785 errorcode = ERR44;
64f2600a 5786 goto PCRE_EARLY_ERROR_RETURN2;
8ac170f3
PH
5787 }
5788#else
5789if ((options & PCRE_UTF8) != 0)
5790 {
5791 errorcode = ERR32;
5792 goto PCRE_EARLY_ERROR_RETURN;
5793 }
5794#endif
5795
5796if ((options & ~PUBLIC_OPTIONS) != 0)
5797 {
5798 errorcode = ERR17;
5799 goto PCRE_EARLY_ERROR_RETURN;
5800 }
5801
5802/* Set up pointers to the individual character tables */
5803
5804if (tables == NULL) tables = _pcre_default_tables;
aa41d2de
PH
5805cd->lcc = tables + lcc_offset;
5806cd->fcc = tables + fcc_offset;
5807cd->cbits = tables + cbits_offset;
5808cd->ctypes = tables + ctypes_offset;
5809
47db1125
NM
5810/* Check for global one-time settings at the start of the pattern, and remember
5811the offset for later. */
5812
5813while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5814 {
5815 int newnl = 0;
5816 int newbsr = 0;
5817
5818 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5819 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5820 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5821 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5822 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5823 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5824 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5825 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5826 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5827 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5828
5829 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5830 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5831 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5832 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5833
5834 if (newnl != 0)
5835 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5836 else if (newbsr != 0)
5837 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5838 else break;
5839 }
5840
5841/* Check validity of \R options. */
5842
5843switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5844 {
5845 case 0:
5846 case PCRE_BSR_ANYCRLF:
5847 case PCRE_BSR_UNICODE:
5848 break;
5849 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5850 }
5851
6bf342e1 5852/* Handle different types of newline. The three bits give seven cases. The
64f2600a
PH
5853current code allows for fixed one- or two-byte sequences, plus "any" and
5854"anycrlf". */
aa41d2de 5855
47db1125 5856switch (options & PCRE_NEWLINE_BITS)
aa41d2de 5857 {
47db1125 5858 case 0: newline = NEWLINE; break; /* Build-time default */
aa41d2de
PH
5859 case PCRE_NEWLINE_CR: newline = '\r'; break;
5860 case PCRE_NEWLINE_LF: newline = '\n'; break;
5861 case PCRE_NEWLINE_CR+
5862 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6bf342e1 5863 case PCRE_NEWLINE_ANY: newline = -1; break;
64f2600a 5864 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6bf342e1 5865 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
aa41d2de
PH
5866 }
5867
64f2600a
PH
5868if (newline == -2)
5869 {
5870 cd->nltype = NLTYPE_ANYCRLF;
5871 }
5872else if (newline < 0)
aa41d2de 5873 {
6bf342e1 5874 cd->nltype = NLTYPE_ANY;
aa41d2de
PH
5875 }
5876else
5877 {
6bf342e1
PH
5878 cd->nltype = NLTYPE_FIXED;
5879 if (newline > 255)
5880 {
5881 cd->nllen = 2;
5882 cd->nl[0] = (newline >> 8) & 255;
5883 cd->nl[1] = newline & 255;
5884 }
5885 else
5886 {
5887 cd->nllen = 1;
5888 cd->nl[0] = newline;
5889 }
aa41d2de 5890 }
8ac170f3 5891
6bf342e1
PH
5892/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5893references to help in deciding whether (.*) can be treated as anchored or not.
5894*/
8ac170f3 5895
aa41d2de
PH
5896cd->top_backref = 0;
5897cd->backref_map = 0;
8ac170f3
PH
5898
5899/* Reflect pattern for debugging output */
5900
5901DPRINTF(("------------------------------------------------------------------\n"));
5902DPRINTF(("%s\n", pattern));
5903
6bf342e1
PH
5904/* Pretend to compile the pattern while actually just accumulating the length
5905of memory required. This behaviour is triggered by passing a non-NULL final
5906argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5907to compile parts of the pattern into; the compiled code is discarded when it is
5908no longer needed, so hopefully this workspace will never overflow, though there
5909is a test for its doing so. */
8ac170f3 5910
6bf342e1
PH
5911cd->bracount = 0;
5912cd->names_found = 0;
5913cd->name_entry_size = 0;
5914cd->name_table = NULL;
5915cd->start_workspace = cworkspace;
5916cd->start_code = cworkspace;
5917cd->hwm = cworkspace;
5918cd->start_pattern = (const uschar *)pattern;
5919cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5920cd->req_varyopt = 0;
6bf342e1 5921cd->external_options = options;
47db1125 5922cd->external_flags = 0;
8ac170f3 5923
6bf342e1
PH
5924/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5925don't need to look at the result of the function here. The initial options have
5926been put into the cd block so that they can be changed if an option setting is
5927found within the regex right at the beginning. Bringing initial option settings
5928outside can help speed up starting point checks. */
8ac170f3 5929
47db1125 5930ptr += skipatstart;
6bf342e1
PH
5931code = cworkspace;
5932*code = OP_BRA;
5933(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
64f2600a
PH
5934 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5935 &length);
6bf342e1 5936if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
8ac170f3 5937
6bf342e1
PH
5938DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5939 cd->hwm - cworkspace));
8ac170f3 5940
6bf342e1
PH
5941if (length > MAX_PATTERN_SIZE)
5942 {
5943 errorcode = ERR20;
5944 goto PCRE_EARLY_ERROR_RETURN;
5945 }
8ac170f3 5946
6bf342e1
PH
5947/* Compute the size of the data block needed and get it, either from malloc or
5948an externally provided function. Integer overflow should no longer be possible
5949because nowadays we limit the maximum value of cd->names_found and
5950cd->name_entry_size. */
8ac170f3 5951
6bf342e1
PH
5952size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5953re = (real_pcre *)(pcre_malloc)(size);
8ac170f3 5954
6bf342e1
PH
5955if (re == NULL)
5956 {
5957 errorcode = ERR21;
5958 goto PCRE_EARLY_ERROR_RETURN;
5959 }
8ac170f3 5960
47db1125
NM
5961/* Put in the magic number, and save the sizes, initial options, internal
5962flags, and character table pointer. NULL is used for the default character
5963tables. The nullpad field is at the end; it's there to help in the case when a
5964regex compiled on a system with 4-byte pointers is run on another with 8-byte
5965pointers. */
8ac170f3 5966
6bf342e1
PH
5967re->magic_number = MAGIC_NUMBER;
5968re->size = size;
5969re->options = cd->external_options;
47db1125 5970re->flags = cd->external_flags;
6bf342e1
PH
5971re->dummy1 = 0;
5972re->first_byte = 0;
5973re->req_byte = 0;
5974re->name_table_offset = sizeof(real_pcre);
5975re->name_entry_size = cd->name_entry_size;
5976re->name_count = cd->names_found;
5977re->ref_count = 0;
5978re->tables = (tables == _pcre_default_tables)? NULL : tables;
5979re->nullpad = NULL;
8ac170f3 5980
6bf342e1
PH
5981/* The starting points of the name/number translation table and of the code are
5982passed around in the compile data block. The start/end pattern and initial
5983options are already set from the pre-compile phase, as is the name_entry_size
5984field. Reset the bracket count and the names_found field. Also reset the hwm
5985field; this time it's used for remembering forward references to subpatterns.
5986*/
8ac170f3 5987
6bf342e1
PH
5988cd->bracount = 0;
5989cd->names_found = 0;
5990cd->name_table = (uschar *)re + re->name_table_offset;
5991codestart = cd->name_table + re->name_entry_size * re->name_count;
5992cd->start_code = codestart;
5993cd->hwm = cworkspace;
5994cd->req_varyopt = 0;
47db1125 5995cd->had_accept = FALSE;
8ac170f3 5996
6bf342e1
PH
5997/* Set up a starting, non-extracting bracket, then compile the expression. On
5998error, errorcode will be set non-zero, so we don't need to look at the result
5999of the function here. */
8ac170f3 6000
47db1125 6001ptr = (const uschar *)pattern + skipatstart;
6bf342e1
PH
6002code = (uschar *)codestart;
6003*code = OP_BRA;
6004(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
64f2600a 6005 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6bf342e1
PH
6006re->top_bracket = cd->bracount;
6007re->top_backref = cd->top_backref;
47db1125 6008re->flags = cd->external_flags;
8ac170f3 6009
47db1125 6010if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
8ac170f3 6011
6bf342e1 6012/* If not reached end of pattern on success, there's an excess bracket. */
8ac170f3 6013
6bf342e1 6014if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
8ac170f3 6015
6bf342e1
PH
6016/* Fill in the terminating state and check for disastrous overflow, but
6017if debugging, leave the test till after things are printed out. */
8ac170f3 6018
6bf342e1 6019*code++ = OP_END;
8ac170f3 6020
6bf342e1
PH
6021#ifndef DEBUG
6022if (code - codestart > length) errorcode = ERR23;
8ac170f3
PH
6023#endif
6024
6bf342e1 6025/* Fill in any forward references that are required. */
8ac170f3 6026
6bf342e1 6027while (errorcode == 0 && cd->hwm > cworkspace)
8ac170f3 6028 {
6bf342e1
PH
6029 int offset, recno;
6030 const uschar *groupptr;
6031 cd->hwm -= LINK_SIZE;
6032 offset = GET(cd->hwm, 0);
6033 recno = GET(codestart, offset);
6034 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6035 if (groupptr == NULL) errorcode = ERR53;
6036 else PUT(((uschar *)codestart), offset, groupptr - codestart);
8ac170f3
PH
6037 }
6038
8ac170f3
PH
6039/* Give an error if there's a back reference to a non-existent capturing
6040subpattern. */
6041
6bf342e1 6042if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
8ac170f3
PH
6043
6044/* Failed to compile, or error while post-processing */
6045
6046if (errorcode != 0)
6047 {
6048 (pcre_free)(re);
8ac170f3 6049 PCRE_EARLY_ERROR_RETURN:
6bf342e1 6050 *erroroffset = ptr - (const uschar *)pattern;
64f2600a 6051 PCRE_EARLY_ERROR_RETURN2:
47db1125 6052 *errorptr = find_error_text(errorcode);
8ac170f3
PH
6053 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6054 return NULL;
6055 }
6056
6057/* If the anchored option was not passed, set the flag if we can determine that
6058the pattern is anchored by virtue of ^ characters or \A or anything else (such
6059as starting with .* when DOTALL is set).
6060
6bf342e1 6061Otherwise, if we know what the first byte has to be, save it, because that
8ac170f3
PH
6062speeds up unanchored matches no end. If not, see if we can set the
6063PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6064start with ^, and also when all branches start with .* for non-DOTALL matches.
6065*/
6066
6bf342e1 6067if ((re->options & PCRE_ANCHORED) == 0)
8ac170f3 6068 {
6bf342e1 6069 int temp_options = re->options; /* May get changed during these scans */
aa41d2de 6070 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
8ac170f3
PH
6071 re->options |= PCRE_ANCHORED;
6072 else
6073 {
6074 if (firstbyte < 0)
6075 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6076 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6077 {
6078 int ch = firstbyte & 255;
6079 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
aa41d2de 6080 cd->fcc[ch] == ch)? ch : firstbyte;
47db1125 6081 re->flags |= PCRE_FIRSTSET;
8ac170f3 6082 }
aa41d2de 6083 else if (is_startline(codestart, 0, cd->backref_map))
47db1125 6084 re->flags |= PCRE_STARTLINE;
8ac170f3
PH
6085 }
6086 }
6087
6088/* For an anchored pattern, we use the "required byte" only if it follows a
6089variable length item in the regex. Remove the caseless flag for non-caseable
6090bytes. */
6091
6092if (reqbyte >= 0 &&
6093 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6094 {
6095 int ch = reqbyte & 255;
6096 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
aa41d2de 6097 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
47db1125 6098 re->flags |= PCRE_REQCHSET;
8ac170f3
PH
6099 }
6100
aa41d2de
PH
6101/* Print out the compiled data if debugging is enabled. This is never the
6102case when building a production library. */
8ac170f3
PH
6103
6104#ifdef DEBUG
6105
6106printf("Length = %d top_bracket = %d top_backref = %d\n",
6107 length, re->top_bracket, re->top_backref);
6108
47db1125 6109printf("Options=%08x\n", re->options);
8ac170f3 6110
47db1125 6111if ((re->flags & PCRE_FIRSTSET) != 0)
8ac170f3
PH
6112 {
6113 int ch = re->first_byte & 255;
aa41d2de
PH
6114 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6115 "" : " (caseless)";
8ac170f3
PH
6116 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6117 else printf("First char = \\x%02x%s\n", ch, caseless);
6118 }
6119
47db1125 6120if ((re->flags & PCRE_REQCHSET) != 0)
8ac170f3
PH
6121 {
6122 int ch = re->req_byte & 255;
aa41d2de
PH
6123 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6124 "" : " (caseless)";
8ac170f3
PH
6125 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6126 else printf("Req char = \\x%02x%s\n", ch, caseless);
6127 }
6128
64f2600a 6129pcre_printint(re, stdout, TRUE);
8ac170f3
PH
6130
6131/* This check is done here in the debugging case so that the code that
6132was compiled can be seen. */
6133
6134if (code - codestart > length)
6135 {
6136 (pcre_free)(re);
47db1125 6137 *errorptr = find_error_text(ERR23);
8ac170f3
PH
6138 *erroroffset = ptr - (uschar *)pattern;
6139 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6140 return NULL;
6141 }
6bf342e1 6142#endif /* DEBUG */
8ac170f3
PH
6143
6144return (pcre *)re;
6145}
6146
6147/* End of pcre_compile.c */