dbe08c23d22f6a394131bf2c8313b28204ecbcd3
[exim.git] / src / src / pcre / pcre.c
1 /* $Cambridge: exim/src/src/pcre/pcre.c,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7 /*
8 This is a library of functions to support regular expressions whose syntax
9 and semantics are as close as possible to those of the Perl 5 language. See
10 the file Tech.Notes for some information on the internals.
11
12 Written by: Philip Hazel <ph10@cam.ac.uk>
13
14 Copyright (c) 1997-2004 University of Cambridge
15
16 -----------------------------------------------------------------------------
17 Redistribution and use in source and binary forms, with or without
18 modification, are permitted provided that the following conditions are met:
19
20 * Redistributions of source code must retain the above copyright notice,
21 this list of conditions and the following disclaimer.
22
23 * Redistributions in binary form must reproduce the above copyright
24 notice, this list of conditions and the following disclaimer in the
25 documentation and/or other materials provided with the distribution.
26
27 * Neither the name of the University of Cambridge nor the names of its
28 contributors may be used to endorse or promote products derived from
29 this software without specific prior written permission.
30
31 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
32 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
35 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
36 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
37 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
38 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
39 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
41 POSSIBILITY OF SUCH DAMAGE.
42 -----------------------------------------------------------------------------
43 */
44
45
46 /* Define DEBUG to get debugging output on stdout. */
47 /* #define DEBUG */
48
49 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
50 inline, and there are *still* stupid compilers about that don't like indented
51 pre-processor statements. I suppose it's only been 10 years... */
52
53 #ifdef DEBUG
54 #define DPRINTF(p) printf p
55 #else
56 #define DPRINTF(p) /*nothing*/
57 #endif
58
59 /* Include the internals header, which itself includes "config.h", the Standard
60 C headers, and the external pcre header. */
61
62 #include "internal.h"
63
64 /* If Unicode Property support is wanted, include a private copy of the
65 function that does it, and the table that translates names to numbers. */
66
67 #ifdef SUPPORT_UCP
68 #include "ucp.c"
69 #include "ucptypetable.c"
70 #endif
71
72 /* Maximum number of items on the nested bracket stacks at compile time. This
73 applies to the nesting of all kinds of parentheses. It does not limit
74 un-nested, non-capturing parentheses. This number can be made bigger if
75 necessary - it is used to dimension one int and one unsigned char vector at
76 compile time. */
77
78 #define BRASTACK_SIZE 200
79
80
81 /* Maximum number of ints of offset to save on the stack for recursive calls.
82 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
83 because the offset vector is always a multiple of 3 long. */
84
85 #define REC_STACK_SAVE_MAX 30
86
87
88 /* The maximum remaining length of subject we are prepared to search for a
89 req_byte match. */
90
91 #define REQ_BYTE_MAX 1000
92
93
94 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
95 the definition is next to the definition of the opcodes in internal.h. */
96
97 static const uschar OP_lengths[] = { OP_LENGTHS };
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
105 are simple data values; negative values are for special things like \d and so
106 on. Zero means further processing is needed (for things like \x), or the escape
107 is invalid. */
108
109 #if !EBCDIC /* This is the "normal" table for ASCII systems */
110 static const short int escapes[] = {
111 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
112 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
113 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
114 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
115 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
116 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
117 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
118 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
119 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
120 0, 0, -ESC_z /* x - z */
121 };
122
123 #else /* This is the "abnormal" table for EBCDIC systems */
124 static const short int escapes[] = {
125 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
126 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
127 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
128 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
129 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
130 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
131 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
132 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
133 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
134 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
135 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
136 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
137 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
138 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
139 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
140 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
141 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
142 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
143 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
144 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
145 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
146 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
147 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
148 };
149 #endif
150
151
/* Tables of names of POSIX character classes and their lengths. The list is
terminated by a zero length entry. The first three must be alpha, lower, upper,
as this is assumed for handling case independence. */
155
156 static const char *const posix_names[] = {
157 "alpha", "lower", "upper",
158 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
159 "print", "punct", "space", "word", "xdigit" };
160
161 static const uschar posix_name_lengths[] = {
162 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
163
164 /* Table of class bit maps for each POSIX class; up to three may be combined
165 to form the class. The table for [:blank:] is dynamically modified to remove
166 the vertical space characters. */
167
168 static const int posix_class_maps[] = {
169 cbit_lower, cbit_upper, -1, /* alpha */
170 cbit_lower, -1, -1, /* lower */
171 cbit_upper, -1, -1, /* upper */
172 cbit_digit, cbit_lower, cbit_upper, /* alnum */
173 cbit_print, cbit_cntrl, -1, /* ascii */
174 cbit_space, -1, -1, /* blank - a GNU extension */
175 cbit_cntrl, -1, -1, /* cntrl */
176 cbit_digit, -1, -1, /* digit */
177 cbit_graph, -1, -1, /* graph */
178 cbit_print, -1, -1, /* print */
179 cbit_punct, -1, -1, /* punct */
180 cbit_space, -1, -1, /* space */
181 cbit_word, -1, -1, /* word - a Perl extension */
182 cbit_xdigit,-1, -1 /* xdigit */
183 };
184
185 /* Table to identify digits and hex digits. This is used when compiling
186 patterns. Note that the tables in chartables are dependent on the locale, and
187 may mark arbitrary characters as digits - but the PCRE compiling code expects
188 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
189 a private table here. It costs 256 bytes, but it is a lot faster than doing
190 character value tests (at least in some simple cases I timed), and in some
191 applications one wants PCRE to compile efficiently as well as match
192 efficiently.
193
194 For convenience, we use the same bit definitions as in chartables:
195
196 0x04 decimal digit
197 0x08 hexadecimal digit
198
199 Then we can use ctype_digit and ctype_xdigit in the code. */
200
201 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
202 static const unsigned char digitab[] =
203 {
204 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
205 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
206 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
207 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
208 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
209 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
210 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
211 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
212 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
213 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
214 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
215 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
216 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
217 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
218 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
219 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
220 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
221 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
222 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
223 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
224 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
225 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
234 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
235 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
236
237 #else /* This is the "abnormal" case, for EBCDIC systems */
238 static const unsigned char digitab[] =
239 {
240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
256 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
261 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
262 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
264 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
270 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
271 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
272
273 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
274 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
275 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
276 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
278 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
282 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
283 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
285 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
287 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
290 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
291 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
292 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
293 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
294 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
295 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
296 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
297 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
298 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
299 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
300 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
301 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
302 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
303 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
304 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
305 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
306 #endif
307
308
309 /* Definition to allow mutual recursion */
310
311 static BOOL
312 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
313 BOOL, int, int *, int *, branch_chain *, compile_data *);
314
315 /* Structure for building a chain of data that actually lives on the
316 stack, for holding the values of the subject pointer at the start of each
317 subpattern, so as to detect when an empty string has been matched by a
318 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
319 are on the heap, not on the stack. */
320
321 typedef struct eptrblock {
322 struct eptrblock *epb_prev;
323 const uschar *epb_saved_eptr;
324 } eptrblock;
325
326 /* Flag bits for the match() function */
327
328 #define match_condassert 0x01 /* Called to check a condition assertion */
329 #define match_isgroup 0x02 /* Set if start of bracketed group */
330
331 /* Non-error returns from the match() function. Error returns are externally
332 defined PCRE_ERROR_xxx codes, which are all negative. */
333
334 #define MATCH_MATCH 1
335 #define MATCH_NOMATCH 0
336
337
338
339 /*************************************************
340 * Global variables *
341 *************************************************/
342
343 /* PCRE is thread-clean and doesn't use any global variables in the normal
344 sense. However, it calls memory allocation and free functions via the four
345 indirections below, and it can optionally do callouts. These values can be
346 changed by the caller, but are shared between all threads. However, when
347 compiling for Virtual Pascal, things are done differently (see pcre.in). */
348
349 #ifndef VPCOMPAT
350 #ifdef __cplusplus
351 extern "C" void *(*pcre_malloc)(size_t) = malloc;
352 extern "C" void (*pcre_free)(void *) = free;
353 extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
354 extern "C" void (*pcre_stack_free)(void *) = free;
355 extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
356 #else
357 void *(*pcre_malloc)(size_t) = malloc;
358 void (*pcre_free)(void *) = free;
359 void *(*pcre_stack_malloc)(size_t) = malloc;
360 void (*pcre_stack_free)(void *) = free;
361 int (*pcre_callout)(pcre_callout_block *) = NULL;
362 #endif
363 #endif
364
365
366 /*************************************************
367 * Macros and tables for character handling *
368 *************************************************/
369
370 /* When UTF-8 encoding is being used, a character is no longer just a single
371 byte. The macros for character handling generate simple sequences when used in
372 byte-mode, and more complicated ones for UTF-8 characters. */
373
374 #ifndef SUPPORT_UTF8
375 #define GETCHAR(c, eptr) c = *eptr;
376 #define GETCHARINC(c, eptr) c = *eptr++;
377 #define GETCHARINCTEST(c, eptr) c = *eptr++;
378 #define GETCHARLEN(c, eptr, len) c = *eptr;
379 #define BACKCHAR(eptr)
380
381 #else /* SUPPORT_UTF8 */
382
383 /* Get the next UTF-8 character, not advancing the pointer. This is called when
384 we know we are in UTF-8 mode. */
385
386 #define GETCHAR(c, eptr) \
387 c = *eptr; \
388 if ((c & 0xc0) == 0xc0) \
389 { \
390 int gcii; \
391 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
392 int gcss = 6*gcaa; \
393 c = (c & utf8_table3[gcaa]) << gcss; \
394 for (gcii = 1; gcii <= gcaa; gcii++) \
395 { \
396 gcss -= 6; \
397 c |= (eptr[gcii] & 0x3f) << gcss; \
398 } \
399 }
400
401 /* Get the next UTF-8 character, advancing the pointer. This is called when we
402 know we are in UTF-8 mode. */
403
404 #define GETCHARINC(c, eptr) \
405 c = *eptr++; \
406 if ((c & 0xc0) == 0xc0) \
407 { \
408 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
409 int gcss = 6*gcaa; \
410 c = (c & utf8_table3[gcaa]) << gcss; \
411 while (gcaa-- > 0) \
412 { \
413 gcss -= 6; \
414 c |= (*eptr++ & 0x3f) << gcss; \
415 } \
416 }
417
418 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
419
420 #define GETCHARINCTEST(c, eptr) \
421 c = *eptr++; \
422 if (md->utf8 && (c & 0xc0) == 0xc0) \
423 { \
424 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
425 int gcss = 6*gcaa; \
426 c = (c & utf8_table3[gcaa]) << gcss; \
427 while (gcaa-- > 0) \
428 { \
429 gcss -= 6; \
430 c |= (*eptr++ & 0x3f) << gcss; \
431 } \
432 }
433
434 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
435 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
436
437 #define GETCHARLEN(c, eptr, len) \
438 c = *eptr; \
439 if ((c & 0xc0) == 0xc0) \
440 { \
441 int gcii; \
442 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
443 int gcss = 6*gcaa; \
444 c = (c & utf8_table3[gcaa]) << gcss; \
445 for (gcii = 1; gcii <= gcaa; gcii++) \
446 { \
447 gcss -= 6; \
448 c |= (eptr[gcii] & 0x3f) << gcss; \
449 } \
450 len += gcaa; \
451 }
452
453 /* If the pointer is not at the start of a character, move it back until
454 it is. Called only in UTF-8 mode. */
455
456 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
457
458 #endif
459
460
461
462 /*************************************************
463 * Default character tables *
464 *************************************************/
465
466 /* A default set of character tables is included in the PCRE binary. Its source
467 is built by the maketables auxiliary program, which uses the default C ctypes
468 functions, and put in the file chartables.c. These tables are used by PCRE
469 whenever the caller of pcre_compile() does not provide an alternate set of
470 tables. */
471
472 #include "chartables.c"
473
474
475
476 #ifdef SUPPORT_UTF8
477 /*************************************************
478 * Tables for UTF-8 support *
479 *************************************************/
480
481 /* These are the breakpoints for different numbers of bytes in a UTF-8
482 character. */
483
484 static const int utf8_table1[] =
485 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
486
487 /* These are the indicator bits and the mask for the data bits to set in the
488 first byte of a character, indexed by the number of additional bytes. */
489
490 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
491 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
492
/* Table of the number of extra bytes, indexed by the first byte of the
character masked with 0x3f. The highest number for a valid UTF-8 character is
in fact 0x3d. */
496
497 static const uschar utf8_table4[] = {
498 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
499 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
500 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
501 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
502
503
504 /*************************************************
505 * Convert character value to UTF-8 *
506 *************************************************/
507
508 /* This function takes an integer value in the range 0 - 0x7fffffff
509 and encodes it as a UTF-8 character in 0 to 6 bytes.
510
511 Arguments:
512 cvalue the character value
513 buffer pointer to buffer for result - at least 6 bytes long
514
515 Returns: number of characters placed in the buffer
516 */
517
518 static int
519 ord2utf8(int cvalue, uschar *buffer)
520 {
521 register int i, j;
522 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
523 if (cvalue <= utf8_table1[i]) break;
524 buffer += i;
525 for (j = i; j > 0; j--)
526 {
527 *buffer-- = 0x80 | (cvalue & 0x3f);
528 cvalue >>= 6;
529 }
530 *buffer = utf8_table2[i] | cvalue;
531 return i + 1;
532 }
533 #endif
534
535
536
537 /*************************************************
538 * Print compiled regex *
539 *************************************************/
540
541 /* The code for doing this is held in a separate file that is also included in
542 pcretest.c. It defines a function called print_internals(). */
543
544 #ifdef DEBUG
545 #include "printint.c"
546 #endif
547
548
549
550 /*************************************************
551 * Return version string *
552 *************************************************/
553
554 #define STRING(a) # a
555 #define XSTRING(s) STRING(s)
556
557 EXPORT const char *
558 pcre_version(void)
559 {
560 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
561 }
562
563
564
565
566 /*************************************************
567 * Flip bytes in an integer *
568 *************************************************/
569
/* This function is called when the magic number in a regex doesn't match in
order to flip its bytes to see if we are dealing with a pattern that was
compiled on a host of different endianness. If so, this function is used to
flip other byte values.

The low-order n bytes of the value are reversed; any higher-order bytes are
ignored and come back as zero. All the arithmetic is done on an unsigned long,
because shifting a set bit into (or past) the sign bit of a signed long is
undefined behaviour on hosts where long is 32 bits wide. Callers pass
sizeof(field) as n, which can be the size of a long or a size_t, so any width
from 1 up to sizeof(long) is handled rather than just 2 and 4; for n == 2 and
n == 4 the result is the same as it always was.

Arguments:
  value        the number to flip
  n            the number of bytes to flip (1 to sizeof(long); larger values
                 are treated as sizeof(long), and n <= 0 yields 0)

Returns:       the flipped value
*/

static long int
byteflip(long int value, int n)
{
  unsigned long int in = (unsigned long int)value;
  unsigned long int out = 0;
  int i;

  if (n > (int)sizeof(unsigned long int)) n = (int)sizeof(unsigned long int);

  /* Peel bytes off the low end of the input and push them onto the low end of
  the output, so the first byte taken ends up as the most significant one. */

  for (i = 0; i < n; i++)
    {
    out = (out << 8) | (in & 0xffUL);
    in >>= 8;
    }

  return (long int)out;
}
591
592
593
594 /*************************************************
595 * Test for a byte-flipped compiled regex *
596 *************************************************/
597
598 /* This function is called from pce_exec() and also from pcre_fullinfo(). Its
599 job is to test whether the regex is byte-flipped - that is, it was compiled on
600 a system of opposite endianness. The function is called only when the native
601 MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
602 relevant values into a different data block, and return it.
603
604 Arguments:
605 re points to the regex
606 study points to study data, or NULL
607 internal_re points to a new regex block
608 internal_study points to a new study block
609
610 Returns: the new block if is is indeed a byte-flipped regex
611 NULL if it is not
612 */
613
614 static real_pcre *
615 try_flipped(const real_pcre *re, real_pcre *internal_re,
616 const pcre_study_data *study, pcre_study_data *internal_study)
617 {
618 if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
619 return NULL;
620
621 *internal_re = *re; /* To copy other fields */
622 internal_re->size = byteflip(re->size, sizeof(re->size));
623 internal_re->options = byteflip(re->options, sizeof(re->options));
624 internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
625 internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
626 internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
627 internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
628 internal_re->name_table_offset = byteflip(re->name_table_offset,
629 sizeof(re->name_table_offset));
630 internal_re->name_entry_size = byteflip(re->name_entry_size,
631 sizeof(re->name_entry_size));
632 internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
633
634 if (study != NULL)
635 {
636 *internal_study = *study; /* To copy other fields */
637 internal_study->size = byteflip(study->size, sizeof(study->size));
638 internal_study->options = byteflip(study->options, sizeof(study->options));
639 }
640
641 return internal_re;
642 }
643
644
645
646 /*************************************************
647 * (Obsolete) Return info about compiled pattern *
648 *************************************************/
649
650 /* This is the original "info" function. It picks potentially useful data out
651 of the private structure, but its interface was too rigid. It remains for
652 backwards compatibility. The public options are passed back in an int - though
653 the re->options field has been expanded to a long int, all the public options
654 at the low end of it, and so even on 16-bit systems this will still be OK.
655 Therefore, I haven't changed the API for pcre_info().
656
657 Arguments:
658 argument_re points to compiled code
659 optptr where to pass back the options
660 first_byte where to pass back the first character,
661 or -1 if multiline and all branches start ^,
662 or -2 otherwise
663
664 Returns: number of capturing subpatterns
665 or negative values on error
666 */
667
668 EXPORT int
669 pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
670 {
671 real_pcre internal_re;
672 const real_pcre *re = (const real_pcre *)argument_re;
673 if (re == NULL) return PCRE_ERROR_NULL;
674 if (re->magic_number != MAGIC_NUMBER)
675 {
676 re = try_flipped(re, &internal_re, NULL, NULL);
677 if (re == NULL) return PCRE_ERROR_BADMAGIC;
678 }
679 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
680 if (first_byte != NULL)
681 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
682 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
683 return re->top_bracket;
684 }
685
686
687
688 /*************************************************
689 * Return info about compiled pattern *
690 *************************************************/
691
692 /* This is a newer "info" function which has an extensible interface so
693 that additional items can be added compatibly.
694
695 Arguments:
696 argument_re points to compiled code
697 extra_data points extra data, or NULL
698 what what information is required
699 where where to put the information
700
701 Returns: 0 if data returned, negative on error
702 */
703
704 EXPORT int
705 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
706 void *where)
707 {
708 real_pcre internal_re;
709 pcre_study_data internal_study;
710 const real_pcre *re = (const real_pcre *)argument_re;
711 const pcre_study_data *study = NULL;
712
713 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
714
715 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
716 study = (const pcre_study_data *)extra_data->study_data;
717
718 if (re->magic_number != MAGIC_NUMBER)
719 {
720 re = try_flipped(re, &internal_re, study, &internal_study);
721 if (re == NULL) return PCRE_ERROR_BADMAGIC;
722 if (study != NULL) study = &internal_study;
723 }
724
725 switch (what)
726 {
727 case PCRE_INFO_OPTIONS:
728 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
729 break;
730
731 case PCRE_INFO_SIZE:
732 *((size_t *)where) = re->size;
733 break;
734
735 case PCRE_INFO_STUDYSIZE:
736 *((size_t *)where) = (study == NULL)? 0 : study->size;
737 break;
738
739 case PCRE_INFO_CAPTURECOUNT:
740 *((int *)where) = re->top_bracket;
741 break;
742
743 case PCRE_INFO_BACKREFMAX:
744 *((int *)where) = re->top_backref;
745 break;
746
747 case PCRE_INFO_FIRSTBYTE:
748 *((int *)where) =
749 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
750 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
751 break;
752
753 /* Make sure we pass back the pointer to the bit vector in the external
754 block, not the internal copy (with flipped integer fields). */
755
756 case PCRE_INFO_FIRSTTABLE:
757 *((const uschar **)where) =
758 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
759 ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
760 break;
761
762 case PCRE_INFO_LASTLITERAL:
763 *((int *)where) =
764 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
765 break;
766
767 case PCRE_INFO_NAMEENTRYSIZE:
768 *((int *)where) = re->name_entry_size;
769 break;
770
771 case PCRE_INFO_NAMECOUNT:
772 *((int *)where) = re->name_count;
773 break;
774
775 case PCRE_INFO_NAMETABLE:
776 *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
777 break;
778
779 case PCRE_INFO_DEFAULT_TABLES:
780 *((const uschar **)where) = (const uschar *)pcre_default_tables;
781 break;
782
783 default: return PCRE_ERROR_BADOPTION;
784 }
785
786 return 0;
787 }
788
789
790
791 /*************************************************
792 * Return info about what features are configured *
793 *************************************************/
794
795 /* This is function which has an extensible interface so that additional items
796 can be added compatibly.
797
798 Arguments:
799 what what information is required
800 where where to put the information
801
802 Returns: 0 if data returned, negative on error
803 */
804
805 EXPORT int
806 pcre_config(int what, void *where)
807 {
808 switch (what)
809 {
810 case PCRE_CONFIG_UTF8:
811 #ifdef SUPPORT_UTF8
812 *((int *)where) = 1;
813 #else
814 *((int *)where) = 0;
815 #endif
816 break;
817
818 case PCRE_CONFIG_UNICODE_PROPERTIES:
819 #ifdef SUPPORT_UCP
820 *((int *)where) = 1;
821 #else
822 *((int *)where) = 0;
823 #endif
824 break;
825
826 case PCRE_CONFIG_NEWLINE:
827 *((int *)where) = NEWLINE;
828 break;
829
830 case PCRE_CONFIG_LINK_SIZE:
831 *((int *)where) = LINK_SIZE;
832 break;
833
834 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
835 *((int *)where) = POSIX_MALLOC_THRESHOLD;
836 break;
837
838 case PCRE_CONFIG_MATCH_LIMIT:
839 *((unsigned int *)where) = MATCH_LIMIT;
840 break;
841
842 case PCRE_CONFIG_STACKRECURSE:
843 #ifdef NO_RECURSE
844 *((int *)where) = 0;
845 #else
846 *((int *)where) = 1;
847 #endif
848 break;
849
850 default: return PCRE_ERROR_BADOPTION;
851 }
852
853 return 0;
854 }
855
856
857
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars on stdout in printable format: printing characters
are output as themselves, anything else as a \x escape with two hex digits.
When the characters come from the subject string, the count is cut back so
that printing stops at the end of the subject.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE
                (it is not looked at otherwise)

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
  int c;

  if (is_subject)
    {
    if (length > md->end_subject - p) length = md->end_subject - p;
    }

  for (; length > 0; length--)
    {
    c = *p++;
    printf(isprint(c)? "%c" : "\\x%02x", c);
    }
}
#endif
884
885
886
887
888 /*************************************************
889 * Handle escapes *
890 *************************************************/
891
892 /* This function is called when a \ has been encountered. It either returns a
893 positive value for a simple escape such as \n, or a negative value which
894 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
895 a positive value greater than 255 may be returned. On entry, ptr is pointing at
896 the \. On exit, it is on the final character of the escape sequence.
897
898 Arguments:
899 ptrptr points to the pattern position pointer
900 errorptr points to the pointer to the error message
901 bracount number of previous extracting brackets
902 options the options bits
903 isclass TRUE if inside a character class
904
905 Returns: zero or positive => a data character
906 negative => a special escape sequence
907 on error, errorptr is set
908 */
909
910 static int
911 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
912 int options, BOOL isclass)
913 {
914 const uschar *ptr = *ptrptr;
915 int c, i;
916
917 /* If backslash is at the end of the pattern, it's an error. */
918
919 c = *(++ptr);
920 if (c == 0) *errorptr = ERR1;
921
922 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
923 a table. A non-zero result is something that can be returned immediately.
924 Otherwise further processing may be required. */
925
926 #if !EBCDIC /* ASCII coding */
927 else if (c < '0' || c > 'z') {} /* Not alphameric */
928 else if ((i = escapes[c - '0']) != 0) c = i;
929
930 #else /* EBCDIC coding */
931 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
932 else if ((i = escapes[c - 0x48]) != 0) c = i;
933 #endif
934
935 /* Escapes that need further processing, or are illegal. */
936
937 else
938 {
939 const uschar *oldptr;
940 switch (c)
941 {
942 /* A number of Perl escapes are not handled by PCRE. We give an explicit
943 error. */
944
945 case 'l':
946 case 'L':
947 case 'N':
948 case 'u':
949 case 'U':
950 *errorptr = ERR37;
951 break;
952
953 /* The handling of escape sequences consisting of a string of digits
954 starting with one that is not zero is not straightforward. By experiment,
955 the way Perl works seems to be as follows:
956
957 Outside a character class, the digits are read as a decimal number. If the
958 number is less than 10, or if there are that many previous extracting
959 left brackets, then it is a back reference. Otherwise, up to three octal
960 digits are read to form an escaped byte. Thus \123 is likely to be octal
961 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
962 value is greater than 377, the least significant 8 bits are taken. Inside a
963 character class, \ followed by a digit is always an octal number. */
964
965 case '1': case '2': case '3': case '4': case '5':
966 case '6': case '7': case '8': case '9':
967
968 if (!isclass)
969 {
970 oldptr = ptr;
971 c -= '0';
972 while ((digitab[ptr[1]] & ctype_digit) != 0)
973 c = c * 10 + *(++ptr) - '0';
974 if (c < 10 || c <= bracount)
975 {
976 c = -(ESC_REF + c);
977 break;
978 }
979 ptr = oldptr; /* Put the pointer back and fall through */
980 }
981
982 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
983 generates a binary zero byte and treats the digit as a following literal.
984 Thus we have to pull back the pointer by one. */
985
986 if ((c = *ptr) >= '8')
987 {
988 ptr--;
989 c = 0;
990 break;
991 }
992
993 /* \0 always starts an octal number, but we may drop through to here with a
994 larger first octal digit. */
995
996 case '0':
997 c -= '0';
998 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
999 c = c * 8 + *(++ptr) - '0';
1000 c &= 255; /* Take least significant 8 bits */
1001 break;
1002
1003 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1004 which can be greater than 0xff, but only if the ddd are hex digits. */
1005
1006 case 'x':
1007 #ifdef SUPPORT_UTF8
1008 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1009 {
1010 const uschar *pt = ptr + 2;
1011 register int count = 0;
1012 c = 0;
1013 while ((digitab[*pt] & ctype_xdigit) != 0)
1014 {
1015 int cc = *pt++;
1016 count++;
1017 #if !EBCDIC /* ASCII coding */
1018 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1019 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1020 #else /* EBCDIC coding */
1021 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
1022 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1023 #endif
1024 }
1025 if (*pt == '}')
1026 {
1027 if (c < 0 || count > 8) *errorptr = ERR34;
1028 ptr = pt;
1029 break;
1030 }
1031 /* If the sequence of hex digits does not end with '}', then we don't
1032 recognize this construct; fall through to the normal \x handling. */
1033 }
1034 #endif
1035
1036 /* Read just a single hex char */
1037
1038 c = 0;
1039 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1040 {
1041 int cc; /* Some compilers don't like ++ */
1042 cc = *(++ptr); /* in initializers */
1043 #if !EBCDIC /* ASCII coding */
1044 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1045 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1046 #else /* EBCDIC coding */
1047 if (cc <= 'z') cc += 64; /* Convert to upper case */
1048 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1049 #endif
1050 }
1051 break;
1052
1053 /* Other special escapes not starting with a digit are straightforward */
1054
1055 case 'c':
1056 c = *(++ptr);
1057 if (c == 0)
1058 {
1059 *errorptr = ERR2;
1060 return 0;
1061 }
1062
1063 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1064 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1065 (However, an EBCDIC equivalent has now been added.) */
1066
1067 #if !EBCDIC /* ASCII coding */
1068 if (c >= 'a' && c <= 'z') c -= 32;
1069 c ^= 0x40;
1070 #else /* EBCDIC coding */
1071 if (c >= 'a' && c <= 'z') c += 64;
1072 c ^= 0xC0;
1073 #endif
1074 break;
1075
1076 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1077 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1078 for Perl compatibility, it is a literal. This code looks a bit odd, but
1079 there used to be some cases other than the default, and there may be again
1080 in future, so I haven't "optimized" it. */
1081
1082 default:
1083 if ((options & PCRE_EXTRA) != 0) switch(c)
1084 {
1085 default:
1086 *errorptr = ERR3;
1087 break;
1088 }
1089 break;
1090 }
1091 }
1092
1093 *ptrptr = ptr;
1094 return c;
1095 }
1096
1097
1098
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Decode the property name that follows \P or \p. This is compiled only when
Unicode property support is included. On entry the pattern pointer is on the
P or p; on exit it is on the last character that was examined, which for a
well-formed item is the final character of the escape sequence.

The name is either a single character, or is enclosed in {} and may then be
preceded by ^ to request negation. A bracketed name longer than two characters
cannot match any table entry and is reported as unknown without being looked
up.

Arguments:
  ptrptr     points to the pattern position pointer
  negptr     points to a boolean that is set TRUE for negation else FALSE;
               it is left untouched if the pattern ends straight after the
               P or p
  errorptr   points to the pointer to the error message; set to ERR46 if the
               pattern ends before the item is complete, or to ERR47 if the
               name is not in the table

Returns:     the value from the matching utt entry, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
{
const uschar *ptr = *ptrptr;
char name[4];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The short form: the name is just the one following character. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The bracketed form: an optional ^ and then the name. Up to three characters
are collected so that an over-long name can be told apart from a valid one. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  for (;;)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    if (len > 2) break;
    }

  /* No closing bracket yet, so the name is too long. Look ahead to decide
  between "unknown name" (a bracket does follow) and "malformed" (the pattern
  ends first). */

  if (ch != '}')
    {
    do ptr++; while (*ptr != 0 && *ptr != '}');
    if (*ptr == '}') goto UNKNOWN_RETURN;
    goto ERROR_RETURN;
    }

  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the table of recognized names */

lo = 0;
hi = sizeof(utt)/sizeof(ucp_type_table);

while (lo < hi)
  {
  int mid = (lo + hi)/2;
  int cmp = strcmp(name, utt[mid].name);
  if (cmp == 0) return utt[mid].value;
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

UNKNOWN_RETURN:
*errorptr = ERR47;
*ptrptr = ptr;
return -1;

ERROR_RETURN:
*errorptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
1188
1189
1190
1191
1192 /*************************************************
1193 * Check for counted repeat *
1194 *************************************************/
1195
1196 /* This function is called when a '{' is encountered in a place where it might
1197 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1198 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1199 where the ddds are digits.
1200
1201 Arguments:
1202 p pointer to the first char after '{'
1203
1204 Returns: TRUE or FALSE
1205 */
1206
1207 static BOOL
1208 is_counted_repeat(const uschar *p)
1209 {
1210 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1211 while ((digitab[*p] & ctype_digit) != 0) p++;
1212 if (*p == '}') return TRUE;
1213
1214 if (*p++ != ',') return FALSE;
1215 if (*p == '}') return TRUE;
1216
1217 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1218 while ((digitab[*p] & ctype_digit) != 0) p++;
1219
1220 return (*p == '}');
1221 }
1222
1223
1224
1225 /*************************************************
1226 * Read repeat counts *
1227 *************************************************/
1228
1229 /* Read an item of the form {n,m} and return the values. This is called only
1230 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1231 so the syntax is guaranteed to be correct, but we need to check the values.
1232
1233 Arguments:
1234 p pointer to first char after '{'
1235 minp pointer to int for min
1236 maxp pointer to int for max
1237 returned as -1 if no max
1238 errorptr points to pointer to error message
1239
1240 Returns: pointer to '}' on success;
1241 current ptr on error, with errorptr set
1242 */
1243
1244 static const uschar *
1245 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1246 {
1247 int min = 0;
1248 int max = -1;
1249
1250 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1251
1252 if (*p == '}') max = min; else
1253 {
1254 if (*(++p) != '}')
1255 {
1256 max = 0;
1257 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1258 if (max < min)
1259 {
1260 *errorptr = ERR4;
1261 return p;
1262 }
1263 }
1264 }
1265
1266 /* Do paranoid checks, then fill in the required variables, and pass back the
1267 pointer to the terminating '}'. */
1268
1269 if (min > 65535 || max > 65535)
1270 *errorptr = ERR5;
1271 else
1272 {
1273 *minp = min;
1274 *maxp = max;
1275 }
1276 return p;
1277 }
1278
1279
1280
1281 /*************************************************
1282 * Find first significant op code *
1283 *************************************************/
1284
1285 /* This is called by several functions that scan a compiled expression looking
1286 for a fixed first character, or an anchoring op code etc. It skips over things
1287 that do not influence this. For some calls, a change of option is important.
1288 For some calls, it makes sense to skip negative forward and all backward
1289 assertions, and also the \b assertion; for others it does not.
1290
1291 Arguments:
1292 code pointer to the start of the group
1293 options pointer to external options
1294 optbit the option bit whose changing is significant, or
1295 zero if none are
1296 skipassert TRUE if certain assertions are to be skipped
1297
1298 Returns: pointer to the first significant opcode
1299 */
1300
1301 static const uschar*
1302 first_significant_code(const uschar *code, int *options, int optbit,
1303 BOOL skipassert)
1304 {
1305 for (;;)
1306 {
1307 switch ((int)*code)
1308 {
1309 case OP_OPT:
1310 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1311 *options = (int)code[1];
1312 code += 2;
1313 break;
1314
1315 case OP_ASSERT_NOT:
1316 case OP_ASSERTBACK:
1317 case OP_ASSERTBACK_NOT:
1318 if (!skipassert) return code;
1319 do code += GET(code, 1); while (*code == OP_ALT);
1320 code += OP_lengths[*code];
1321 break;
1322
1323 case OP_WORD_BOUNDARY:
1324 case OP_NOT_WORD_BOUNDARY:
1325 if (!skipassert) return code;
1326 /* Fall through */
1327
1328 case OP_CALLOUT:
1329 case OP_CREF:
1330 case OP_BRANUMBER:
1331 code += OP_lengths[*code];
1332 break;
1333
1334 default:
1335 return code;
1336 }
1337 }
1338 /* Control never reaches here */
1339 }
1340
1341
1342
1343
1344 /*************************************************
1345 * Find the fixed length of a pattern *
1346 *************************************************/
1347
1348 /* Scan a pattern and compute the fixed length of subject that will match it,
1349 if the length is fixed. This is needed for dealing with backward assertions.
1350 In UTF8 mode, the result is in characters rather than bytes.
1351
1352 Arguments:
1353 code points to the start of the pattern (the bracket)
1354 options the compiling options
1355
1356 Returns: the fixed length, or -1 if there is no fixed length,
1357 or -2 if \C was encountered
1358 */
1359
1360 static int
1361 find_fixedlength(uschar *code, int options)
1362 {
1363 int length = -1;
1364
1365 register int branchlength = 0;
1366 register uschar *cc = code + 1 + LINK_SIZE;
1367
1368 /* Scan along the opcodes for this branch. If we get to the end of the
1369 branch, check the length against that of the other branches. */
1370
1371 for (;;)
1372 {
1373 int d;
1374 register int op = *cc;
1375 if (op >= OP_BRA) op = OP_BRA;
1376
1377 switch (op)
1378 {
1379 case OP_BRA:
1380 case OP_ONCE:
1381 case OP_COND:
1382 d = find_fixedlength(cc, options);
1383 if (d < 0) return d;
1384 branchlength += d;
1385 do cc += GET(cc, 1); while (*cc == OP_ALT);
1386 cc += 1 + LINK_SIZE;
1387 break;
1388
1389 /* Reached end of a branch; if it's a ket it is the end of a nested
1390 call. If it's ALT it is an alternation in a nested call. If it is
1391 END it's the end of the outer call. All can be handled by the same code. */
1392
1393 case OP_ALT:
1394 case OP_KET:
1395 case OP_KETRMAX:
1396 case OP_KETRMIN:
1397 case OP_END:
1398 if (length < 0) length = branchlength;
1399 else if (length != branchlength) return -1;
1400 if (*cc != OP_ALT) return length;
1401 cc += 1 + LINK_SIZE;
1402 branchlength = 0;
1403 break;
1404
1405 /* Skip over assertive subpatterns */
1406
1407 case OP_ASSERT:
1408 case OP_ASSERT_NOT:
1409 case OP_ASSERTBACK:
1410 case OP_ASSERTBACK_NOT:
1411 do cc += GET(cc, 1); while (*cc == OP_ALT);
1412 /* Fall through */
1413
1414 /* Skip over things that don't match chars */
1415
1416 case OP_REVERSE:
1417 case OP_BRANUMBER:
1418 case OP_CREF:
1419 case OP_OPT:
1420 case OP_CALLOUT:
1421 case OP_SOD:
1422 case OP_SOM:
1423 case OP_EOD:
1424 case OP_EODN:
1425 case OP_CIRC:
1426 case OP_DOLL:
1427 case OP_NOT_WORD_BOUNDARY:
1428 case OP_WORD_BOUNDARY:
1429 cc += OP_lengths[*cc];
1430 break;
1431
1432 /* Handle literal characters */
1433
1434 case OP_CHAR:
1435 case OP_CHARNC:
1436 branchlength++;
1437 cc += 2;
1438 #ifdef SUPPORT_UTF8
1439 if ((options & PCRE_UTF8) != 0)
1440 {
1441 while ((*cc & 0xc0) == 0x80) cc++;
1442 }
1443 #endif
1444 break;
1445
1446 /* Handle exact repetitions. The count is already in characters, but we
1447 need to skip over a multibyte character in UTF8 mode. */
1448
1449 case OP_EXACT:
1450 branchlength += GET2(cc,1);
1451 cc += 4;
1452 #ifdef SUPPORT_UTF8
1453 if ((options & PCRE_UTF8) != 0)
1454 {
1455 while((*cc & 0x80) == 0x80) cc++;
1456 }
1457 #endif
1458 break;
1459
1460 case OP_TYPEEXACT:
1461 branchlength += GET2(cc,1);
1462 cc += 4;
1463 break;
1464
1465 /* Handle single-char matchers */
1466
1467 case OP_PROP:
1468 case OP_NOTPROP:
1469 cc++;
1470 /* Fall through */
1471
1472 case OP_NOT_DIGIT:
1473 case OP_DIGIT:
1474 case OP_NOT_WHITESPACE:
1475 case OP_WHITESPACE:
1476 case OP_NOT_WORDCHAR:
1477 case OP_WORDCHAR:
1478 case OP_ANY:
1479 branchlength++;
1480 cc++;
1481 break;
1482
1483 /* The single-byte matcher isn't allowed */
1484
1485 case OP_ANYBYTE:
1486 return -2;
1487
1488 /* Check a class for variable quantification */
1489
1490 #ifdef SUPPORT_UTF8
1491 case OP_XCLASS:
1492 cc += GET(cc, 1) - 33;
1493 /* Fall through */
1494 #endif
1495
1496 case OP_CLASS:
1497 case OP_NCLASS:
1498 cc += 33;
1499
1500 switch (*cc)
1501 {
1502 case OP_CRSTAR:
1503 case OP_CRMINSTAR:
1504 case OP_CRQUERY:
1505 case OP_CRMINQUERY:
1506 return -1;
1507
1508 case OP_CRRANGE:
1509 case OP_CRMINRANGE:
1510 if (GET2(cc,1) != GET2(cc,3)) return -1;
1511 branchlength += GET2(cc,1);
1512 cc += 5;
1513 break;
1514
1515 default:
1516 branchlength++;
1517 }
1518 break;
1519
1520 /* Anything else is variable length */
1521
1522 default:
1523 return -1;
1524 }
1525 }
1526 /* Control never gets here */
1527 }
1528
1529
1530
1531
1532 /*************************************************
1533 * Scan compiled regex for numbered bracket *
1534 *************************************************/
1535
1536 /* This little function scans through a compiled pattern until it finds a
1537 capturing bracket with the given number.
1538
1539 Arguments:
1540 code points to start of expression
1541 utf8 TRUE in UTF-8 mode
1542 number the required bracket number
1543
1544 Returns: pointer to the opcode for the bracket, or NULL if not found
1545 */
1546
1547 static const uschar *
1548 find_bracket(const uschar *code, BOOL utf8, int number)
1549 {
1550 #ifndef SUPPORT_UTF8
1551 utf8 = utf8; /* Stop pedantic compilers complaining */
1552 #endif
1553
1554 for (;;)
1555 {
1556 register int c = *code;
1557 if (c == OP_END) return NULL;
1558 else if (c > OP_BRA)
1559 {
1560 int n = c - OP_BRA;
1561 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1562 if (n == number) return (uschar *)code;
1563 code += OP_lengths[OP_BRA];
1564 }
1565 else
1566 {
1567 code += OP_lengths[c];
1568
1569 #ifdef SUPPORT_UTF8
1570
1571 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1572 by a multi-byte character. The length in the table is a minimum, so we have
1573 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1574 can use relatively efficient code. */
1575
1576 if (utf8) switch(c)
1577 {
1578 case OP_CHAR:
1579 case OP_CHARNC:
1580 case OP_EXACT:
1581 case OP_UPTO:
1582 case OP_MINUPTO:
1583 case OP_STAR:
1584 case OP_MINSTAR:
1585 case OP_PLUS:
1586 case OP_MINPLUS:
1587 case OP_QUERY:
1588 case OP_MINQUERY:
1589 while ((*code & 0xc0) == 0x80) code++;
1590 break;
1591
1592 /* XCLASS is used for classes that cannot be represented just by a bit
1593 map. This includes negated single high-valued characters. The length in
1594 the table is zero; the actual length is stored in the compiled code. */
1595
1596 case OP_XCLASS:
1597 code += GET(code, 1) + 1;
1598 break;
1599 }
1600 #endif
1601 }
1602 }
1603 }
1604
1605
1606
1607 /*************************************************
1608 * Scan compiled regex for recursion reference *
1609 *************************************************/
1610
1611 /* This little function scans through a compiled pattern until it finds an
1612 instance of OP_RECURSE.
1613
1614 Arguments:
1615 code points to start of expression
1616 utf8 TRUE in UTF-8 mode
1617
1618 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1619 */
1620
1621 static const uschar *
1622 find_recurse(const uschar *code, BOOL utf8)
1623 {
1624 #ifndef SUPPORT_UTF8
1625 utf8 = utf8; /* Stop pedantic compilers complaining */
1626 #endif
1627
1628 for (;;)
1629 {
1630 register int c = *code;
1631 if (c == OP_END) return NULL;
1632 else if (c == OP_RECURSE) return code;
1633 else if (c > OP_BRA)
1634 {
1635 code += OP_lengths[OP_BRA];
1636 }
1637 else
1638 {
1639 code += OP_lengths[c];
1640
1641 #ifdef SUPPORT_UTF8
1642
1643 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1644 by a multi-byte character. The length in the table is a minimum, so we have
1645 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1646 can use relatively efficient code. */
1647
1648 if (utf8) switch(c)
1649 {
1650 case OP_CHAR:
1651 case OP_CHARNC:
1652 case OP_EXACT:
1653 case OP_UPTO:
1654 case OP_MINUPTO:
1655 case OP_STAR:
1656 case OP_MINSTAR:
1657 case OP_PLUS:
1658 case OP_MINPLUS:
1659 case OP_QUERY:
1660 case OP_MINQUERY:
1661 while ((*code & 0xc0) == 0x80) code++;
1662 break;
1663
1664 /* XCLASS is used for classes that cannot be represented just by a bit
1665 map. This includes negated single high-valued characters. The length in
1666 the table is zero; the actual length is stored in the compiled code. */
1667
1668 case OP_XCLASS:
1669 code += GET(code, 1) + 1;
1670 break;
1671 }
1672 #endif
1673 }
1674 }
1675 }
1676
1677
1678
1679 /*************************************************
1680 * Scan compiled branch for non-emptiness *
1681 *************************************************/
1682
1683 /* This function scans through a branch of a compiled pattern to see whether it
1684 can match the empty string or not. It is called only from could_be_empty()
1685 below. Note that first_significant_code() skips over assertions. If we hit an
1686 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1687 whose current branch will already have been scanned.
1688
1689 Arguments:
1690 code points to start of search
1691 endcode points to where to stop
1692 utf8 TRUE if in UTF8 mode
1693
1694 Returns: TRUE if what is matched could be empty
1695 */
1696
1697 static BOOL
1698 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1699 {
1700 register int c;
1701 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1702 code < endcode;
1703 code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1704 {
1705 const uschar *ccode;
1706
1707 c = *code;
1708
1709 if (c >= OP_BRA)
1710 {
1711 BOOL empty_branch;
1712 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1713
1714 /* Scan a closed bracket */
1715
1716 empty_branch = FALSE;
1717 do
1718 {
1719 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1720 empty_branch = TRUE;
1721 code += GET(code, 1);
1722 }
1723 while (*code == OP_ALT);
1724 if (!empty_branch) return FALSE; /* All branches are non-empty */
1725 code += 1 + LINK_SIZE;
1726 c = *code;
1727 }
1728
1729 else switch (c)
1730 {
1731 /* Check for quantifiers after a class */
1732
1733 #ifdef SUPPORT_UTF8
1734 case OP_XCLASS:
1735 ccode = code + GET(code, 1);
1736 goto CHECK_CLASS_REPEAT;
1737 #endif
1738
1739 case OP_CLASS:
1740 case OP_NCLASS:
1741 ccode = code + 33;
1742
1743 #ifdef SUPPORT_UTF8
1744 CHECK_CLASS_REPEAT:
1745 #endif
1746
1747 switch (*ccode)
1748 {
1749 case OP_CRSTAR: /* These could be empty; continue */
1750 case OP_CRMINSTAR:
1751 case OP_CRQUERY:
1752 case OP_CRMINQUERY:
1753 break;
1754
1755 default: /* Non-repeat => class must match */
1756 case OP_CRPLUS: /* These repeats aren't empty */
1757 case OP_CRMINPLUS:
1758 return FALSE;
1759
1760 case OP_CRRANGE:
1761 case OP_CRMINRANGE:
1762 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1763 break;
1764 }
1765 break;
1766
1767 /* Opcodes that must match a character */
1768
1769 case OP_PROP:
1770 case OP_NOTPROP:
1771 case OP_EXTUNI:
1772 case OP_NOT_DIGIT:
1773 case OP_DIGIT:
1774 case OP_NOT_WHITESPACE:
1775 case OP_WHITESPACE:
1776 case OP_NOT_WORDCHAR:
1777 case OP_WORDCHAR:
1778 case OP_ANY:
1779 case OP_ANYBYTE:
1780 case OP_CHAR:
1781 case OP_CHARNC:
1782 case OP_NOT:
1783 case OP_PLUS:
1784 case OP_MINPLUS:
1785 case OP_EXACT:
1786 case OP_NOTPLUS:
1787 case OP_NOTMINPLUS:
1788 case OP_NOTEXACT:
1789 case OP_TYPEPLUS:
1790 case OP_TYPEMINPLUS:
1791 case OP_TYPEEXACT:
1792 return FALSE;
1793
1794 /* End of branch */
1795
1796 case OP_KET:
1797 case OP_KETRMAX:
1798 case OP_KETRMIN:
1799 case OP_ALT:
1800 return TRUE;
1801
1802 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1803 followed by a multibyte character */
1804
1805 #ifdef SUPPORT_UTF8
1806 case OP_STAR:
1807 case OP_MINSTAR:
1808 case OP_QUERY:
1809 case OP_MINQUERY:
1810 case OP_UPTO:
1811 case OP_MINUPTO:
1812 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1813 break;
1814 #endif
1815 }
1816 }
1817
1818 return TRUE;
1819 }
1820
1821
1822
1823 /*************************************************
1824 * Scan compiled regex for non-emptiness *
1825 *************************************************/
1826
1827 /* This function is called to check for left recursive calls. We want to check
1828 the current branch of the current pattern to see if it could match the empty
1829 string. If it could, we must look outwards for branches at other levels,
1830 stopping when we pass beyond the bracket which is the subject of the recursion.
1831
1832 Arguments:
1833 code points to start of the recursion
1834 endcode points to where to stop (current RECURSE item)
1835 bcptr points to the chain of current (unclosed) branch starts
1836 utf8 TRUE if in UTF-8 mode
1837
1838 Returns: TRUE if what is matched could be empty
1839 */
1840
1841 static BOOL
1842 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1843 BOOL utf8)
1844 {
1845 while (bcptr != NULL && bcptr->current >= code)
1846 {
1847 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1848 bcptr = bcptr->outer;
1849 }
1850 return TRUE;
1851 }
1852
1853
1854
1855 /*************************************************
1856 * Check for POSIX class syntax *
1857 *************************************************/
1858
1859 /* This function is called when the sequence "[:" or "[." or "[=" is
1860 encountered in a character class. It checks whether this is followed by an
1861 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1862 ".]" or "=]".
1863
1864 Argument:
1865 ptr pointer to the initial [
1866 endptr where to return the end pointer
1867 cd pointer to compile data
1868
1869 Returns: TRUE or FALSE
1870 */
1871
1872 static BOOL
1873 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1874 {
1875 int terminator; /* Don't combine these lines; the Solaris cc */
1876 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1877 if (*(++ptr) == '^') ptr++;
1878 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1879 if (*ptr == terminator && ptr[1] == ']')
1880 {
1881 *endptr = ptr;
1882 return TRUE;
1883 }
1884 return FALSE;
1885 }
1886
1887
1888
1889
1890 /*************************************************
1891 * Check POSIX class name *
1892 *************************************************/
1893
1894 /* This function is called to check the name given in a POSIX-style class entry
1895 such as [:alnum:].
1896
1897 Arguments:
1898 ptr points to the first letter
1899 len the length of the name
1900
1901 Returns: a value representing the name, or -1 if unknown
1902 */
1903
1904 static int
1905 check_posix_name(const uschar *ptr, int len)
1906 {
1907 register int yield = 0;
1908 while (posix_name_lengths[yield] != 0)
1909 {
1910 if (len == posix_name_lengths[yield] &&
1911 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1912 yield++;
1913 }
1914 return -1;
1915 }
1916
1917
1918 /*************************************************
1919 * Adjust OP_RECURSE items in repeated group *
1920 *************************************************/
1921
1922 /* OP_RECURSE items contain an offset from the start of the regex to the group
1923 that is referenced. This means that groups can be replicated for fixed
1924 repetition simply by copying (because the recursion is allowed to refer to
1925 earlier groups that are outside the current group). However, when a group is
1926 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1927 it, after it has been compiled. This means that any OP_RECURSE items within it
1928 that refer to the group itself or any contained groups have to have their
1929 offsets adjusted. That is the job of this function. Before it is called, the
1930 partially compiled regex must be temporarily terminated with OP_END.
1931
1932 Arguments:
1933 group points to the start of the group
1934 adjust the amount by which the group is to be moved
1935 utf8 TRUE in UTF-8 mode
1936 cd contains pointers to tables etc.
1937
1938 Returns: nothing
1939 */
1940
1941 static void
1942 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1943 {
1944 uschar *ptr = group;
1945 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1946 {
1947 int offset = GET(ptr, 1);
1948 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1949 ptr += 1 + LINK_SIZE;
1950 }
1951 }
1952
1953
1954
1955 /*************************************************
1956 * Insert an automatic callout point *
1957 *************************************************/
1958
1959 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1960 callout points before each pattern item.
1961
1962 Arguments:
1963 code current code pointer
1964 ptr current pattern pointer
1965 cd pointers to tables etc
1966
1967 Returns: new code pointer
1968 */
1969
1970 static uschar *
1971 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1972 {
1973 *code++ = OP_CALLOUT;
1974 *code++ = 255;
1975 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1976 PUT(code, LINK_SIZE, 0); /* Default length */
1977 return code + 2*LINK_SIZE;
1978 }
1979
1980
1981
1982 /*************************************************
1983 * Complete a callout item *
1984 *************************************************/
1985
1986 /* A callout item contains the length of the next item in the pattern, which
1987 we can't fill in till after we have reached the relevant point. This is used
1988 for both automatic and manual callouts.
1989
1990 Arguments:
1991 previous_callout points to previous callout item
1992 ptr current pattern pointer
1993 cd pointers to tables etc
1994
1995 Returns: nothing
1996 */
1997
1998 static void
1999 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2000 {
2001 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2002 PUT(previous_callout, 2 + LINK_SIZE, length);
2003 }
2004
2005
2006
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the remaining part of a class range (used in UTF-8 mode with UCP
support), find the next run of characters whose other-case forms are
consecutive, and hand back that run of other-case values. The start value is
moved on so that repeated calls work through the whole range.

Arguments:
  cptr        points to starting character value; updated when a range is
                returned, and left alone otherwise
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int ch = *cptr;
int chartype, othercase, expected;

/* Look for the first character that is a letter with an other case. */

for (;;)
  {
  if (ch > d) return FALSE;
  if (ucp_findchar(ch, &chartype, &othercase) == ucp_L && othercase != 0)
    break;
  ch++;
  }

*ocptr = othercase;
expected = othercase + 1;

/* Extend the run for as long as each following character is a letter whose
other case is the next value in sequence. */

ch++;
while (ch <= d &&
       ucp_findchar(ch, &chartype, &othercase) == ucp_L &&
       othercase == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2054
2055
2056 /*************************************************
2057 * Compile one branch *
2058 *************************************************/
2059
2060 /* Scan the pattern, compiling it into the code vector. If the options are
2061 changed during the branch, the pointer is used to change the external options
2062 bits.
2063
2064 Arguments:
2065 optionsptr pointer to the option bits
2066 brackets points to number of extracting brackets used
2067 codeptr points to the pointer to the current code point
2068 ptrptr points to the current pattern pointer
2069 errorptr points to pointer to error message
2070 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2071 reqbyteptr set to the last literal character required, else < 0
2072 bcptr points to current branch chain
2073 cd contains pointers to tables etc.
2074
2075 Returns: TRUE on success
2076 FALSE, with *errorptr set on error
2077 */
2078
2079 static BOOL
2080 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2081 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2082 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2083 {
2084 int repeat_type, op_type;
2085 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2086 int bravalue = 0;
2087 int greedy_default, greedy_non_default;
2088 int firstbyte, reqbyte;
2089 int zeroreqbyte, zerofirstbyte;
2090 int req_caseopt, reqvary, tempreqvary;
2091 int condcount = 0;
2092 int options = *optionsptr;
2093 int after_manual_callout = 0;
2094 register int c;
2095 register uschar *code = *codeptr;
2096 uschar *tempcode;
2097 BOOL inescq = FALSE;
2098 BOOL groupsetfirstbyte = FALSE;
2099 const uschar *ptr = *ptrptr;
2100 const uschar *tempptr;
2101 uschar *previous = NULL;
2102 uschar *previous_callout = NULL;
2103 uschar classbits[32];
2104
2105 #ifdef SUPPORT_UTF8
2106 BOOL class_utf8;
2107 BOOL utf8 = (options & PCRE_UTF8) != 0;
2108 uschar *class_utf8data;
2109 uschar utf8_char[6];
2110 #else
2111 BOOL utf8 = FALSE;
2112 #endif
2113
2114 /* Set up the default and non-default settings for greediness */
2115
2116 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2117 greedy_non_default = greedy_default ^ 1;
2118
2119 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2120 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2121 matches a non-fixed char first char; reqbyte just remains unset if we never
2122 find one.
2123
2124 When we hit a repeat whose minimum is zero, we may have to adjust these values
2125 to take the zero repeat into account. This is implemented by setting them to
2126 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2127 item types that can be repeated set these backoff variables appropriately. */
2128
2129 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2130
2131 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2132 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2133 value > 255. It is added into the firstbyte or reqbyte variables to record the
2134 case status of the value. This is used only for ASCII characters. */
2135
2136 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2137
2138 /* Switch on next character until the end of the branch */
2139
2140 for (;; ptr++)
2141 {
2142 BOOL negate_class;
2143 BOOL possessive_quantifier;
2144 BOOL is_quantifier;
2145 int class_charcount;
2146 int class_lastchar;
2147 int newoptions;
2148 int recno;
2149 int skipbytes;
2150 int subreqbyte;
2151 int subfirstbyte;
2152 int mclength;
2153 uschar mcbuffer[8];
2154
2155 /* Next byte in the pattern */
2156
2157 c = *ptr;
2158
2159 /* If in \Q...\E, check for the end; if not, we have a literal */
2160
2161 if (inescq && c != 0)
2162 {
2163 if (c == '\\' && ptr[1] == 'E')
2164 {
2165 inescq = FALSE;
2166 ptr++;
2167 continue;
2168 }
2169 else
2170 {
2171 if (previous_callout != NULL)
2172 {
2173 complete_callout(previous_callout, ptr, cd);
2174 previous_callout = NULL;
2175 }
2176 if ((options & PCRE_AUTO_CALLOUT) != 0)
2177 {
2178 previous_callout = code;
2179 code = auto_callout(code, ptr, cd);
2180 }
2181 goto NORMAL_CHAR;
2182 }
2183 }
2184
2185 /* Fill in length of a previous callout, except when the next thing is
2186 a quantifier. */
2187
2188 is_quantifier = c == '*' || c == '+' || c == '?' ||
2189 (c == '{' && is_counted_repeat(ptr+1));
2190
2191 if (!is_quantifier && previous_callout != NULL &&
2192 after_manual_callout-- <= 0)
2193 {
2194 complete_callout(previous_callout, ptr, cd);
2195 previous_callout = NULL;
2196 }
2197
2198 /* In extended mode, skip white space and comments */
2199
2200 if ((options & PCRE_EXTENDED) != 0)
2201 {
2202 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2203 if (c == '#')
2204 {
2205 /* The space before the ; is to avoid a warning on a silly compiler
2206 on the Macintosh. */
2207 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2208 if (c != 0) continue; /* Else fall through to handle end of string */
2209 }
2210 }
2211
2212 /* No auto callout for quantifiers. */
2213
2214 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2215 {
2216 previous_callout = code;
2217 code = auto_callout(code, ptr, cd);
2218 }
2219
2220 switch(c)
2221 {
2222 /* The branch terminates at end of string, |, or ). */
2223
2224 case 0:
2225 case '|':
2226 case ')':
2227 *firstbyteptr = firstbyte;
2228 *reqbyteptr = reqbyte;
2229 *codeptr = code;
2230 *ptrptr = ptr;
2231 return TRUE;
2232
2233 /* Handle single-character metacharacters. In multiline mode, ^ disables
2234 the setting of any following char as a first character. */
2235
2236 case '^':
2237 if ((options & PCRE_MULTILINE) != 0)
2238 {
2239 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2240 }
2241 previous = NULL;
2242 *code++ = OP_CIRC;
2243 break;
2244
2245 case '$':
2246 previous = NULL;
2247 *code++ = OP_DOLL;
2248 break;
2249
2250 /* There can never be a first char if '.' is first, whatever happens about
2251 repeats. The value of reqbyte doesn't change either. */
2252
2253 case '.':
2254 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2255 zerofirstbyte = firstbyte;
2256 zeroreqbyte = reqbyte;
2257 previous = code;
2258 *code++ = OP_ANY;
2259 break;
2260
2261 /* Character classes. If the included characters are all < 255 in value, we
2262 build a 32-byte bitmap of the permitted characters, except in the special
2263 case where there is only one such character. For negated classes, we build
2264 the map as usual, then invert it at the end. However, we use a different
2265 opcode so that data characters > 255 can be handled correctly.
2266
2267 If the class contains characters outside the 0-255 range, a different
2268 opcode is compiled. It may optionally have a bit map for characters < 256,
2269 but those above are explicitly listed afterwards. A flag byte tells
2270 whether the bitmap is present, and whether this is a negated class or not.
2271 */
2272
2273 case '[':
2274 previous = code;
2275
2276 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2277 they are encountered at the top level, so we'll do that too. */
2278
2279 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2280 check_posix_syntax(ptr, &tempptr, cd))
2281 {
2282 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2283 goto FAILED;
2284 }
2285
2286 /* If the first character is '^', set the negation flag and skip it. */
2287
2288 if ((c = *(++ptr)) == '^')
2289 {
2290 negate_class = TRUE;
2291 c = *(++ptr);
2292 }
2293 else
2294 {
2295 negate_class = FALSE;
2296 }
2297
2298 /* Keep a count of chars with values < 256 so that we can optimize the case
2299 of just a single character (as long as it's < 256). For higher valued UTF-8
2300 characters, we don't yet do any optimization. */
2301
2302 class_charcount = 0;
2303 class_lastchar = -1;
2304
2305 #ifdef SUPPORT_UTF8
2306 class_utf8 = FALSE; /* No chars >= 256 */
2307 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
2308 #endif
2309
2310 /* Initialize the 32-char bit map to all zeros. We have to build the
2311 map in a temporary bit of store, in case the class contains only 1
2312 character (< 256), because in that case the compiled code doesn't use the
2313 bit map. */
2314
2315 memset(classbits, 0, 32 * sizeof(uschar));
2316
2317 /* Process characters until ] is reached. By writing this as a "do" it
2318 means that an initial ] is taken as a data character. The first pass
2319 through the regex checked the overall syntax, so we don't need to be very
2320 strict here. At the start of the loop, c contains the first byte of the
2321 character. */
2322
2323 do
2324 {
2325 #ifdef SUPPORT_UTF8
2326 if (utf8 && c > 127)
2327 { /* Braces are required because the */
2328 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2329 }
2330 #endif
2331
2332 /* Inside \Q...\E everything is literal except \E */
2333
2334 if (inescq)
2335 {
2336 if (c == '\\' && ptr[1] == 'E')
2337 {
2338 inescq = FALSE;
2339 ptr++;
2340 continue;
2341 }
2342 else goto LONE_SINGLE_CHARACTER;
2343 }
2344
2345 /* Handle POSIX class names. Perl allows a negation extension of the
2346 form [:^name:]. A square bracket that doesn't match the syntax is
2347 treated as a literal. We also recognize the POSIX constructions
2348 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2349 5.6 and 5.8 do. */
2350
2351 if (c == '[' &&
2352 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2353 check_posix_syntax(ptr, &tempptr, cd))
2354 {
2355 BOOL local_negate = FALSE;
2356 int posix_class, i;
2357 register const uschar *cbits = cd->cbits;
2358
2359 if (ptr[1] != ':')
2360 {
2361 *errorptr = ERR31;
2362 goto FAILED;
2363 }
2364
2365 ptr += 2;
2366 if (*ptr == '^')
2367 {
2368 local_negate = TRUE;
2369 ptr++;
2370 }
2371
2372 posix_class = check_posix_name(ptr, tempptr - ptr);
2373 if (posix_class < 0)
2374 {
2375 *errorptr = ERR30;
2376 goto FAILED;
2377 }
2378
2379 /* If matching is caseless, upper and lower are converted to
2380 alpha. This relies on the fact that the class table starts with
2381 alpha, lower, upper as the first 3 entries. */
2382
2383 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2384 posix_class = 0;
2385
2386 /* Or into the map we are building up to 3 of the static class
2387 tables, or their negations. The [:blank:] class sets up the same
2388 chars as the [:space:] class (all white space). We remove the vertical
2389 white space chars afterwards. */
2390
2391 posix_class *= 3;
2392 for (i = 0; i < 3; i++)
2393 {
2394 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2395 int taboffset = posix_class_maps[posix_class + i];
2396 if (taboffset < 0) break;
2397 if (local_negate)
2398 {
2399 if (i == 0)
2400 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2401 else
2402 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2403 if (blankclass) classbits[1] |= 0x3c;
2404 }
2405 else
2406 {
2407 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2408 if (blankclass) classbits[1] &= ~0x3c;
2409 }
2410 }
2411
2412 ptr = tempptr + 1;
2413 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2414 continue; /* End of POSIX syntax handling */
2415 }
2416
2417 /* Backslash may introduce a single character, or it may introduce one
2418 of the specials, which just set a flag. Escaped items are checked for
2419 validity in the pre-compiling pass. The sequence \b is a special case.
2420 Inside a class (and only there) it is treated as backspace. Elsewhere
2421 it marks a word boundary. Other escapes have preset maps ready to
2422 or into the one we are building. We assume they have more than one
2423 character in them, so set class_charcount bigger than one. */
2424
2425 if (c == '\\')
2426 {
2427 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2428
2429 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2430 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2431 else if (-c == ESC_Q) /* Handle start of quoted string */
2432 {
2433 if (ptr[1] == '\\' && ptr[2] == 'E')
2434 {
2435 ptr += 2; /* avoid empty string */
2436 }
2437 else inescq = TRUE;
2438 continue;
2439 }
2440
2441 if (c < 0)
2442 {
2443 register const uschar *cbits = cd->cbits;
2444 class_charcount += 2; /* Greater than 1 is what matters */
2445 switch (-c)
2446 {
2447 case ESC_d:
2448 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2449 continue;
2450
2451 case ESC_D:
2452 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2453 continue;
2454
2455 case ESC_w:
2456 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2457 continue;
2458
2459 case ESC_W:
2460 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2461 continue;
2462
2463 case ESC_s:
2464 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2465 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2466 continue;
2467
2468 case ESC_S:
2469 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2470 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2471 continue;
2472
2473 #ifdef SUPPORT_UCP
2474 case ESC_p:
2475 case ESC_P:
2476 {
2477 BOOL negated;
2478 int property = get_ucp(&ptr, &negated, errorptr);
2479 if (property < 0) goto FAILED;
2480 class_utf8 = TRUE;
2481 *class_utf8data++ = ((-c == ESC_p) != negated)?
2482 XCL_PROP : XCL_NOTPROP;
2483 *class_utf8data++ = property;
2484 class_charcount -= 2; /* Not a < 256 character */
2485 }
2486 continue;
2487 #endif
2488
2489 /* Unrecognized escapes are faulted if PCRE is running in its
2490 strict mode. By default, for compatibility with Perl, they are
2491 treated as literals. */
2492
2493 default:
2494 if ((options & PCRE_EXTRA) != 0)
2495 {
2496 *errorptr = ERR7;
2497 goto FAILED;
2498 }
2499 c = *ptr; /* The final character */
2500 class_charcount -= 2; /* Undo the default count from above */
2501 }
2502 }
2503
2504 /* Fall through if we have a single character (c >= 0). This may be
2505 > 256 in UTF-8 mode. */
2506
2507 } /* End of backslash handling */
2508
2509 /* A single character may be followed by '-' to form a range. However,
2510 Perl does not permit ']' to be the end of the range. A '-' character
2511 here is treated as a literal. */
2512
2513 if (ptr[1] == '-' && ptr[2] != ']')
2514 {
2515 int d;
2516 ptr += 2;
2517
2518 #ifdef SUPPORT_UTF8
2519 if (utf8)
2520 { /* Braces are required because the */
2521 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2522 }
2523 else
2524 #endif
2525 d = *ptr; /* Not UTF-8 mode */
2526
2527 /* The second part of a range can be a single-character escape, but
2528 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2529 in such circumstances. */
2530
2531 if (d == '\\')
2532 {
2533 const uschar *oldptr = ptr;
2534 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2535
2536 /* \b is backspace; \X is literal X; any other special means the '-'
2537 was literal */
2538
2539 if (d < 0)
2540 {
2541 if (d == -ESC_b) d = '\b';
2542 else if (d == -ESC_X) d = 'X'; else
2543 {
2544 ptr = oldptr - 2;
2545 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2546 }
2547 }
2548 }
2549
2550 /* The check that the two values are in the correct order happens in
2551 the pre-pass. Optimize one-character ranges */
2552
2553 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2554
2555 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2556 matching, we have to use an XCLASS with extra data items. Caseless
2557 matching for characters > 127 is available only if UCP support is
2558 available. */
2559
2560 #ifdef SUPPORT_UTF8
2561 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2562 {
2563 class_utf8 = TRUE;
2564
2565 /* With UCP support, we can find the other case equivalents of
2566 the relevant characters. There may be several ranges. Optimize how
2567 they fit with the basic range. */
2568
2569 #ifdef SUPPORT_UCP
2570 if ((options & PCRE_CASELESS) != 0)
2571 {
2572 int occ, ocd;
2573 int cc = c;
2574 int origd = d;
2575 while (get_othercase_range(&cc, origd, &occ, &ocd))
2576 {
2577 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2578
2579 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2580 { /* if there is overlap, */
2581 c = occ; /* noting that if occ < c */
2582 continue; /* we can't have ocd > d */
2583 } /* because a subrange is */
2584 if (ocd > d && occ <= d + 1) /* always shorter than */
2585 { /* the basic range. */
2586 d = ocd;
2587 continue;
2588 }
2589
2590 if (occ == ocd)
2591 {
2592 *class_utf8data++ = XCL_SINGLE;
2593 }
2594 else
2595 {
2596 *class_utf8data++ = XCL_RANGE;
2597 class_utf8data += ord2utf8(occ, class_utf8data);
2598 }
2599 class_utf8data += ord2utf8(ocd, class_utf8data);
2600 }
2601 }
2602 #endif /* SUPPORT_UCP */
2603
2604 /* Now record the original range, possibly modified for UCP caseless
2605 overlapping ranges. */
2606
2607 *class_utf8data++ = XCL_RANGE;
2608 class_utf8data += ord2utf8(c, class_utf8data);
2609 class_utf8data += ord2utf8(d, class_utf8data);
2610
2611 /* With UCP support, we are done. Without UCP support, there is no
2612 caseless matching for UTF-8 characters > 127; we can use the bit map
2613 for the smaller ones. */
2614
2615 #ifdef SUPPORT_UCP
2616 continue; /* With next character in the class */
2617 #else
2618 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2619
2620 /* Adjust upper limit and fall through to set up the map */
2621
2622 d = 127;
2623
2624 #endif /* SUPPORT_UCP */
2625 }
2626 #endif /* SUPPORT_UTF8 */
2627
2628 /* We use the bit map for all cases when not in UTF-8 mode; else
2629 ranges that lie entirely within 0-127 when there is UCP support; else
2630 for partial ranges without UCP support. */
2631
2632 for (; c <= d; c++)
2633 {
2634 classbits[c/8] |= (1 << (c&7));
2635 if ((options & PCRE_CASELESS) != 0)
2636 {
2637 int uc = cd->fcc[c]; /* flip case */
2638 classbits[uc/8] |= (1 << (uc&7));
2639 }
2640 class_charcount++; /* in case a one-char range */
2641 class_lastchar = c;
2642 }
2643
2644 continue; /* Go get the next char in the class */
2645 }
2646
2647 /* Handle a lone single character - we can get here for a normal
2648 non-escape char, or after \ that introduces a single character or for an
2649 apparent range that isn't. */
2650
2651 LONE_SINGLE_CHARACTER:
2652
2653 /* Handle a character that cannot go in the bit map */
2654
2655 #ifdef SUPPORT_UTF8
2656 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2657 {
2658 class_utf8 = TRUE;
2659 *class_utf8data++ = XCL_SINGLE;
2660 class_utf8data += ord2utf8(c, class_utf8data);
2661
2662 #ifdef SUPPORT_UCP
2663 if ((options & PCRE_CASELESS) != 0)
2664 {
2665 int chartype;
2666 int othercase;
2667 if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2668 {
2669 *class_utf8data++ = XCL_SINGLE;
2670 class_utf8data += ord2utf8(othercase, class_utf8data);
2671 }
2672 }
2673 #endif /* SUPPORT_UCP */
2674
2675 }
2676 else
2677 #endif /* SUPPORT_UTF8 */
2678
2679 /* Handle a single-byte character */
2680 {
2681 classbits[c/8] |= (1 << (c&7));
2682 if ((options & PCRE_CASELESS) != 0)
2683 {
2684 c = cd->fcc[c]; /* flip case */
2685 classbits[c/8] |= (1 << (c&7));
2686 }
2687 class_charcount++;
2688 class_lastchar = c;
2689 }
2690 }
2691
2692 /* Loop until ']' reached; the check for end of string happens inside the
2693 loop. This "while" is the end of the "do" above. */
2694
2695 while ((c = *(++ptr)) != ']' || inescq);
2696
2697 /* If class_charcount is 1, we saw precisely one character whose value is
2698 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2699 can optimize the negative case only if there were no characters >= 128
2700 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2701 single-bytes only. This is an historical hangover. Maybe one day we can
2702 tidy these opcodes to handle multi-byte characters.
2703
2704 The optimization throws away the bit map. We turn the item into a
2705 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2706 that OP_NOT does not support multibyte characters. In the positive case, it
2707 can cause firstbyte to be set. Otherwise, there can be no first char if
2708 this item is first, whatever repeat count may follow. In the case of
2709 reqbyte, save the previous value for reinstating. */
2710
2711 #ifdef SUPPORT_UTF8
2712 if (class_charcount == 1 &&
2713 (!utf8 ||
2714 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2715
2716 #else
2717 if (class_charcount == 1)
2718 #endif
2719 {
2720 zeroreqbyte = reqbyte;
2721
2722 /* The OP_NOT opcode works on one-byte characters only. */
2723
2724 if (negate_class)
2725 {
2726 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2727 zerofirstbyte = firstbyte;
2728 *code++ = OP_NOT;
2729 *code++ = class_lastchar;
2730 break;
2731 }
2732
2733 /* For a single, positive character, get the value into mcbuffer, and
2734 then we can handle this with the normal one-character code. */
2735
2736 #ifdef SUPPORT_UTF8
2737 if (utf8 && class_lastchar > 127)
2738 mclength = ord2utf8(class_lastchar, mcbuffer);
2739 else
2740 #endif
2741 {
2742 mcbuffer[0] = class_lastchar;
2743 mclength = 1;
2744 }
2745 goto ONE_CHAR;
2746 } /* End of 1-char optimization */
2747
2748 /* The general case - not the one-char optimization. If this is the first
2749 thing in the branch, there can be no first char setting, whatever the
2750 repeat count. Any reqbyte setting must remain unchanged after any kind of
2751 repeat. */
2752
2753 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2754 zerofirstbyte = firstbyte;
2755 zeroreqbyte = reqbyte;
2756
2757 /* If there are characters with values > 255, we have to compile an
2758 extended class, with its own opcode. If there are no characters < 256,
2759 we can omit the bitmap. */
2760
2761 #ifdef SUPPORT_UTF8
2762 if (class_utf8)
2763 {
2764 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2765 *code++ = OP_XCLASS;
2766 code += LINK_SIZE;
2767 *code = negate_class? XCL_NOT : 0;
2768
2769 /* If the map is required, install it, and move on to the end of
2770 the extra data */
2771
2772 if (class_charcount > 0)
2773 {
2774 *code++ |= XCL_MAP;
2775 memcpy(code, classbits, 32);
2776 code = class_utf8data;
2777 }
2778
2779 /* If the map is not required, slide down the extra data. */
2780
2781 else
2782 {
2783 int len = class_utf8data - (code + 33);
2784 memmove(code + 1, code + 33, len);
2785 code += len + 1;
2786 }
2787
2788 /* Now fill in the complete length of the item */
2789
2790 PUT(previous, 1, code - previous);
2791 break; /* End of class handling */
2792 }
2793 #endif
2794
2795 /* If there are no characters > 255, negate the 32-byte map if necessary,
2796 and copy it into the code vector. If this is the first thing in the branch,
2797 there can be no first char setting, whatever the repeat count. Any reqbyte
2798 setting must remain unchanged after any kind of repeat. */
2799
2800 if (negate_class)
2801 {
2802 *code++ = OP_NCLASS;
2803 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2804 }
2805 else
2806 {
2807 *code++ = OP_CLASS;
2808 memcpy(code, classbits, 32);
2809 }
2810 code += 32;
2811 break;
2812
2813 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2814 has been tested above. */
2815
2816 case '{':
2817 if (!is_quantifier) goto NORMAL_CHAR;
2818 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2819 if (*errorptr != NULL) goto FAILED;
2820 goto REPEAT;
2821
2822 case '*':
2823 repeat_min = 0;
2824 repeat_max = -1;
2825 goto REPEAT;
2826
2827 case '+':
2828 repeat_min = 1;
2829 repeat_max = -1;
2830 goto REPEAT;
2831
2832 case '?':
2833 repeat_min = 0;
2834 repeat_max = 1;
2835
2836 REPEAT:
2837 if (previous == NULL)
2838 {
2839 *errorptr = ERR9;
2840 goto FAILED;
2841 }
2842
2843 if (repeat_min == 0)
2844 {
2845 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2846 reqbyte = zeroreqbyte; /* Ditto */
2847 }
2848
2849 /* Remember whether this is a variable length repeat */
2850
2851 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2852
2853 op_type = 0; /* Default single-char op codes */
2854 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2855
2856 /* Save start of previous item, in case we have to move it up to make space
2857 for an inserted OP_ONCE for the additional '+' extension. */
2858
2859 tempcode = previous;
2860
2861 /* If the next character is '+', we have a possessive quantifier. This
2862 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2863 If the next character is '?' this is a minimizing repeat, by default,
2864 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2865 repeat type to the non-default. */
2866
2867 if (ptr[1] == '+')
2868 {
2869 repeat_type = 0; /* Force greedy */
2870 possessive_quantifier = TRUE;
2871 ptr++;
2872 }
2873 else if (ptr[1] == '?')
2874 {
2875 repeat_type = greedy_non_default;
2876 ptr++;
2877 }
2878 else repeat_type = greedy_default;
2879
2880 /* If previous was a recursion, we need to wrap it inside brackets so that
2881 it can be replicated if necessary. */
2882
2883 if (*previous == OP_RECURSE)
2884 {
2885 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2886 code += 1 + LINK_SIZE;
2887 *previous = OP_BRA;
2888 PUT(previous, 1, code - previous);
2889 *code = OP_KET;
2890 PUT(code, 1, code - previous);
2891 code += 1 + LINK_SIZE;
2892 }
2893
2894 /* If previous was a character match, abolish the item and generate a
2895 repeat item instead. If a char item has a minimum of more than one, ensure
2896 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2897 the first thing in a branch because the x will have gone into firstbyte
2898 instead. */
2899
2900 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2901 {
2902 /* Deal with UTF-8 characters that take up more than one byte. It's
2903 easier to write this out separately than try to macrify it. Use c to
2904 hold the length of the character in bytes, plus 0x80 to flag that it's a
2905 length rather than a small character. */
2906
2907 #ifdef SUPPORT_UTF8
2908 if (utf8 && (code[-1] & 0x80) != 0)
2909 {
2910 uschar *lastchar = code - 1;
2911 while((*lastchar & 0xc0) == 0x80) lastchar--;
2912 c = code - lastchar; /* Length of UTF-8 character */
2913 memcpy(utf8_char, lastchar, c); /* Save the char */
2914 c |= 0x80; /* Flag c as a length */
2915 }
2916 else
2917 #endif
2918
2919 /* Handle the case of a single byte - either with no UTF8 support, or
2920 with UTF-8 disabled, or for a UTF-8 character < 128. */
2921
2922 {
2923 c = code[-1];
2924 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2925 }
2926
2927 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2928 }
2929
2930 /* If previous was a single negated character ([^a] or similar), we use
2931 one of the special opcodes, replacing it. The code is shared with single-
2932 character repeats by setting op_type to add a suitable offset into
2933 repeat_type. OP_NOT is currently used only for single-byte chars. */
2934
2935 else if (*previous == OP_NOT)
2936 {
2937 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2938 c = previous[1];
2939 goto OUTPUT_SINGLE_REPEAT;
2940 }
2941
2942 /* If previous was a character type match (\d or similar), abolish it and
2943 create a suitable repeat item. The code is shared with single-character
2944 repeats by setting op_type to add a suitable offset into repeat_type. Note
2945 that the Unicode property types will be present only when SUPPORT_UCP is
2946 defined, but we don't wrap the little bits of code here because it just
2947 makes it horribly messy. */
2948
2949 else if (*previous < OP_EODN)
2950 {
2951 uschar *oldcode;
2952 int prop_type;
2953 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2954 c = *previous;
2955
2956 OUTPUT_SINGLE_REPEAT:
2957 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2958 previous[1] : -1;
2959
2960 oldcode = code;
2961 code = previous; /* Usually overwrite previous item */
2962
2963 /* If the maximum is zero then the minimum must also be zero; Perl allows
2964 this case, so we do too - by simply omitting the item altogether. */
2965
2966 if (repeat_max == 0) goto END_REPEAT;
2967
2968 /* All real repeats make it impossible to handle partial matching (maybe
2969 one day we will be able to remove this restriction). */
2970
2971 if (repeat_max != 1) cd->nopartial = TRUE;
2972
2973 /* Combine the op_type with the repeat_type */
2974
2975 repeat_type += op_type;
2976
2977 /* A minimum of zero is handled either as the special case * or ?, or as
2978 an UPTO, with the maximum given. */
2979
2980 if (repeat_min == 0)
2981 {
2982 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2983 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2984 else
2985 {
2986 *code++ = OP_UPTO + repeat_type;
2987 PUT2INC(code, 0, repeat_max);
2988 }
2989 }
2990
2991 /* A repeat minimum of 1 is optimized into some special cases. If the
2992 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2993 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2994 one less than the maximum. */
2995
2996 else if (repeat_min == 1)
2997 {
2998 if (repeat_max == -1)
2999 *code++ = OP_PLUS + repeat_type;
3000 else
3001 {
3002 code = oldcode; /* leave previous item in place */
3003 if (repeat_max == 1) goto END_REPEAT;
3004 *code++ = OP_UPTO + repeat_type;
3005 PUT2INC(code, 0, repeat_max - 1);
3006 }
3007 }
3008
3009 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3010 handled as an EXACT followed by an UPTO. */
3011
3012 else
3013 {
3014 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3015 PUT2INC(code, 0, repeat_min);
3016
3017 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3018 we have to insert the character for the previous code. For a repeated
3019 Unicode property match, there is an extra byte that defines the
3020 required property. In UTF-8 mode, long characters have their length in
3021 c, with the 0x80 bit as a flag. */
3022
3023 if (repeat_max < 0)
3024 {
3025 #ifdef SUPPORT_UTF8
3026 if (utf8 && c >= 128)
3027 {
3028 memcpy(code, utf8_char, c & 7);
3029 code += c & 7;
3030 }
3031 else
3032 #endif
3033 {
3034 *code++ = c;
3035 if (prop_type >= 0) *code++ = prop_type;
3036 }
3037 *code++ = OP_STAR + repeat_type;
3038 }
3039
3040 /* Else insert an UPTO if the max is greater than the min, again
3041 preceded by the character, for the previously inserted code. */
3042
3043 else if (repeat_max != repeat_min)
3044 {
3045 #ifdef SUPPORT_UTF8
3046 if (utf8 && c >= 128)
3047 {
3048 memcpy(code, utf8_char, c & 7);
3049 code += c & 7;
3050 }
3051 else
3052 #endif
3053 *code++ = c;
3054 if (prop_type >= 0) *code++ = prop_type;
3055 repeat_max -= repeat_min;
3056 *code++ = OP_UPTO + repeat_type;
3057 PUT2INC(code, 0, repeat_max);
3058 }
3059 }
3060
3061 /* The character or character type itself comes last in all cases. */
3062
3063 #ifdef SUPPORT_UTF8
3064 if (utf8 && c >= 128)
3065 {
3066 memcpy(code, utf8_char, c & 7);
3067 code += c & 7;
3068 }
3069 else
3070 #endif
3071 *code++ = c;
3072
3073 /* For a repeated Unicode property match, there is an extra byte that
3074 defines the required property. */
3075
3076 #ifdef SUPPORT_UCP
3077 if (prop_type >= 0) *code++ = prop_type;
3078 #endif
3079 }
3080
3081 /* If previous was a character class or a back reference, we put the repeat
3082 stuff after it, but just skip the item if the repeat was {0,0}. */
3083
3084 else if (*previous == OP_CLASS ||
3085 *previous == OP_NCLASS ||
3086 #ifdef SUPPORT_UTF8
3087 *previous == OP_XCLASS ||
3088 #endif
3089 *previous == OP_REF)
3090 {
3091 if (repeat_max == 0)
3092 {
3093 code = previous;
3094 goto END_REPEAT;
3095 }
3096
3097 /* All real repeats make it impossible to handle partial matching (maybe
3098 one day we will be able to remove this restriction). */
3099
3100 if (repeat_max != 1) cd->nopartial = TRUE;
3101
3102 if (repeat_min == 0 && repeat_max == -1)
3103 *code++ = OP_CRSTAR + repeat_type;
3104 else if (repeat_min == 1 && repeat_max == -1)
3105 *code++ = OP_CRPLUS + repeat_type;
3106 else if (repeat_min == 0 && repeat_max == 1)
3107 *code++ = OP_CRQUERY + repeat_type;
3108 else
3109 {
3110 *code++ = OP_CRRANGE + repeat_type;
3111 PUT2INC(code, 0, repeat_min);
3112 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3113 PUT2INC(code, 0, repeat_max);
3114 }
3115 }
3116
3117 /* If previous was a bracket group, we may have to replicate it in certain
3118 cases. */
3119
3120 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3121 *previous == OP_COND)
3122 {
3123 register int i;
3124 int ketoffset = 0;
3125 int len = code - previous;
3126 uschar *bralink = NULL;
3127
3128 /* If the maximum repeat count is unlimited, find the end of the bracket
3129 by scanning through from the start, and compute the offset back to it
3130 from the current code pointer. There may be an OP_OPT setting following
3131 the final KET, so we can't find the end just by going back from the code
3132 pointer. */
3133
3134 if (repeat_max == -1)
3135 {
3136 register uschar *ket = previous;
3137 do ket += GET(ket, 1); while (*ket != OP_KET);
3138 ketoffset = code - ket;
3139 }
3140
3141 /* The case of a zero minimum is special because of the need to stick
3142 OP_BRAZERO in front of it, and because the group appears once in the
3143 data, whereas in other cases it appears the minimum number of times. For
3144 this reason, it is simplest to treat this case separately, as otherwise
3145 the code gets far too messy. There are several special subcases when the
3146 minimum is zero. */
3147
3148 if (repeat_min == 0)
3149 {
3150 /* If the maximum is also zero, we just omit the group from the output
3151 altogether. */
3152
3153 if (repeat_max == 0)
3154 {
3155 code = previous;
3156 goto END_REPEAT;
3157 }
3158
3159 /* If the maximum is 1 or unlimited, we just have to stick in the
3160 BRAZERO and do no more at this point. However, we do need to adjust
3161 any OP_RECURSE calls inside the group that refer to the group itself or
3162 any internal group, because the offset is from the start of the whole
3163 regex. Temporarily terminate the pattern while doing this. */
3164
3165 if (repeat_max <= 1)
3166 {
3167 *code = OP_END;
3168 adjust_recurse(previous, 1, utf8, cd);
3169 memmove(previous+1, previous, len);
3170 code++;
3171 *previous++ = OP_BRAZERO + repeat_type;
3172 }
3173
3174 /* If the maximum is greater than 1 and limited, we have to replicate
3175 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3176 The first one has to be handled carefully because it's the original
3177 copy, which has to be moved up. The remainder can be handled by code
3178 that is common with the non-zero minimum case below. We have to
3179 adjust the value of repeat_max, since one less copy is required. Once
3180 again, we may have to adjust any OP_RECURSE calls inside the group. */
3181
3182 else
3183 {
3184 int offset;
3185 *code = OP_END;
3186 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3187 memmove(previous + 2 + LINK_SIZE, previous, len);
3188 code += 2 + LINK_SIZE;
3189 *previous++ = OP_BRAZERO + repeat_type;
3190 *previous++ = OP_BRA;
3191
3192 /* We chain together the bracket offset fields that have to be
3193 filled in later when the ends of the brackets are reached. */
3194
3195 offset = (bralink == NULL)? 0 : previous - bralink;
3196 bralink = previous;
3197 PUTINC(previous, 0, offset);
3198 }
3199
3200 repeat_max--;
3201 }
3202
3203 /* If the minimum is greater than zero, replicate the group as many
3204 times as necessary, and adjust the maximum to the number of subsequent
3205 copies that we need. If we set a first char from the group, and didn't
3206 set a required char, copy the latter from the former. */
3207
3208 else
3209 {
3210 if (repeat_min > 1)
3211 {
3212 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3213 for (i = 1; i < repeat_min; i++)
3214 {
3215 memcpy(code, previous, len);
3216 code += len;
3217 }
3218 }
3219 if (repeat_max > 0) repeat_max -= repeat_min;
3220 }
3221
3222 /* This code is common to both the zero and non-zero minimum cases. If
3223 the maximum is limited, it replicates the group in a nested fashion,
3224 remembering the bracket starts on a stack. In the case of a zero minimum,
3225 the first one was set up above. In all cases the repeat_max now specifies
3226 the number of additional copies needed. */
3227
3228 if (repeat_max >= 0)
3229 {
3230 for (i = repeat_max - 1; i >= 0; i--)
3231 {
3232 *code++ = OP_BRAZERO + repeat_type;
3233
3234 /* All but the final copy start a new nesting, maintaining the
3235 chain of brackets outstanding. */
3236
3237 if (i != 0)
3238 {
3239 int offset;
3240 *code++ = OP_BRA;
3241 offset = (bralink == NULL)? 0 : code - bralink;
3242 bralink = code;
3243 PUTINC(code, 0, offset);
3244 }
3245
3246 memcpy(code, previous, len);
3247 code += len;
3248 }
3249
3250 /* Now chain through the pending brackets, and fill in their length
3251 fields (which are holding the chain links pro tem). */
3252
3253 while (bralink != NULL)
3254 {
3255 int oldlinkoffset;
3256 int offset = code - bralink + 1;
3257 uschar *bra = code - offset;
3258 oldlinkoffset = GET(bra, 1);
3259 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3260 *code++ = OP_KET;
3261 PUTINC(code, 0, offset);
3262 PUT(bra, 1, offset);
3263 }
3264 }
3265
3266 /* If the maximum is unlimited, set a repeater in the final copy. We
3267 can't just offset backwards from the current code point, because we
3268 don't know if there's been an options resetting after the ket. The
3269 correct offset was computed above. */
3270
3271 else code[-ketoffset] = OP_KETRMAX + repeat_type;
3272 }
3273
3274 /* Else there's some kind of shambles */
3275
3276 else
3277 {
3278 *errorptr = ERR11;
3279 goto FAILED;
3280 }
3281
3282 /* If the character following a repeat is '+', we wrap the entire repeated
3283 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3284 Sun's Java package. The repeated item starts at tempcode, not at previous,
3285 which might be the first part of a string whose (former) last char we
3286 repeated. However, we don't support '+' after a greediness '?'. */
3287
3288 if (possessive_quantifier)
3289 {
3290 int len = code - tempcode;
3291 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3292 code += 1 + LINK_SIZE;
3293 len += 1 + LINK_SIZE;
3294 tempcode[0] = OP_ONCE;
3295 *code++ = OP_KET;
3296 PUTINC(code, 0, len);
3297 PUT(tempcode, 1, len);
3298 }
3299
3300 /* In all cases we no longer have a previous item. We also set the
3301 "follows varying string" flag for subsequently encountered reqbytes if
3302 it isn't already set and we have just passed a varying length item. */
3303
3304 END_REPEAT:
3305 previous = NULL;
3306 cd->req_varyopt |= reqvary;
3307 break;
3308
3309
3310 /* Start of nested bracket sub-expression, or comment or lookahead or
3311 lookbehind or option setting or condition. First deal with special things
3312 that can come after a bracket; all are introduced by ?, and the appearance
3313 of any of them means that this is not a referencing group. They were
3314 checked for validity in the first pass over the string, so we don't have to
3315 check for syntax errors here. */
3316
3317 case '(':
3318 newoptions = options;
3319 skipbytes = 0;
3320
3321 if (*(++ptr) == '?')
3322 {
3323 int set, unset;
3324 int *optset;
3325
3326 switch (*(++ptr))
3327 {
3328 case '#': /* Comment; skip to ket */
3329 ptr++;
3330 while (*ptr != ')') ptr++;
3331 continue;
3332
3333 case ':': /* Non-extracting bracket */
3334 bravalue = OP_BRA;
3335 ptr++;
3336 break;
3337
3338 case '(':
3339 bravalue = OP_COND; /* Conditional group */
3340
3341 /* Condition to test for recursion */
3342
3343 if (ptr[1] == 'R')
3344 {
3345 code[1+LINK_SIZE] = OP_CREF;
3346 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3347 skipbytes = 3;
3348 ptr += 3;
3349 }
3350
3351 /* Condition to test for a numbered subpattern match. We know that
3352 if a digit follows ( then there will just be digits until ) because
3353 the syntax was checked in the first pass. */
3354
3355 else if ((digitab[ptr[1]] && ctype_digit) != 0)
3356 {
3357 int condref; /* Don't amalgamate; some compilers */
3358 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
3359 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3360 if (condref == 0)
3361 {
3362 *errorptr = ERR35;
3363 goto FAILED;
3364 }
3365 ptr++;
3366 code[1+LINK_SIZE] = OP_CREF;
3367 PUT2(code, 2+LINK_SIZE, condref);
3368 skipbytes = 3;
3369 }
3370 /* For conditions that are assertions, we just fall through, having
3371 set bravalue above. */
3372 break;
3373
3374 case '=': /* Positive lookahead */
3375 bravalue = OP_ASSERT;
3376 ptr++;
3377 break;
3378
3379 case '!': /* Negative lookahead */
3380 bravalue = OP_ASSERT_NOT;
3381 ptr++;
3382 break;
3383
3384 case '<': /* Lookbehinds */
3385 switch (*(++ptr))
3386 {
3387 case '=': /* Positive lookbehind */
3388 bravalue = OP_ASSERTBACK;
3389 ptr++;
3390 break;
3391
3392 case '!': /* Negative lookbehind */
3393 bravalue = OP_ASSERTBACK_NOT;
3394 ptr++;
3395 break;
3396 }
3397 break;
3398
3399 case '>': /* One-time brackets */
3400 bravalue = OP_ONCE;
3401 ptr++;
3402 break;
3403
3404 case 'C': /* Callout - may be followed by digits; */
3405 previous_callout = code; /* Save for later completion */
3406 after_manual_callout = 1; /* Skip one item before completing */
3407 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3408 { /* closing parenthesis is present. */
3409 int n = 0;
3410 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3411 n = n * 10 + *ptr - '0';
3412 if (n > 255)
3413 {
3414 *errorptr = ERR38;
3415 goto FAILED;
3416 }
3417 *code++ = n;
3418 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3419 PUT(code, LINK_SIZE, 0); /* Default length */
3420 code += 2 * LINK_SIZE;
3421 }
3422 previous = NULL;
3423 continue;
3424
3425 case 'P': /* Named subpattern handling */
3426 if (*(++ptr) == '<') /* Definition */
3427 {
3428 int i, namelen;
3429 uschar *slot = cd->name_table;
3430 const uschar *name; /* Don't amalgamate; some compilers */
3431 name = ++ptr; /* grumble at autoincrement in declaration */
3432
3433 while (*ptr++ != '>');
3434 namelen = ptr - name - 1;
3435
3436 for (i = 0; i < cd->names_found; i++)
3437 {
3438 int crc = memcmp(name, slot+2, namelen);
3439 if (crc == 0)
3440 {
3441 if (slot[2+namelen] == 0)
3442 {
3443 *errorptr = ERR43;
3444 goto FAILED;
3445 }
3446 crc = -1; /* Current name is substring */
3447 }
3448 if (crc < 0)
3449 {
3450 memmove(slot + cd->name_entry_size, slot,
3451 (cd->names_found - i) * cd->name_entry_size);
3452 break;
3453 }
3454 slot += cd->name_entry_size;
3455 }
3456
3457 PUT2(slot, 0, *brackets + 1);
3458 memcpy(slot + 2, name, namelen);
3459 slot[2+namelen] = 0;
3460 cd->names_found++;
3461 goto NUMBERED_GROUP;
3462 }
3463
3464 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3465 {
3466 int i, namelen;
3467 int type = *ptr++;
3468 const uschar *name = ptr;
3469 uschar *slot = cd->name_table;
3470
3471 while (*ptr != ')') ptr++;
3472 namelen = ptr - name;
3473
3474 for (i = 0; i < cd->names_found; i++)
3475 {
3476 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3477 slot += cd->name_entry_size;
3478 }
3479 if (i >= cd->names_found)
3480 {
3481 *errorptr = ERR15;
3482 goto FAILED;
3483 }
3484
3485 recno = GET2(slot, 0);
3486
3487 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3488
3489 /* Back reference */
3490
3491 previous = code;
3492 *code++ = OP_REF;
3493 PUT2INC(code, 0, recno);
3494 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3495 if (recno > cd->top_backref) cd->top_backref = recno;
3496 continue;
3497 }
3498
3499 /* Should never happen */
3500 break;
3501
3502 case 'R': /* Pattern recursion */
3503 ptr++; /* Same as (?0) */
3504 /* Fall through */
3505
3506 /* Recursion or "subroutine" call */
3507
3508 case '0': case '1': case '2': case '3': case '4':
3509 case '5': case '6': case '7': case '8': case '9':
3510 {
3511 const uschar *called;
3512 recno = 0;
3513 while((digitab[*ptr] & ctype_digit) != 0)
3514 recno = recno * 10 + *ptr++ - '0';
3515
3516 /* Come here from code above that handles a named recursion */
3517
3518 HANDLE_RECURSION:
3519
3520 previous = code;
3521
3522 /* Find the bracket that is being referenced. Temporarily end the
3523 regex in case it doesn't exist. */
3524
3525 *code = OP_END;
3526 called = (recno == 0)?
3527 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3528
3529 if (called == NULL)
3530 {
3531 *errorptr = ERR15;
3532 goto FAILED;
3533 }
3534
3535 /* If the subpattern is still open, this is a recursive call. We
3536 check to see if this is a left recursion that could loop for ever,
3537 and diagnose that case. */
3538
3539 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3540 {
3541 *errorptr = ERR40;
3542 goto FAILED;
3543 }
3544
3545 /* Insert the recursion/subroutine item */
3546
3547 *code = OP_RECURSE;
3548 PUT(code, 1, called - cd->start_code);
3549 code += 1 + LINK_SIZE;
3550 }
3551 continue;
3552
3553 /* Character after (? not specially recognized */
3554
3555 default: /* Option setting */
3556 set = unset = 0;
3557 optset = &set;
3558
3559 while (*ptr != ')' && *ptr != ':')
3560 {
3561 switch (*ptr++)
3562 {
3563 case '-': optset = &unset; break;
3564
3565 case 'i': *optset |= PCRE_CASELESS; break;
3566 case 'm': *optset |= PCRE_MULTILINE; break;
3567 case 's': *optset |= PCRE_DOTALL; break;
3568 case 'x': *optset |= PCRE_EXTENDED; break;
3569 case 'U': *optset |= PCRE_UNGREEDY; break;
3570 case 'X': *optset |= PCRE_EXTRA; break;
3571 }
3572 }
3573
3574 /* Set up the changed option bits, but don't change anything yet. */
3575
3576 newoptions = (options | set) & (~unset);
3577
3578 /* If the options ended with ')' this is not the start of a nested
3579 group with option changes, so the options change at this level. Compile
3580 code to change the ims options if this setting actually changes any of
3581 them. We also pass the new setting back so that it can be put at the
3582 start of any following branches, and when this group ends (if we are in
3583 a group), a resetting item can be compiled.
3584
3585 Note that if this item is right at the start of the pattern, the
3586 options will have been abstracted and made global, so there will be no
3587 change to compile. */
3588
3589 if (*ptr == ')')
3590 {
3591 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3592 {
3593 *code++ = OP_OPT;
3594 *code++ = newoptions & PCRE_IMS;
3595 }
3596
3597 /* Change options at this level, and pass them back for use
3598 in subsequent branches. Reset the greedy defaults and the case
3599 value for firstbyte and reqbyte. */
3600
3601 *optionsptr = options = newoptions;
3602 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3603 greedy_non_default = greedy_default ^ 1;
3604 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3605
3606 previous = NULL; /* This item can't be repeated */
3607 continue; /* It is complete */
3608 }
3609
3610 /* If the options ended with ':' we are heading into a nested group
3611 with possible change of options. Such groups are non-capturing and are
3612 not assertions of any kind. All we need to do is skip over the ':';
3613 the newoptions value is handled below. */
3614
3615 bravalue = OP_BRA;
3616 ptr++;
3617 }
3618 }
3619
3620 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3621 non-capturing and behave like (?:...) brackets */
3622
3623 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3624 {
3625 bravalue = OP_BRA;
3626 }
3627
3628 /* Else we have a referencing group; adjust the opcode. If the bracket
3629 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3630 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3631
3632 else
3633 {
3634 NUMBERED_GROUP:
3635 if (++(*brackets) > EXTRACT_BASIC_MAX)
3636 {
3637 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3638 code[1+LINK_SIZE] = OP_BRANUMBER;
3639 PUT2(code, 2+LINK_SIZE, *brackets);
3640 skipbytes = 3;
3641 }
3642 else bravalue = OP_BRA + *brackets;
3643 }
3644
3645 /* Process nested bracketed re. Assertions may not be repeated, but other
3646 kinds can be. We copy code into a non-register variable in order to be able
3647 to pass its address because some compilers complain otherwise. Pass in a
3648 new setting for the ims options if they have changed. */
3649
3650 previous = (bravalue >= OP_ONCE)? code : NULL;
3651 *code = bravalue;
3652 tempcode = code;
3653 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3654
3655 if (!compile_regex(
3656 newoptions, /* The complete new option state */
3657 options & PCRE_IMS, /* The previous ims option state */
3658 brackets, /* Extracting bracket count */
3659 &tempcode, /* Where to put code (updated) */
3660 &ptr, /* Input pointer (updated) */
3661 errorptr, /* Where to put an error message */
3662 (bravalue == OP_ASSERTBACK ||
3663 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3664 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3665 &subfirstbyte, /* For possible first char */
3666 &subreqbyte, /* For possible last char */
3667 bcptr, /* Current branch chain */
3668 cd)) /* Tables block */
3669 goto FAILED;
3670
3671 /* At the end of compiling, code is still pointing to the start of the
3672 group, while tempcode has been updated to point past the end of the group
3673 and any option resetting that may follow it. The pattern pointer (ptr)
3674 is on the bracket. */
3675
3676 /* If this is a conditional bracket, check that there are no more than
3677 two branches in the group. */
3678
3679 else if (bravalue == OP_COND)
3680 {
3681 uschar *tc = code;
3682 condcount = 0;
3683
3684 do {
3685 condcount++;
3686 tc += GET(tc,1);
3687 }
3688 while (*tc != OP_KET);
3689
3690 if (condcount > 2)
3691 {
3692 *errorptr = ERR27;
3693 goto FAILED;
3694 }
3695
3696 /* If there is just one branch, we must not make use of its firstbyte or
3697 reqbyte, because this is equivalent to an empty second branch. */
3698
3699 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3700 }
3701
3702 /* Handle updating of the required and first characters. Update for normal
3703 brackets of all kinds, and conditions with two branches (see code above).
3704 If the bracket is followed by a quantifier with zero repeat, we have to
3705 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3706 main loop so that they can be accessed for the back off. */
3707
3708 zeroreqbyte = reqbyte;
3709 zerofirstbyte = firstbyte;
3710 groupsetfirstbyte = FALSE;
3711
3712 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3713 {
3714 /* If we have not yet set a firstbyte in this branch, take it from the
3715 subpattern, remembering that it was set here so that a repeat of more
3716 than one can replicate it as reqbyte if necessary. If the subpattern has
3717 no firstbyte, set "none" for the whole branch. In both cases, a zero
3718 repeat forces firstbyte to "none". */
3719
3720 if (firstbyte == REQ_UNSET)
3721 {
3722 if (subfirstbyte >= 0)
3723 {
3724 firstbyte = subfirstbyte;
3725 groupsetfirstbyte = TRUE;
3726 }
3727 else firstbyte = REQ_NONE;
3728 zerofirstbyte = REQ_NONE;
3729 }
3730
3731 /* If firstbyte was previously set, convert the subpattern's firstbyte
3732 into reqbyte if there wasn't one, using the vary flag that was in
3733 existence beforehand. */
3734
3735 else if (subfirstbyte >= 0 && subreqbyte < 0)
3736 subreqbyte = subfirstbyte | tempreqvary;
3737
3738 /* If the subpattern set a required byte (or set a first byte that isn't
3739 really the first byte - see above), set it. */
3740
3741 if (subreqbyte >= 0) reqbyte = subreqbyte;
3742 }
3743
3744 /* For a forward assertion, we take the reqbyte, if set. This can be
3745 helpful if the pattern that follows the assertion doesn't set a different
3746 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3747 for an assertion, however because it leads to incorrect effect for patterns
3748 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3749 of a firstbyte. This is overcome by a scan at the end if there's no
3750 firstbyte, looking for an asserted first char. */
3751
3752 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3753
3754 /* Now update the main code pointer to the end of the group. */
3755
3756 code = tempcode;
3757
3758 /* Error if hit end of pattern */
3759
3760 if (*ptr != ')')
3761 {
3762 *errorptr = ERR14;
3763 goto FAILED;
3764 }
3765 break;
3766
3767 /* Check \ for being a real metacharacter; if not, fall through and handle
3768 it as a data character at the start of a string. Escape items are checked
3769 for validity in the pre-compiling pass. */
3770
3771 case '\\':
3772 tempptr = ptr;
3773 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3774
3775 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3776 are arranged to be the negation of the corresponding OP_values. For the
3777 back references, the values are ESC_REF plus the reference number. Only
3778 back references and those types that consume a character may be repeated.
3779 We can test for values between ESC_b and ESC_Z for the latter; this may
3780 have to change if any new ones are ever created. */
3781
3782 if (c < 0)
3783 {
3784 if (-c == ESC_Q) /* Handle start of quoted string */
3785 {
3786 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3787 else inescq = TRUE;
3788 continue;
3789 }
3790
3791 /* For metasequences that actually match a character, we disable the
3792 setting of a first character if it hasn't already been set. */
3793
3794 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3795 firstbyte = REQ_NONE;
3796
3797 /* Set values to reset to if this is followed by a zero repeat. */
3798
3799 zerofirstbyte = firstbyte;
3800 zeroreqbyte = reqbyte;
3801
3802 /* Back references are handled specially */
3803
3804 if (-c >= ESC_REF)
3805 {
3806 int number = -c - ESC_REF;
3807 previous = code;
3808 *code++ = OP_REF;
3809 PUT2INC(code, 0, number);
3810 }
3811
3812 /* So are Unicode property matches, if supported. We know that get_ucp
3813 won't fail because it was tested in the pre-pass. */
3814
3815 #ifdef SUPPORT_UCP
3816 else if (-c == ESC_P || -c == ESC_p)
3817 {
3818 BOOL negated;
3819 int value = get_ucp(&ptr, &negated, errorptr);
3820 previous = code;
3821 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3822 *code++ = value;
3823 }
3824 #endif
3825
3826 /* For the rest, we can obtain the OP value by negating the escape
3827 value */
3828
3829 else
3830 {
3831 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3832 *code++ = -c;
3833 }
3834 continue;
3835 }
3836
3837 /* We have a data character whose value is in c. In UTF-8 mode it may have
3838 a value > 127. We set its representation in the length/buffer, and then
3839 handle it as a data character. */
3840
3841 #ifdef SUPPORT_UTF8
3842 if (utf8 && c > 127)
3843 mclength = ord2utf8(c, mcbuffer);
3844 else
3845 #endif
3846
3847 {
3848 mcbuffer[0] = c;
3849 mclength = 1;
3850 }
3851
3852 goto ONE_CHAR;
3853
3854 /* Handle a literal character. It is guaranteed not to be whitespace or #
3855 when the extended flag is set. If we are in UTF-8 mode, it may be a
3856 multi-byte literal character. */
3857
3858 default:
3859 NORMAL_CHAR:
3860 mclength = 1;
3861 mcbuffer[0] = c;
3862
3863 #ifdef SUPPORT_UTF8
3864 if (utf8 && (c & 0xc0) == 0xc0)
3865 {
3866 while ((ptr[1] & 0xc0) == 0x80)
3867 mcbuffer[mclength++] = *(++ptr);
3868 }
3869 #endif
3870
3871 /* At this point we have the character's bytes in mcbuffer, and the length
3872 in mclength. When not in UTF-8 mode, the length is always 1. */
3873
3874 ONE_CHAR:
3875 previous = code;
3876 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3877 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3878
3879 /* Set the first and required bytes appropriately. If no previous first
3880 byte, set it from this character, but revert to none on a zero repeat.
3881 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3882 repeat. */
3883
3884 if (firstbyte == REQ_UNSET)
3885 {
3886 zerofirstbyte = REQ_NONE;
3887 zeroreqbyte = reqbyte;
3888
3889 /* If the character is more than one byte long, we can set firstbyte
3890 only if it is not to be matched caselessly. */
3891
3892 if (mclength == 1 || req_caseopt == 0)
3893 {
3894 firstbyte = mcbuffer[0] | req_caseopt;
3895 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3896 }
3897 else firstbyte = reqbyte = REQ_NONE;
3898 }
3899
3900 /* firstbyte was previously set; we can set reqbyte only if the length is
3901 1 or the matching is caseful. */
3902
3903 else
3904 {
3905 zerofirstbyte = firstbyte;
3906 zeroreqbyte = reqbyte;
3907 if (mclength == 1 || req_caseopt == 0)
3908 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3909 }
3910
3911 break; /* End of literal character handling */
3912 }
3913 } /* end of big loop */
3914
3915 /* Control never reaches here by falling through, only by a goto for all the
3916 error states. Pass back the position in the pattern so that it can be displayed
3917 to the user for diagnosing the error. */
3918
3919 FAILED:
3920 *ptrptr = ptr;
3921 return FALSE;
3922 }
3923
3924
3925
3926
3927 /*************************************************
3928 * Compile sequence of alternatives *
3929 *************************************************/
3930
3931 /* On entry, ptr is pointing past the bracket character, but on return
3932 it points to the closing bracket, or vertical bar, or end of string.
3933 The code variable is pointing at the byte into which the BRA operator has been
3934 stored. If the ims options are changed at the start (for a (?ims: group) or
3935 during any branch, we need to insert an OP_OPT item at the start of every
3936 following branch to ensure they get set correctly at run time, and also pass
3937 the new options into every subsequent branch compile.
3938
3939 Argument:
3940 options option bits, including any changes for this subpattern
3941 oldims previous settings of ims option bits
3942 brackets -> int containing the number of extracting brackets used
3943 codeptr -> the address of the current code pointer
3944 ptrptr -> the address of the current pattern pointer
3945 errorptr -> pointer to error message
3946 lookbehind TRUE if this is a lookbehind assertion
3947 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3948 firstbyteptr place to put the first required character, or a negative number
3949 reqbyteptr place to put the last required character, or a negative number
3950 bcptr pointer to the chain of currently open branches
3951 cd points to the data block with tables pointers etc.
3952
3953 Returns: TRUE on success
3954 */
3955
3956 static BOOL
3957 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3958 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3959 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3960 {
3961 const uschar *ptr = *ptrptr;
3962 uschar *code = *codeptr;
3963 uschar *last_branch = code;
3964 uschar *start_bracket = code;
3965 uschar *reverse_count = NULL;
3966 int firstbyte, reqbyte;
3967 int branchfirstbyte, branchreqbyte;
3968 branch_chain bc;
3969
3970 bc.outer = bcptr;
3971 bc.current = code;
3972
3973 firstbyte = reqbyte = REQ_UNSET;
3974
3975 /* Offset is set zero to mark that this bracket is still open */
3976
3977 PUT(code, 1, 0);
3978 code += 1 + LINK_SIZE + skipbytes;
3979
3980 /* Loop for each alternative branch */
3981
3982 for (;;)
3983 {
3984 /* Handle a change of ims options at the start of the branch */
3985
3986 if ((options & PCRE_IMS) != oldims)
3987 {
3988 *code++ = OP_OPT;
3989 *code++ = options & PCRE_IMS;
3990 }
3991
3992 /* Set up dummy OP_REVERSE if lookbehind assertion */
3993
3994 if (lookbehind)
3995 {
3996 *code++ = OP_REVERSE;
3997 reverse_count = code;
3998 PUTINC(code, 0, 0);
3999 }
4000
4001 /* Now compile the branch */
4002
4003 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4004 &branchfirstbyte, &branchreqbyte, &bc, cd))
4005 {
4006 *ptrptr = ptr;
4007 return FALSE;
4008 }
4009
4010 /* If this is the first branch, the firstbyte and reqbyte values for the
4011 branch become the values for the regex. */
4012
4013 if (*last_branch != OP_ALT)
4014 {
4015 firstbyte = branchfirstbyte;
4016 reqbyte = branchreqbyte;
4017 }
4018
4019 /* If this is not the first branch, the first char and reqbyte have to
4020 match the values from all the previous branches, except that if the previous
4021 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4022 REQ_VARY for the regex. */
4023
4024 else
4025 {
4026 /* If we previously had a firstbyte, but it doesn't match the new branch,
4027 we have to abandon the firstbyte for the regex, but if there was previously
4028 no reqbyte, it takes on the value of the old firstbyte. */
4029
4030 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4031 {
4032 if (reqbyte < 0) reqbyte = firstbyte;
4033 firstbyte = REQ_NONE;
4034 }
4035
4036 /* If we (now or from before) have no firstbyte, a firstbyte from the
4037 branch becomes a reqbyte if there isn't a branch reqbyte. */
4038
4039 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4040 branchreqbyte = branchfirstbyte;
4041
4042 /* Now ensure that the reqbytes match */
4043
4044 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4045 reqbyte = REQ_NONE;
4046 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4047 }
4048
4049 /* If lookbehind, check that this branch matches a fixed-length string,
4050 and put the length into the OP_REVERSE item. Temporarily mark the end of
4051 the branch with OP_END. */
4052
4053 if (lookbehind)
4054 {
4055 int length;
4056 *code = OP_END;
4057 length = find_fixedlength(last_branch, options);
4058 DPRINTF(("fixed length = %d\n", length));
4059 if (length < 0)
4060 {
4061 *errorptr = (length == -2)? ERR36 : ERR25;
4062 *ptrptr = ptr;
4063 return FALSE;
4064 }
4065 PUT(reverse_count, 0, length);
4066 }
4067
4068 /* Reached end of expression, either ')' or end of pattern. Go back through
4069 the alternative branches and reverse the chain of offsets, with the field in
4070 the BRA item now becoming an offset to the first alternative. If there are
4071 no alternatives, it points to the end of the group. The length in the
4072 terminating ket is always the length of the whole bracketed item. If any of
4073 the ims options were changed inside the group, compile a resetting op-code
4074 following, except at the very end of the pattern. Return leaving the pointer
4075 at the terminating char. */
4076
4077 if (*ptr != '|')
4078 {
4079 int length = code - last_branch;
4080 do
4081 {
4082 int prev_length = GET(last_branch, 1);
4083 PUT(last_branch, 1, length);
4084 length = prev_length;
4085 last_branch -= length;
4086 }
4087 while (length > 0);
4088
4089 /* Fill in the ket */
4090
4091 *code = OP_KET;
4092 PUT(code, 1, code - start_bracket);
4093 code += 1 + LINK_SIZE;
4094
4095 /* Resetting option if needed */
4096
4097 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4098 {
4099 *code++ = OP_OPT;
4100 *code++ = oldims;
4101 }
4102
4103 /* Set values to pass back */
4104
4105 *codeptr = code;
4106 *ptrptr = ptr;
4107 *firstbyteptr = firstbyte;
4108 *reqbyteptr = reqbyte;
4109 return TRUE;
4110 }
4111
4112 /* Another branch follows; insert an "or" node. Its length field points back
4113 to the previous branch while the bracket remains open. At the end the chain
4114 is reversed. It's done like this so that the start of the bracket has a
4115 zero offset until it is closed, making it possible to detect recursion. */
4116
4117 *code = OP_ALT;
4118 PUT(code, 1, code - last_branch);
4119 bc.current = last_branch = code;
4120 code += 1 + LINK_SIZE;
4121 ptr++;
4122 }
4123 /* Control never reaches here */
4124 }
4125
4126
4127
4128
4129 /*************************************************
4130 * Check for anchored expression *
4131 *************************************************/
4132
4133 /* Try to find out if this is an anchored regular expression. Consider each
4134 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4135 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4136 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4137 counts, since OP_CIRC can match in the middle.
4138
4139 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4140 This is the code for \G, which means "match at start of match position, taking
4141 into account the match offset".
4142
4143 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4144 because that will try the rest of the pattern at all possible matching points,
4145 so there is no point trying again.... er ....
4146
4147 .... except when the .* appears inside capturing parentheses, and there is a
4148 subsequent back reference to those parentheses. We haven't enough information
4149 to catch that case precisely.
4150
4151 At first, the best we could do was to detect when .* was in capturing brackets
4152 and the highest back reference was greater than or equal to that level.
4153 However, by keeping a bitmap of the first 31 back references, we can catch some
4154 of the more common cases more precisely.
4155
4156 Arguments:
4157 code points to start of expression (the bracket)
4158 options points to the options setting
4159 bracket_map a bitmap of which brackets we are inside while testing; this
4160 handles up to substring 31; after that we just have to take
4161 the less precise approach
4162 backref_map the back reference bitmap
4163
4164 Returns: TRUE or FALSE
4165 */
4166
4167 static BOOL
4168 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4169 unsigned int backref_map)
4170 {
4171 do {
4172 const uschar *scode =
4173 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4174 register int op = *scode;
4175
4176 /* Capturing brackets */
4177
4178 if (op > OP_BRA)
4179 {
4180 int new_map;
4181 op -= OP_BRA;
4182 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4183 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4184 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4185 }
4186
4187 /* Other brackets */
4188
4189 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4190 {
4191 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4192 }
4193
4194 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4195 are or may be referenced. */
4196
4197 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4198 (*options & PCRE_DOTALL) != 0)
4199 {
4200 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4201 }
4202
4203 /* Check for explicit anchoring */
4204
4205 else if (op != OP_SOD && op != OP_SOM &&
4206 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4207 return FALSE;
4208 code += GET(code, 1);
4209 }
4210 while (*code == OP_ALT); /* Loop for each alternative */
4211 return TRUE;
4212 }
4213
4214
4215
4216 /*************************************************
4217 * Check for starting with ^ or .* *
4218 *************************************************/
4219
4220 /* This is called to find out if every branch starts with ^ or .* so that
4221 "first char" processing can be done to speed things up in multiline
4222 matching and for non-DOTALL patterns that start with .* (which must start at
4223 the beginning or after \n). As in the case of is_anchored() (see above), we
4224 have to take account of back references to capturing brackets that contain .*
4225 because in that case we can't make the assumption.
4226
4227 Arguments:
4228 code points to start of expression (the bracket)
4229 bracket_map a bitmap of which brackets we are inside while testing; this
4230 handles up to substring 31; after that we just have to take
4231 the less precise approach
4232 backref_map the back reference bitmap
4233
4234 Returns: TRUE or FALSE
4235 */
4236
4237 static BOOL
4238 is_startline(const uschar *code, unsigned int bracket_map,
4239 unsigned int backref_map)
4240 {
4241 do {
4242 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4243 FALSE);
4244 register int op = *scode;
4245
4246 /* Capturing brackets */
4247
4248 if (op > OP_BRA)
4249 {
4250 int new_map;
4251 op -= OP_BRA;
4252 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4253 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4254 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4255 }
4256
4257 /* Other brackets */
4258
4259 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4260 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4261
4262 /* .* means "start at start or after \n" if it isn't in brackets that
4263 may be referenced. */
4264
4265 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4266 {
4267 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4268 }
4269
4270 /* Check for explicit circumflex */
4271
4272 else if (op != OP_CIRC) return FALSE;
4273
4274 /* Move on to the next alternative */
4275
4276 code += GET(code, 1);
4277 }
4278 while (*code == OP_ALT); /* Loop for each alternative */
4279 return TRUE;
4280 }
4281
4282
4283
4284 /*************************************************
4285 * Check for asserted fixed first char *
4286 *************************************************/
4287
4288 /* During compilation, the "first char" settings from forward assertions are
4289 discarded, because they can cause conflicts with actual literals that follow.
4290 However, if we end up without a first char setting for an unanchored pattern,
4291 it is worth scanning the regex to see if there is an initial asserted first
4292 char. If all branches start with the same asserted char, or with a bracket all
4293 of whose alternatives start with the same asserted char (recurse ad lib), then
4294 we return that char, otherwise -1.
4295
4296 Arguments:
4297 code points to start of expression (the bracket)
4298 options pointer to the options (used to check casing changes)
4299 inassert TRUE if in an assertion
4300
4301 Returns: -1 or the fixed first char
4302 */
4303
4304 static int
4305 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4306 {
4307 register int c = -1;
4308 do {
4309 int d;
4310 const uschar *scode =
4311 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4312 register int op = *scode;
4313
4314 if (op >= OP_BRA) op = OP_BRA;
4315
4316 switch(op)
4317 {
4318 default:
4319 return -1;
4320
4321 case OP_BRA:
4322 case OP_ASSERT:
4323 case OP_ONCE:
4324 case OP_COND:
4325 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4326 return -1;
4327 if (c < 0) c = d; else if (c != d) return -1;
4328 break;
4329
4330 case OP_EXACT: /* Fall through */
4331 scode += 2;
4332
4333 case OP_CHAR:
4334 case OP_CHARNC:
4335 case OP_PLUS:
4336 case OP_MINPLUS:
4337 if (!inassert) return -1;
4338 if (c < 0)
4339 {
4340 c = scode[1];
4341 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4342 }
4343 else if (c != scode[1]) return -1;
4344 break;
4345 }
4346
4347 code += GET(code, 1);
4348 }
4349 while (*code == OP_ALT);
4350 return c;
4351 }
4352
4353
4354
4355
4356 #ifdef SUPPORT_UTF8
4357 /*************************************************
4358 * Validate a UTF-8 string *
4359 *************************************************/
4360
4361 /* This function is called (optionally) at the start of compile or match, to
4362 validate that a supposed UTF-8 string is actually valid. The early check means
4363 that subsequent code can assume it is dealing with a valid string. The check
4364 can be turned off for maximum performance, but then consequences of supplying
4365 an invalid string are then undefined.
4366
4367 Arguments:
4368 string points to the string
4369 length length of string, or -1 if the string is zero-terminated
4370
4371 Returns: < 0 if the string is a valid UTF-8 string
4372 >= 0 otherwise; the value is the offset of the bad byte
4373 */
4374
4375 static int
4376 valid_utf8(const uschar *string, int length)
4377 {
4378 register const uschar *p;
4379
4380 if (length < 0)
4381 {
4382 for (p = string; *p != 0; p++);
4383 length = p - string;
4384 }
4385
4386 for (p = string; length-- > 0; p++)
4387 {
4388 register int ab;
4389 register int c = *p;
4390 if (c < 128) continue;
4391 if ((c & 0xc0) != 0xc0) return p - string;
4392 ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
4393 if (length < ab) return p - string;
4394 length -= ab;
4395
4396 /* Check top bits in the second byte */
4397 if ((*(++p) & 0xc0) != 0x80) return p - string;
4398
4399 /* Check for overlong sequences for each different length */
4400 switch (ab)
4401 {
4402 /* Check for xx00 000x */
4403 case 1:
4404 if ((c & 0x3e) == 0) return p - string;
4405 continue; /* We know there aren't any more bytes to check */
4406
4407 /* Check for 1110 0000, xx0x xxxx */
4408 case 2:
4409 if (c == 0xe0 && (*p & 0x20) == 0) return p - string;
4410 break;
4411
4412 /* Check for 1111 0000, xx00 xxxx */
4413 case 3:
4414 if (c == 0xf0 && (*p & 0x30) == 0) return p - string;
4415 break;
4416
4417 /* Check for 1111 1000, xx00 0xxx */
4418 case 4:
4419 if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
4420 break;
4421
4422 /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
4423 case 5:
4424 if (c == 0xfe || c == 0xff ||
4425 (c == 0xfc && (*p & 0x3c) == 0)) return p - string;
4426 break;
4427 }
4428
4429 /* Check for valid bytes after the 2nd, if any; all must start 10 */
4430 while (--ab > 0)
4431 {
4432 if ((*(++p) & 0xc0) != 0x80) return p - string;
4433 }
4434 }
4435
4436 return -1;
4437 }
4438 #endif
4439
4440
4441
4442 /*************************************************
4443 * Compile a Regular Expression *
4444 *************************************************/
4445
4446 /* This function takes a string and returns a pointer to a block of store
4447 holding a compiled version of the expression.
4448
4449 Arguments:
4450 pattern the regular expression
4451 options various option bits
4452 errorptr pointer to pointer to error text
4453 erroroffset ptr offset in pattern where error was detected
4454 tables pointer to character tables or NULL
4455
4456 Returns: pointer to compiled data block, or NULL on error,
4457 with errorptr and erroroffset set
4458 */
4459
4460 EXPORT pcre *
4461 pcre_compile(const char *pattern, int options, const char **errorptr,
4462 int *erroroffset, const unsigned char *tables)
4463 {
4464 real_pcre *re;
4465 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4466 int c, firstbyte, reqbyte;
4467 int bracount = 0;
4468 int branch_extra = 0;
4469 int branch_newextra;
4470 int item_count = -1;
4471 int name_count = 0;
4472 int max_name_size = 0;
4473 int lastitemlength = 0;
4474 #ifdef SUPPORT_UTF8
4475 BOOL utf8;
4476 BOOL class_utf8;
4477 #endif
4478 BOOL inescq = FALSE;
4479 unsigned int brastackptr = 0;
4480 size_t size;
4481 uschar *code;
4482 const uschar *codestart;
4483 const uschar *ptr;
4484 compile_data compile_block;
4485 int brastack[BRASTACK_SIZE];
4486 uschar bralenstack[BRASTACK_SIZE];
4487
4488 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4489 can do is just return NULL. */
4490
4491 if (errorptr == NULL) return NULL;
4492 *errorptr = NULL;
4493
4494 /* However, we can give a message for this error */
4495
4496 if (erroroffset == NULL)
4497 {
4498 *errorptr = ERR16;
4499 return NULL;
4500 }
4501 *erroroffset = 0;
4502
4503 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4504
4505 #ifdef SUPPORT_UTF8
4506 utf8 = (options & PCRE_UTF8) != 0;
4507 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4508 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4509 {
4510 *errorptr = ERR44;
4511 return NULL;
4512 }
4513 #else
4514 if ((options & PCRE_UTF8) != 0)
4515 {
4516 *errorptr = ERR32;
4517 return NULL;
4518 }
4519 #endif
4520
4521 if ((options & ~PUBLIC_OPTIONS) != 0)
4522 {
4523 *errorptr = ERR17;
4524 return NULL;
4525 }
4526
4527 /* Set up pointers to the individual character tables */
4528
4529 if (tables == NULL) tables = pcre_default_tables;
4530 compile_block.lcc = tables + lcc_offset;
4531 compile_block.fcc = tables + fcc_offset;
4532 compile_block.cbits = tables + cbits_offset;
4533 compile_block.ctypes = tables + ctypes_offset;
4534
4535 /* Maximum back reference and backref bitmap. This is updated for numeric
4536 references during the first pass, but for named references during the actual
4537 compile pass. The bitmap records up to 31 back references to help in deciding
4538 whether (.*) can be treated as anchored or not. */
4539
4540 compile_block.top_backref = 0;
4541 compile_block.backref_map = 0;
4542
4543 /* Reflect pattern for debugging output */
4544
4545 DPRINTF(("------------------------------------------------------------------\n"));
4546 DPRINTF(("%s\n", pattern));
4547
4548 /* The first thing to do is to make a pass over the pattern to compute the
4549 amount of store required to hold the compiled code. This does not have to be
4550 perfect as long as errors are overestimates. At the same time we can detect any
4551 flag settings right at the start, and extract them. Make an attempt to correct
4552 for any counted white space if an "extended" flag setting appears late in the
4553 pattern. We can't be so clever for #-comments. */
4554
4555 ptr = (const uschar *)(pattern - 1);
4556 while ((c = *(++ptr)) != 0)
4557 {
4558 int min, max;
4559 int class_optcount;
4560 int bracket_length;
4561 int duplength;
4562
4563 /* If we are inside a \Q...\E sequence, all chars are literal */
4564
4565 if (inescq)
4566 {
4567 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4568 goto NORMAL_CHAR;
4569 }
4570
4571 /* Otherwise, first check for ignored whitespace and comments */
4572
4573 if ((options & PCRE_EXTENDED) != 0)
4574 {
4575 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4576 if (c == '#')
4577 {
4578 /* The space before the ; is to avoid a warning on a silly compiler
4579 on the Macintosh. */
4580 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4581 if (c == 0) break;
4582 continue;
4583 }
4584 }
4585
4586 item_count++; /* Is zero for the first non-comment item */
4587
4588 /* Allow space for auto callout before every item except quantifiers. */
4589
4590 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4591 c != '*' && c != '+' && c != '?' &&
4592 (c != '{' || !is_counted_repeat(ptr + 1)))
4593 length += 2 + 2*LINK_SIZE;
4594
4595 switch(c)
4596 {
4597 /* A backslashed item may be an escaped data character or it may be a
4598 character type. */
4599
4600 case '\\':
4601 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4602 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4603
4604 lastitemlength = 1; /* Default length of last item for repeats */
4605
4606 if (c >= 0) /* Data character */
4607 {
4608 length += 2; /* For a one-byte character */
4609
4610 #ifdef SUPPORT_UTF8
4611 if (utf8 && c > 127)
4612 {
4613 int i;
4614 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4615 if (c <= utf8_table1[i]) break;
4616 length += i;
4617 lastitemlength += i;
4618 }
4619 #endif
4620
4621 continue;
4622 }
4623
4624 /* If \Q, enter "literal" mode */
4625
4626 if (-c == ESC_Q)
4627 {
4628 inescq = TRUE;
4629 continue;
4630 }
4631
4632 /* \X is supported only if Unicode property support is compiled */
4633
4634 #ifndef SUPPORT_UCP
4635 if (-c == ESC_X)
4636 {
4637 *errorptr = ERR45;
4638 goto PCRE_ERROR_RETURN;
4639 }
4640 #endif
4641
4642 /* \P and \p are for Unicode properties, but only when the support has
4643 been compiled. Each item needs 2 bytes. */
4644
4645 else if (-c == ESC_P || -c == ESC_p)
4646 {
4647 #ifdef SUPPORT_UCP
4648 BOOL negated;
4649 length += 2;
4650 lastitemlength = 2;
4651 if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4652 continue;
4653 #else
4654 *errorptr = ERR45;
4655 goto PCRE_ERROR_RETURN;
4656 #endif
4657 }
4658
4659 /* Other escapes need one byte */
4660
4661 length++;
4662
4663 /* A back reference needs an additional 2 bytes, plus either one or 5
4664 bytes for a repeat. We also need to keep the value of the highest
4665 back reference. */
4666
4667 if (c <= -ESC_REF)
4668 {
4669 int refnum = -c - ESC_REF;
4670 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4671 if (refnum > compile_block.top_backref)
4672 compile_block.top_backref = refnum;
4673 length += 2; /* For single back reference */
4674 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4675 {
4676 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4677 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4678 if ((min == 0 && (max == 1 || max == -1)) ||
4679 (min == 1 && max == -1))
4680 length++;
4681 else length += 5;
4682 if (ptr[1] == '?') ptr++;
4683 }
4684 }
4685 continue;
4686
4687 case '^': /* Single-byte metacharacters */
4688 case '.':
4689 case '$':
4690 length++;
4691 lastitemlength = 1;
4692 continue;
4693
4694 case '*': /* These repeats won't be after brackets; */
4695 case '+': /* those are handled separately */
4696 case '?':
4697 length++;
4698 goto POSESSIVE; /* A few lines below */
4699
4700 /* This covers the cases of braced repeats after a single char, metachar,
4701 class, or back reference. */
4702
4703 case '{':
4704 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4705 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4706 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4707
4708 /* These special cases just insert one extra opcode */
4709
4710 if ((min == 0 && (max == 1 || max == -1)) ||
4711 (min == 1 && max == -1))
4712 length++;
4713
4714 /* These cases might insert additional copies of a preceding character. */
4715
4716 else
4717 {
4718 if (min != 1)
4719 {
4720 length -= lastitemlength; /* Uncount the original char or metachar */
4721 if (min > 0) length += 3 + lastitemlength;
4722 }
4723 length += lastitemlength + ((max > 0)? 3 : 1);
4724 }
4725
4726 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4727
4728 POSESSIVE: /* Test for possessive quantifier */
4729 if (ptr[1] == '+')
4730 {
4731 ptr++;
4732 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4733 }
4734 continue;
4735
4736 /* An alternation contains an offset to the next branch or ket. If any ims
4737 options changed in the previous branch(es), and/or if we are in a
4738 lookbehind assertion, extra space will be needed at the start of the
4739 branch. This is handled by branch_extra. */
4740
4741 case '|':
4742 length += 1 + LINK_SIZE + branch_extra;
4743 continue;
4744
4745 /* A character class uses 33 characters provided that all the character
4746 values are less than 256. Otherwise, it uses a bit map for low valued
4747 characters, and individual items for others. Don't worry about character
4748 types that aren't allowed in classes - they'll get picked up during the
4749 compile. A character class that contains only one single-byte character
4750 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4751 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4752
4753 case '[':
4754 if (*(++ptr) == '^')
4755 {
4756 class_optcount = 10; /* Greater than one */
4757 ptr++;
4758 }
4759 else class_optcount = 0;
4760
4761 #ifdef SUPPORT_UTF8
4762 class_utf8 = FALSE;
4763 #endif
4764
4765 /* Written as a "do" so that an initial ']' is taken as data */
4766
4767 if (*ptr != 0) do
4768 {
4769 /* Inside \Q...\E everything is literal except \E */
4770
4771 if (inescq)
4772 {
4773 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4774 inescq = FALSE;
4775 ptr += 1;
4776 continue;
4777 }
4778
4779 /* Outside \Q...\E, check for escapes */
4780
4781 if (*ptr == '\\')
4782 {
4783 c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4784 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4785
4786 /* \b is backspace inside a class; \X is literal */
4787
4788 if (-c == ESC_b) c = '\b';
4789 else if (-c == ESC_X) c = 'X';
4790
4791 /* \Q enters quoting mode */
4792
4793 else if (-c == ESC_Q)
4794 {
4795 inescq = TRUE;
4796 continue;
4797 }
4798
4799 /* Handle escapes that turn into characters */
4800
4801 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4802
4803 /* Escapes that are meta-things. The normal ones just affect the
4804 bit map, but Unicode properties require an XCLASS extended item. */
4805
4806 else
4807 {
4808 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4809 #ifdef SUPPORT_UTF8
4810 if (-c == ESC_p || -c == ESC_P)
4811 {
4812 if (!class_utf8)
4813 {
4814 class_utf8 = TRUE;
4815 length += LINK_SIZE + 2;
4816 }
4817 length += 2;
4818 }
4819 #endif
4820 }
4821 }
4822
4823 /* Check the syntax for POSIX stuff. The bits we actually handle are
4824 checked during the real compile phase. */
4825
4826 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4827 {
4828 ptr++;
4829 class_optcount = 10; /* Make sure > 1 */
4830 }
4831
4832 /* Anything else increments the possible optimization count. We have to
4833 detect ranges here so that we can compute the number of extra ranges for
4834 caseless wide characters when UCP support is available. If there are wide
4835 characters, we are going to have to use an XCLASS, even for single
4836 characters. */
4837
4838 else
4839 {
4840 int d;
4841
4842 GET_ONE_CHARACTER:
4843
4844 #ifdef SUPPORT_UTF8
4845 if (utf8)
4846 {
4847 int extra = 0;
4848 GETCHARLEN(c, ptr, extra);
4849 ptr += extra;
4850 }
4851 else c = *ptr;
4852 #else
4853 c = *ptr;
4854 #endif
4855
4856 /* Come here from handling \ above when it escapes to a char value */
4857
4858 NON_SPECIAL_CHARACTER:
4859 class_optcount++;
4860
4861 d = -1;
4862 if (ptr[1] == '-')
4863 {
4864 uschar const *hyptr = ptr++;
4865 if (ptr[1] == '\\')
4866 {
4867 ptr++;
4868 d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4869 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4870 if (-d == ESC_b) d = '\b'; /* backspace */
4871 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4872 }
4873 else if (ptr[1] != 0 && ptr[1] != ']')
4874 {
4875 ptr++;
4876 #ifdef SUPPORT_UTF8
4877 if (utf8)
4878 {
4879 int extra = 0;
4880 GETCHARLEN(d, ptr, extra);
4881 ptr += extra;
4882 }
4883 else
4884 #endif
4885 d = *ptr;
4886 }
4887 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4888 }
4889
4890 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4891 127 for caseless matching, we will need to use an XCLASS. */
4892
4893 if (d >= 0)
4894 {
4895 class_optcount = 10; /* Ensure > 1 */
4896 if (d < c)
4897 {
4898 *errorptr = ERR8;
4899 goto PCRE_ERROR_RETURN;
4900 }
4901
4902 #ifdef SUPPORT_UTF8
4903 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4904 {
4905 uschar buffer[6];
4906 if (!class_utf8) /* Allow for XCLASS overhead */
4907 {
4908 class_utf8 = TRUE;
4909 length += LINK_SIZE + 2;
4910 }
4911
4912 #ifdef SUPPORT_UCP
4913 /* If we have UCP support, find out how many extra ranges are
4914 needed to map the other case of characters within this range. We
4915 have to mimic the range optimization here, because extending the
4916 range upwards might push d over a boundary that makes it use
4917 another byte in the UTF-8 representation. */
4918
4919 if ((options & PCRE_CASELESS) != 0)
4920 {
4921 int occ, ocd;
4922 int cc = c;
4923 int origd = d;
4924 while (get_othercase_range(&cc, origd, &occ, &ocd))
4925 {
4926 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4927
4928 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4929 { /* if there is overlap, */
4930 c = occ; /* noting that if occ < c */
4931 continue; /* we can't have ocd > d */
4932 } /* because a subrange is */
4933 if (ocd > d && occ <= d + 1) /* always shorter than */
4934 { /* the basic range. */
4935 d = ocd;
4936 continue;
4937 }
4938
4939 /* An extra item is needed */
4940
4941 length += 1 + ord2utf8(occ, buffer) +
4942 ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4943 }
4944 }
4945 #endif /* SUPPORT_UCP */
4946
4947 /* The length of the (possibly extended) range */
4948
4949 length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4950 }
4951 #endif /* SUPPORT_UTF8 */
4952
4953 }
4954
4955 /* We have a single character. There is nothing to be done unless we
4956 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4957 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4958 support. */
4959
4960 else
4961 {
4962 #ifdef SUPPORT_UTF8
4963 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4964 {
4965 uschar buffer[6];
4966 class_optcount = 10; /* Ensure > 1 */
4967 if (!class_utf8) /* Allow for XCLASS overhead */
4968 {
4969 class_utf8 = TRUE;
4970 length += LINK_SIZE + 2;
4971 }
4972 #ifdef SUPPORT_UCP
4973 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4974 (1 + ord2utf8(c, buffer));
4975 #else /* SUPPORT_UCP */
4976 length += 1 + ord2utf8(c, buffer);
4977 #endif /* SUPPORT_UCP */
4978 }
4979 #endif /* SUPPORT_UTF8 */
4980 }
4981 }
4982 }
4983 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4984
4985 if (*ptr == 0) /* Missing terminating ']' */
4986 {
4987 *errorptr = ERR6;
4988 goto PCRE_ERROR_RETURN;
4989 }
4990
4991 /* We can optimize when there was only one optimizable character. Repeats
4992 for positive and negated single one-byte chars are handled by the general
4993 code. Here, we handle repeats for the class opcodes. */
4994
4995 if (class_optcount == 1) length += 3; else
4996 {
4997 length += 33;
4998
4999 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
5000 we also need extra for wrapping the whole thing in a sub-pattern. */
5001
5002 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5003 {
5004 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5005 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5006 if ((min == 0 && (max == 1 || max == -1)) ||
5007 (min == 1 && max == -1))
5008 length++;
5009 else length += 5;
5010 if (ptr[1] == '+')
5011 {
5012 ptr++;
5013 length += 2 + 2*LINK_SIZE;
5014 }
5015 else if (ptr[1] == '?') ptr++;
5016 }
5017 }
5018 continue;
5019
5020 /* Brackets may be genuine groups or special things */
5021
5022 case '(':
5023 branch_newextra = 0;
5024 bracket_length = 1 + LINK_SIZE;
5025
5026 /* Handle special forms of bracket, which all start (? */
5027
5028 if (ptr[1] == '?')
5029 {
5030 int set, unset;
5031 int *optset;
5032
5033 switch (c = ptr[2])
5034 {
5035 /* Skip over comments entirely */
5036 case '#':
5037 ptr += 3;
5038 while (*ptr != 0 && *ptr != ')') ptr++;
5039 if (*ptr == 0)
5040 {
5041 *errorptr = ERR18;
5042 goto PCRE_ERROR_RETURN;
5043 }
5044 continue;
5045
5046 /* Non-referencing groups and lookaheads just move the pointer on, and
5047 then behave like a non-special bracket, except that they don't increment
5048 the count of extracting brackets. Ditto for the "once only" bracket,
5049 which is in Perl from version 5.005. */
5050
5051 case ':':
5052 case '=':
5053 case '!':
5054 case '>':
5055 ptr += 2;
5056 break;
5057
5058 /* (?R) specifies a recursive call to the regex, which is an extension
5059 to provide the facility which can be obtained by (?p{perl-code}) in
5060 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5061
5062 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5063 the appropriate numbered brackets. This includes both recursive and
5064 non-recursive calls. (?R) is now synonymous with (?0). */
5065
5066 case 'R':
5067 ptr++;
5068
5069 case '0': case '1': case '2': case '3': case '4':
5070 case '5': case '6': case '7': case '8': case '9':
5071 ptr += 2;
5072 if (c != 'R')
5073 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5074 if (*ptr != ')')
5075 {
5076 *errorptr = ERR29;
5077 goto PCRE_ERROR_RETURN;
5078 }
5079 length += 1 + LINK_SIZE;
5080
5081 /* If this item is quantified, it will get wrapped inside brackets so
5082 as to use the code for quantified brackets. We jump down and use the
5083 code that handles this for real brackets. */
5084
5085 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5086 {
5087 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
5088 duplength = 5 + 3 * LINK_SIZE;
5089 goto HANDLE_QUANTIFIED_BRACKETS;
5090 }
5091 continue;
5092
5093 /* (?C) is an extension which provides "callout" - to provide a bit of
5094 the functionality of the Perl (?{...}) feature. An optional number may
5095 follow (default is zero). */
5096
5097 case 'C':
5098 ptr += 2;
5099 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5100 if (*ptr != ')')
5101 {
5102 *errorptr = ERR39;
5103 goto PCRE_ERROR_RETURN;
5104 }
5105 length += 2 + 2*LINK_SIZE;
5106 continue;
5107
5108 /* Named subpatterns are an extension copied from Python */
5109
5110 case 'P':
5111 ptr += 3;
5112 if (*ptr == '<')
5113 {
5114 const uschar *p; /* Don't amalgamate; some compilers */
5115 p = ++ptr; /* grumble at autoincrement in declaration */
5116 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5117 if (*ptr != '>')
5118 {
5119 *errorptr = ERR42;
5120 goto PCRE_ERROR_RETURN;
5121 }
5122 name_count++;
5123 if (ptr - p > max_name_size) max_name_size = (ptr - p);
5124 break;
5125 }
5126
5127 if (*ptr == '=' || *ptr == '>')
5128 {
5129 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5130 if (*ptr != ')')
5131 {
5132 *errorptr = ERR42;
5133 goto PCRE_ERROR_RETURN;
5134 }
5135 break;
5136 }
5137
5138 /* Unknown character after (?P */
5139
5140 *errorptr = ERR41;
5141 goto PCRE_ERROR_RETURN;
5142
5143 /* Lookbehinds are in Perl from version 5.005 */
5144
5145 case '<':
5146 ptr += 3;
5147 if (*ptr == '=' || *ptr == '!')
5148 {
5149 branch_newextra = 1 + LINK_SIZE;
5150 length += 1 + LINK_SIZE; /* For the first branch */
5151 break;
5152 }
5153 *errorptr = ERR24;
5154 goto PCRE_ERROR_RETURN;
5155
5156 /* Conditionals are in Perl from version 5.005. The bracket must either
5157 be followed by a number (for bracket reference) or by an assertion
5158 group, or (a PCRE extension) by 'R' for a recursion test. */
5159
5160 case '(':
5161 if (ptr[3] == 'R' && ptr[4] == ')')
5162 {
5163 ptr += 4;
5164 length += 3;
5165 }
5166 else if ((digitab[ptr[3]] & ctype_digit) != 0)
5167 {
5168 ptr += 4;
5169 length += 3;
5170 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5171 if (*ptr != ')')
5172 {
5173 *errorptr = ERR26;
5174 goto PCRE_ERROR_RETURN;
5175 }
5176 }
5177 else /* An assertion must follow */
5178 {
5179 ptr++; /* Can treat like ':' as far as spacing is concerned */
5180 if (ptr[2] != '?' ||
5181 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5182 {
5183 ptr += 2; /* To get right offset in message */
5184 *errorptr = ERR28;
5185 goto PCRE_ERROR_RETURN;
5186 }
5187 }
5188 break;
5189
5190 /* Else loop checking valid options until ) is met. Anything else is an
5191 error. If we are without any brackets, i.e. at top level, the settings
5192 act as if specified in the options, so massage the options immediately.
5193 This is for backward compatibility with Perl 5.004. */
5194
5195 default:
5196 set = unset = 0;
5197 optset = &set;
5198 ptr += 2;
5199
5200 for (;; ptr++)
5201 {
5202 c = *ptr;
5203 switch (c)
5204 {
5205 case 'i':
5206 *optset |= PCRE_CASELESS;
5207 continue;
5208
5209 case 'm':
5210 *optset |= PCRE_MULTILINE;
5211 continue;
5212
5213 case 's':
5214 *optset |= PCRE_DOTALL;
5215 continue;
5216
5217 case 'x':
5218 *optset |= PCRE_EXTENDED;
5219 continue;
5220
5221 case 'X':
5222 *optset |= PCRE_EXTRA;
5223 continue;
5224
5225 case 'U':
5226 *optset |= PCRE_UNGREEDY;
5227 continue;
5228
5229 case '-':
5230 optset = &unset;
5231 continue;
5232
5233 /* A termination by ')' indicates an options-setting-only item; if
5234 this is at the very start of the pattern (indicated by item_count
5235 being zero), we use it to set the global options. This is helpful
5236 when analyzing the pattern for first characters, etc. Otherwise
5237 nothing is done here and it is handled during the compiling
5238 process.
5239
5240 [Historical note: Up to Perl 5.8, options settings at top level
5241 were always global settings, wherever they appeared in the pattern.
5242 That is, they were equivalent to an external setting. From 5.8
5243 onwards, they apply only to what follows (which is what you might
5244 expect).] */
5245
5246 case ')':
5247 if (item_count == 0)
5248 {
5249 options = (options | set) & (~unset);
5250 set = unset = 0; /* To save length */
5251 item_count--; /* To allow for several */
5252 }
5253
5254 /* Fall through */
5255
5256 /* A termination by ':' indicates the start of a nested group with
5257 the given options set. This is again handled at compile time, but
5258 we must allow for compiled space if any of the ims options are
5259 set. We also have to allow for resetting space at the end of
5260 the group, which is why 4 is added to the length and not just 2.
5261 If there are several changes of options within the same group, this
5262 will lead to an over-estimate on the length, but this shouldn't
5263 matter very much. We also have to allow for resetting options at
5264 the start of any alternations, which we do by setting
5265 branch_newextra to 2. Finally, we record whether the case-dependent
5266 flag ever changes within the regex. This is used by the "required
5267 character" code. */
5268
5269 case ':':
5270 if (((set|unset) & PCRE_IMS) != 0)
5271 {
5272 length += 4;
5273 branch_newextra = 2;
5274 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5275 }
5276 goto END_OPTIONS;
5277
5278 /* Unrecognized option character */
5279
5280 default:
5281 *errorptr = ERR12;
5282 goto PCRE_ERROR_RETURN;
5283 }
5284 }
5285
5286 /* If we hit a closing bracket, that's it - this is a freestanding
5287 option-setting. We need to ensure that branch_extra is updated if
5288 necessary. The only values branch_newextra can have here are 0 or 2.
5289 If the value is 2, then branch_extra must either be 2 or 5, depending
5290 on whether this is a lookbehind group or not. */
5291
5292 END_OPTIONS:
5293 if (c == ')')
5294 {
5295 if (branch_newextra == 2 &&
5296 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5297 branch_extra += branch_newextra;
5298 continue;
5299 }
5300
5301 /* If options were terminated by ':' control comes here. Fall through
5302 to handle the group below. */
5303 }
5304 }
5305
5306 /* Extracting brackets must be counted so we can process escapes in a
5307 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5308 need an additional 3 bytes of store per extracting bracket. However, if
5309 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5310 must leave the count alone (it will always be zero). */
5311
5312 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5313 {
5314 bracount++;
5315 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5316 }
5317
5318 /* Save length for computing whole length at end if there's a repeat that
5319 requires duplication of the group. Also save the current value of
5320 branch_extra, and start the new group with the new value. If non-zero, this
5321 will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
5322
5323 if (brastackptr >= sizeof(brastack)/sizeof(int))
5324 {
5325 *errorptr = ERR19;
5326 goto PCRE_ERROR_RETURN;
5327 }
5328
5329 bralenstack[brastackptr] = branch_extra;
5330 branch_extra = branch_newextra;
5331
5332 brastack[brastackptr++] = length;
5333 length += bracket_length;
5334 continue;
5335
5336 /* Handle ket. Look for subsequent max/min; for certain sets of values we
5337 have to replicate this bracket up to that many times. If brastackptr is
5338 0 this is an unmatched bracket which will generate an error, but take care
5339 not to try to access brastack[-1] when computing the length and restoring
5340 the branch_extra value. */
5341
5342 case ')':
5343 length += 1 + LINK_SIZE;
5344 if (brastackptr > 0)
5345 {
5346 duplength = length - brastack[--brastackptr];
5347 branch_extra = bralenstack[brastackptr];
5348 }
5349 else duplength = 0;
5350
5351 /* The following code is also used when a recursion such as (?3) is
5352 followed by a quantifier, because in that case, it has to be wrapped inside
5353 brackets so that the quantifier works. The value of duplength must be
5354 set before arrival. */
5355
5356 HANDLE_QUANTIFIED_BRACKETS:
5357
5358 /* Leave ptr at the final char; for read_repeat_counts this happens
5359 automatically; for the others we need an increment. */
5360
5361 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5362 {
5363 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5364 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5365 }
5366 else if (c == '*') { min = 0; max = -1; ptr++; }
5367 else if (c == '+') { min = 1; max = -1; ptr++; }
5368 else if (c == '?') { min = 0; max = 1; ptr++; }
5369 else { min = 1; max = 1; }
5370
5371 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5372 group, and if the maximum is greater than zero, we have to replicate
5373 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5374 bracket set. */
5375
5376 if (min == 0)
5377 {
5378 length++;
5379 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5380 }
5381
5382 /* When the minimum is greater than zero, we have to replicate up to
5383 minval-1 times, with no additions required in the copies. Then, if there
5384 is a limited maximum we have to replicate up to maxval-1 times allowing
5385 for a BRAZERO item before each optional copy and nesting brackets for all
5386 but one of the optional copies. */
5387
5388 else
5389 {
5390 length += (min - 1) * duplength;
5391 if (max > min) /* Need this test as max=-1 means no limit */
5392 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5393 - (2 + 2*LINK_SIZE);
5394 }
5395
5396 /* Allow space for once brackets for "possessive quantifier" */
5397
5398 if (ptr[1] == '+')
5399 {
5400 ptr++;
5401 length += 2 + 2*LINK_SIZE;
5402 }
5403 continue;
5404
5405 /* Non-special character. It won't be space or # in extended mode, so it is
5406 always a genuine character. If we are in a \Q...\E sequence, check for the
5407 end; if not, we have a literal. */
5408
5409 default:
5410 NORMAL_CHAR:
5411
5412 if (inescq && c == '\\' && ptr[1] == 'E')
5413 {
5414 inescq = FALSE;
5415 ptr++;
5416 continue;
5417 }
5418
5419 length += 2; /* For a one-byte character */
5420 lastitemlength = 1; /* Default length of last item for repeats */
5421
5422 /* In UTF-8 mode, check for additional bytes. */
5423
5424 #ifdef SUPPORT_UTF8
5425 if (utf8 && (c & 0xc0) == 0xc0)
5426 {
5427 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5428 { /* because the end is marked */
5429 lastitemlength++; /* by a zero byte. */
5430 length++;
5431 ptr++;
5432 }
5433 }
5434 #endif
5435
5436 continue;
5437 }
5438 }
5439
5440 length += 2 + LINK_SIZE; /* For final KET and END */
5441
5442 if ((options & PCRE_AUTO_CALLOUT) != 0)
5443 length += 2 + 2*LINK_SIZE; /* For final callout */
5444
5445 if (length > MAX_PATTERN_SIZE)
5446 {
5447 *errorptr = ERR20;
5448 return NULL;
5449 }
5450
5451 /* Compute the size of data block needed and get it, either from malloc or
5452 externally provided function. */
5453
5454 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5455 re = (real_pcre *)(pcre_malloc)(size);
5456
5457 if (re == NULL)
5458 {
5459 *errorptr = ERR21;
5460 return NULL;
5461 }
5462
5463 /* Put in the magic number, and save the sizes, options, and character table
5464 pointer. NULL is used for the default character tables. The nullpad field is at
5465 the end; it's there to help in the case when a regex compiled on a system with
5466 4-byte pointers is run on another with 8-byte pointers. */
5467
5468 re->magic_number = MAGIC_NUMBER;
5469 re->size = size;
5470 re->options = options;
5471 re->dummy1 = re->dummy2 = 0;
5472 re->name_table_offset = sizeof(real_pcre);
5473 re->name_entry_size = max_name_size + 3;
5474 re->name_count = name_count;
5475 re->tables = (tables == pcre_default_tables)? NULL : tables;
5476 re->nullpad = NULL;
5477
5478 /* The starting points of the name/number translation table and of the code are
5479 passed around in the compile data block. */
5480
5481 compile_block.names_found = 0;
5482 compile_block.name_entry_size = max_name_size + 3;
5483 compile_block.name_table = (uschar *)re + re->name_table_offset;
5484 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5485 compile_block.start_code = codestart;
5486 compile_block.start_pattern = (const uschar *)pattern;
5487 compile_block.req_varyopt = 0;
5488 compile_block.nopartial = FALSE;
5489
5490 /* Set up a starting, non-extracting bracket, then compile the expression. On
5491 error, *errorptr will be set non-NULL, so we don't need to look at the result
5492 of the function here. */
5493
5494 ptr = (const uschar *)pattern;
5495 code = (uschar *)codestart;
5496 *code = OP_BRA;
5497 bracount = 0;
5498 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5499 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5500 re->top_bracket = bracount;
5501 re->top_backref = compile_block.top_backref;
5502
5503 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5504
5505 /* If not reached end of pattern on success, there's an excess bracket. */
5506
5507 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5508
5509 /* Fill in the terminating state and check for disastrous overflow, but
5510 if debugging, leave the test till after things are printed out. */
5511
5512 *code++ = OP_END;
5513
5514 #ifndef DEBUG
5515 if (code - codestart > length) *errorptr = ERR23;
5516 #endif
5517
5518 /* Give an error if there's back reference to a non-existent capturing
5519 subpattern. */
5520
5521 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5522
5523 /* Failed to compile, or error while post-processing */
5524
5525 if (*errorptr != NULL)
5526 {
5527 (pcre_free)(re);
5528 PCRE_ERROR_RETURN:
5529 *erroroffset = ptr - (const uschar *)pattern;
5530 return NULL;
5531 }
5532
5533 /* If the anchored option was not passed, set the flag if we can determine that
5534 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5535 as starting with .* when DOTALL is set).
5536
5537 Otherwise, if we know what the first character has to be, save it, because that
5538 speeds up unanchored matches no end. If not, see if we can set the
5539 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5540 start with ^. and also when all branches start with .* for non-DOTALL matches.
5541 */
5542
5543 if ((options & PCRE_ANCHORED) == 0)
5544 {
5545 int temp_options = options;
5546 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5547 re->options |= PCRE_ANCHORED;
5548 else
5549 {
5550 if (firstbyte < 0)
5551 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5552 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5553 {
5554 int ch = firstbyte & 255;
5555 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5556 compile_block.fcc[ch] == ch)? ch : firstbyte;
5557 re->options |= PCRE_FIRSTSET;
5558 }
5559 else if (is_startline(codestart, 0, compile_block.backref_map))
5560 re->options |= PCRE_STARTLINE;
5561 }
5562 }
5563
5564 /* For an anchored pattern, we use the "required byte" only if it follows a
5565 variable length item in the regex. Remove the caseless flag for non-caseable
5566 bytes. */
5567
5568 if (reqbyte >= 0 &&
5569 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5570 {
5571 int ch = reqbyte & 255;
5572 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5573 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5574 re->options |= PCRE_REQCHSET;
5575 }
5576
5577 /* Print out the compiled data for debugging */
5578
5579 #ifdef DEBUG
5580
5581 printf("Length = %d top_bracket = %d top_backref = %d\n",
5582 length, re->top_bracket, re->top_backref);
5583
5584 if (re->options != 0)
5585 {
5586 printf("%s%s%s%s%s%s%s%s%s%s\n",
5587 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5588 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5589 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5590 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5591 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5592 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5593 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5594 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5595 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5596 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5597 }
5598
5599 if ((re->options & PCRE_FIRSTSET) != 0)
5600 {
5601 int ch = re->first_byte & 255;
5602 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5603 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5604 else printf("First char = \\x%02x%s\n", ch, caseless);
5605 }
5606
5607 if ((re->options & PCRE_REQCHSET) != 0)
5608 {
5609 int ch = re->req_byte & 255;
5610 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5611 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5612 else printf("Req char = \\x%02x%s\n", ch, caseless);
5613 }
5614
5615 print_internals(re, stdout);
5616
5617 /* This check is done here in the debugging case so that the code that
5618 was compiled can be seen. */
5619
5620 if (code - codestart > length)
5621 {
5622 *errorptr = ERR23;
5623 (pcre_free)(re);
5624 *erroroffset = ptr - (uschar *)pattern;
5625 return NULL;
5626 }
5627 #endif
5628
5629 return (pcre *)re;
5630 }
5631
5632
5633
5634 /*************************************************
5635 * Match a back-reference *
5636 *************************************************/
5637
5638 /* If a back reference hasn't been set, the length that is passed is greater
5639 than the number of characters left in the string, so the match fails.
5640
5641 Arguments:
5642 offset index into the offset vector
5643 eptr points into the subject
5644 length length to be matched
5645 md points to match data block
5646 ims the ims flags
5647
5648 Returns: TRUE if matched
5649 */
5650
5651 static BOOL
5652 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5653 unsigned long int ims)
5654 {
5655 const uschar *p = md->start_subject + md->offset_vector[offset];
5656
5657 #ifdef DEBUG
5658 if (eptr >= md->end_subject)
5659 printf("matching subject <null>");
5660 else
5661 {
5662 printf("matching subject ");
5663 pchars(eptr, length, TRUE, md);
5664 }
5665 printf(" against backref ");
5666 pchars(p, length, FALSE, md);
5667 printf("\n");
5668 #endif
5669
5670 /* Always fail if not enough characters left */
5671
5672 if (length > md->end_subject - eptr) return FALSE;
5673
5674 /* Separate the caselesss case for speed */
5675
5676 if ((ims & PCRE_CASELESS) != 0)
5677 {
5678 while (length-- > 0)
5679 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
5680 }
5681 else
5682 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5683
5684 return TRUE;
5685 }
5686
5687
5688 #ifdef SUPPORT_UTF8
5689 /*************************************************
5690 * Match character against an XCLASS *
5691 *************************************************/
5692
5693 /* This function is called from within the XCLASS code below, to match a
5694 character against an extended class which might match values > 255.
5695
5696 Arguments:
5697 c the character
5698 data points to the flag byte of the XCLASS data
5699
5700 Returns: TRUE if character matches, else FALSE
5701 */
5702
5703 static BOOL
5704 match_xclass(int c, const uschar *data)
5705 {
5706 int t;
5707 BOOL negated = (*data & XCL_NOT) != 0;
5708
5709 /* Character values < 256 are matched against a bitmap, if one is present. If
5710 not, we still carry on, because there may be ranges that start below 256 in the
5711 additional data. */
5712
5713 if (c < 256)
5714 {
5715 if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
5716 return !negated; /* char found */
5717 }
5718
5719 /* First skip the bit map if present. Then match against the list of Unicode
5720 properties or large chars or ranges that end with a large char. We won't ever
5721 encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
5722
5723 if ((*data++ & XCL_MAP) != 0) data += 32;
5724
5725 while ((t = *data++) != XCL_END)
5726 {
5727 int x, y;
5728 if (t == XCL_SINGLE)
5729 {
5730 GETCHARINC(x, data);
5731 if (c == x) return !negated;
5732 }
5733 else if (t == XCL_RANGE)
5734 {
5735 GETCHARINC(x, data);
5736 GETCHARINC(y, data);
5737 if (c >= x && c <= y) return !negated;
5738 }
5739
5740 #ifdef SUPPORT_UCP
5741 else /* XCL_PROP & XCL_NOTPROP */
5742 {
5743 int chartype, othercase;
5744 int rqdtype = *data++;
5745 int category = ucp_findchar(c, &chartype, &othercase);
5746 if (rqdtype >= 128)
5747 {
5748 if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated;
5749 }
5750 else
5751 {
5752 if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
5753 }
5754 }
5755 #endif /* SUPPORT_UCP */
5756 }
5757
5758 return negated; /* char did not match */
5759 }
5760 #endif
5761
5762
5763 /***************************************************************************
5764 ****************************************************************************
5765 RECURSION IN THE match() FUNCTION
5766
5767 The match() function is highly recursive. Some regular expressions can cause
5768 it to recurse thousands of times. I was writing for Unix, so I just let it
5769 call itself recursively. This uses the stack for saving everything that has
5770 to be saved for a recursive call. On Unix, the stack can be large, and this
5771 works fine.
5772
5773 It turns out that on non-Unix systems there are problems with programs that
5774 use a lot of stack. (This despite the fact that every last chip has oodles
5775 of memory these days, and techniques for extending the stack have been known
5776 for decades.) So....
5777
5778 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5779 calls by keeping local variables that need to be preserved in blocks of memory
5780 obtained from malloc instead of on the stack. Macros are used to
5781 achieve this so that the actual code doesn't look very different to what it
5782 always used to.
5783 ****************************************************************************
5784 ***************************************************************************/
5785
5786
5787 /* These versions of the macros use the stack, as normal */
5788
5789 #ifndef NO_RECURSE
/* Recursive build: REGISTER keeps the "register" hint on the arguments of
match(), RMATCH is a plain recursive call whose result is stored in its first
argument, and RRETURN is an ordinary return. */

#define REGISTER register
#define RMATCH(result,subject,code,top,mdata,opts,blocks,fl) \
  result = match(subject,code,top,mdata,opts,blocks,fl)
#define RRETURN(value) return value
5793 #else
5794
5795
5796 /* These versions of the macros manage a private stack on the heap. Note
5797 that the rd argument of RMATCH isn't actually used. It's the md argument of
5798 match(), which never changes. */
5799
#define REGISTER

/* Heap-based replacement for a recursive call of match(). A new frame is
obtained from pcre_stack_malloc, the return point is recorded with setjmp() in
the current frame, the "arguments" are copied into the new frame, and control
jumps to HEAP_RECURSE with the new frame current. When the callee finishes,
RRETURN longjmps back here; the frame is then re-fetched from md->thisframe
and the result is copied into rx.

If the new frame cannot be allocated, the current invocation fails with
PCRE_ERROR_NOMEMORY instead of dereferencing a NULL pointer. Like any other
negative result, this is passed back through RRETURN.

Arguments:
  rx   lvalue that receives the result
  ra   eptr for the new frame
  rb   ecode for the new frame
  rc   offset_top for the new frame
  rd   the md argument of match(); not used by this macro
  re   ims for the new frame
  rf   eptrb for the new frame
  rg   flags for the new frame
*/

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }
5825
/* Heap-based replacement for "return" inside match(). The current frame is
released with pcre_stack_free and its predecessor becomes current. If there is
no predecessor, this was the top-level frame and the macro really returns from
match(). Otherwise the result is stored in the predecessor, which is also
published in md->thisframe, and control resumes at the setjmp() point that
RMATCH recorded there. */

#define RRETURN(ra)\
  {\
  heapframe *finished = frame;\
  frame = finished->Xprevframe;\
  (pcre_stack_free)(finished);\
  if (frame == NULL) return ra;\
  frame->Xresult = ra;\
  md->thisframe = frame;\
  longjmp(frame->Xwhere, 1);\
  }
5839
5840
5841 /* Structure for remembering the local variables in a private frame */
5842
5843 typedef struct heapframe {
5844 struct heapframe *Xprevframe;
5845
5846 /* Function arguments that may change */
5847
5848 const uschar *Xeptr;
5849 const uschar *Xecode;
5850 int Xoffset_top;
5851 long int Xims;
5852 eptrblock *Xeptrb;
5853 int Xflags;
5854
5855 /* Function local variables */
5856
5857 const uschar *Xcallpat;
5858 const uschar *Xcharptr;
5859 const uschar *Xdata;
5860 const uschar *Xnext;
5861 const uschar *Xpp;
5862 const uschar *Xprev;
5863 const uschar *Xsaved_eptr;
5864
5865 recursion_info Xnew_recursive;
5866
5867 BOOL Xcur_is_word;
5868 BOOL Xcondition;
5869 BOOL Xminimize;
5870 BOOL Xprev_is_word;
5871
5872 unsigned long int Xoriginal_ims;
5873
5874 #ifdef SUPPORT_UCP
5875 int Xprop_type;
5876 int Xprop_fail_result;
5877 int Xprop_category;
5878 int Xprop_chartype;
5879 int Xprop_othercase;
5880 int Xprop_test_against;
5881 int *Xprop_test_variable;
5882 #endif
5883
5884 int Xctype;
5885 int Xfc;
5886 int Xfi;
5887 int Xlength;
5888 int Xmax;
5889 int Xmin;
5890 int Xnumber;
5891 int Xoffset;
5892 int Xop;
5893 int Xsave_capture_last;
5894 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
5895 int Xstacksave[REC_STACK_SAVE_MAX];
5896
5897 eptrblock Xnewptrb;
5898
5899 /* Place to pass back result, and where to jump back to */
5900
5901 int Xresult;
5902 jmp_buf Xwhere;
5903
5904 } heapframe;
5905
5906 #endif
5907
5908
5909 /***************************************************************************
5910 ***************************************************************************/
5911
5912
5913
5914 /*************************************************
5915 * Match from current position *
5916 *************************************************/
5917
5918 /* On entry ecode points to the first opcode, and eptr to the first character
5919 in the subject string, while eptrb holds the value of eptr at the start of the
5920 last bracketed group - used for breaking infinite loops matching zero-length
5921 strings. This function is called recursively in many circumstances. Whenever it
5922 returns a negative (error) response, the outer incarnation must also return the
5923 same response.
5924
5925 Performance note: It might be tempting to extract commonly used fields from the
5926 md structure (e.g. utf8, end_subject) into individual variables to improve
5927 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5928 made performance worse.
5929
5930 Arguments:
5931 eptr pointer in subject
5932 ecode position in code
5933 offset_top current top pointer
5934 md pointer to "static" info for the match
5935 ims current /i, /m, and /s options
5936 eptrb pointer to chain of blocks containing eptr at start of
5937 brackets - for testing for empty matches
5938 flags can contain
5939 match_condassert - this is an assertion condition
5940 match_isgroup - this is the start of a bracketed group
5941
5942 Returns: MATCH_MATCH if matched ) these values are >= 0
5943 MATCH_NOMATCH if failed to match )
5944 a negative PCRE_ERROR_xxx value if aborted by an error condition
5945 (e.g. stopped by recursion limit)
5946 */
5947
5948 static int
5949 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5950 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5951 int flags)
5952 {
5953 /* These variables do not need to be preserved over recursion in this function,
5954 so they can be ordinary variables in all cases. Mark them with "register"
5955 because they are used a lot in loops. */
5956
5957 register int rrc; /* Returns from recursive calls */
5958 register int i; /* Used for loops not involving calls to RMATCH() */
5959 register int c; /* Character values not kept over RMATCH() calls */
5960
5961 /* When recursion is not being used, all "local" variables that have to be
5962 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5963 heap storage. Set up the top-level frame here; others are obtained from the
5964 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5965
5966 #ifdef NO_RECURSE
5967 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5968 frame->Xprevframe = NULL; /* Marks the top level */
5969
5970 /* Copy in the original argument variables */
5971
5972 frame->Xeptr = eptr;
5973 frame->Xecode = ecode;
5974 frame->Xoffset_top = offset_top;
5975 frame->Xims = ims;
5976 frame->Xeptrb = eptrb;
5977 frame->Xflags = flags;
5978
5979 /* This is where control jumps back to to effect "recursion" */
5980
5981 HEAP_RECURSE:
5982
5983 /* Macros make the argument variables come from the current frame */
5984
5985 #define eptr frame->Xeptr
5986 #define ecode frame->Xecode
5987 #define offset_top frame->Xoffset_top
5988 #define ims frame->Xims
5989 #define eptrb frame->Xeptrb
5990 #define flags frame->Xflags
5991
5992 /* Ditto for the local variables */
5993
5994 #ifdef SUPPORT_UTF8
5995 #define charptr frame->Xcharptr
5996 #endif
5997 #define callpat frame->Xcallpat
5998 #define data frame->Xdata
5999 #define next frame->Xnext
6000 #define pp frame->Xpp
6001 #define prev frame->Xprev
6002 #define saved_eptr frame->Xsaved_eptr
6003
6004 #define new_recursive frame->Xnew_recursive
6005
6006 #define cur_is_word frame->Xcur_is_word
6007 #define condition frame->Xcondition
6008 #define minimize frame->Xminimize
6009 #define prev_is_word frame->Xprev_is_word
6010
6011 #define original_ims frame->Xoriginal_ims
6012
6013 #ifdef SUPPORT_UCP
6014 #define prop_type frame->Xprop_type
6015 #define prop_fail_result frame->Xprop_fail_result
6016 #define prop_category frame->Xprop_category
6017 #define prop_chartype frame->Xprop_chartype
6018 #define prop_othercase frame->Xprop_othercase
6019 #define prop_test_against frame->Xprop_test_against
6020 #define prop_test_variable frame->Xprop_test_variable
6021 #endif
6022
6023 #define ctype frame->Xctype
6024 #define fc frame->Xfc
6025 #define fi frame->Xfi
6026 #define length frame->Xlength
6027 #define max frame->Xmax
6028 #define min frame->Xmin
6029 #define number frame->Xnumber
6030 #define offset frame->Xoffset
6031 #define op frame->Xop
6032 #define save_capture_last frame->Xsave_capture_last
6033 #define save_offset1 frame->Xsave_offset1
6034 #define save_offset2 frame->Xsave_offset2
6035 #define save_offset3 frame->Xsave_offset3
6036 #define stacksave frame->Xstacksave
6037
6038 #define newptrb frame->Xnewptrb
6039
6040 /* When recursion is being used, local variables are allocated on the stack and
6041 get preserved during recursion in the normal way. In this environment, fi and
6042 i, and fc and c, can be the same variables. */
6043
6044 #else
6045 #define fi i
6046 #define fc c
6047
6048
6049 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
6050 const uschar *charptr; /* small blocks of the code. My normal */
6051 #endif /* style of coding would have declared */
6052 const uschar *callpat; /* them within each of those blocks. */
6053 const uschar *data; /* However, in order to accommodate the */
6054 const uschar *next; /* version of this code that uses an */
6055 const uschar *pp; /* external "stack" implemented on the */
6056 const uschar *prev; /* heap, it is easier to declare them */
6057 const uschar *saved_eptr; /* all here, so the declarations can */
6058 /* be cut out in a block. The only */
6059 recursion_info new_recursive; /* declarations within blocks below are */
6060 /* for variables that do not have to */
6061 BOOL cur_is_word; /* be preserved over a recursive call */
6062 BOOL condition; /* to RMATCH(). */
6063 BOOL minimize;
6064 BOOL prev_is_word;
6065
6066 unsigned long int original_ims;
6067
6068 #ifdef SUPPORT_UCP
6069 int prop_type;
6070 int prop_fail_result;
6071 int prop_category;
6072 int prop_chartype;
6073 int prop_othercase;
6074 int prop_test_against;
6075 int *prop_test_variable;
6076 #endif
6077
6078 int ctype;
6079 int length;
6080 int max;
6081 int min;
6082 int number;
6083 int offset;
6084 int op;
6085 int save_capture_last;
6086 int save_offset1, save_offset2, save_offset3;
6087 int stacksave[REC_STACK_SAVE_MAX];
6088
6089 eptrblock newptrb;
6090 #endif
6091
6092 /* These statements are here to stop the compiler complaining about uninitialized
6093 variables. */
6094
6095 #ifdef SUPPORT_UCP
6096 prop_fail_result = 0;
6097 prop_test_against = 0;
6098 prop_test_variable = NULL;
6099 #endif
6100
6101 /* OK, now we can get on with the real code of the function. Recursion is
6102 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6103 these just turn into a recursive call to match() and a "return", respectively.
6104 However, RMATCH isn't like a function call because it's quite a complicated
6105 macro. It has to be used in one particular way. This shouldn't, however, impact
6106 performance when true recursion is being used. */
6107
6108 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6109
6110 original_ims = ims; /* Save for resetting on ')' */
6111
6112 /* At the start of a bracketed group, add the current subject pointer to the
6113 stack of such pointers, to be re-instated at the end of the group when we hit
6114 the closing ket. When match() is called in other circumstances, we don't add to
6115 this stack. */
6116
6117 if ((flags & match_isgroup) != 0)
6118 {
6119 newptrb.epb_prev = eptrb;
6120 newptrb.epb_saved_eptr = eptr;
6121 eptrb = &newptrb;
6122 }
6123
6124 /* Now start processing the operations. */
6125
6126 for (;;)
6127 {
6128 op = *ecode;
6129 minimize = FALSE;
6130
6131 /* For partial matching, remember if we ever hit the end of the subject after
6132 matching at least one subject character. */
6133
6134 if (md->partial &&
6135 eptr >= md->end_subject &&
6136 eptr > md->start_match)
6137 md->hitend = TRUE;
6138
6139 /* Opening capturing bracket. If there is space in the offset vector, save
6140 the current subject position in the working slot at the top of the vector. We
6141 mustn't change the current values of the data slot, because they may be set
6142 from a previous iteration of this group, and be referred to by a reference
6143 inside the group.
6144
6145 If the bracket fails to match, we need to restore this value and also the
6146 values of the final offsets, in case they were set by a previous iteration of
6147 the same bracket.
6148
6149 If there isn't enough space in the offset vector, treat this as if it were a
6150 non-capturing bracket. Don't worry about setting the flag for the error case
6151 here; that is handled in the code for KET. */
6152
6153 if (op > OP_BRA)
6154 {
6155 number = op - OP_BRA;
6156
6157 /* For extended extraction brackets (large number), we have to fish out the
6158 number from a dummy opcode at the start. */
6159
6160 if (number > EXTRACT_BASIC_MAX)
6161 number = GET2(ecode, 2+LINK_SIZE);
6162 offset = number << 1;
6163
6164 #ifdef DEBUG
6165 printf("start bracket %d subject=", number);
6166 pchars(eptr, 16, TRUE, md);
6167 printf("\n");
6168 #endif
6169
6170 if (offset < md->offset_max)
6171 {
6172 save_offset1 = md->offset_vector[offset];
6173 save_offset2 = md->offset_vector[offset+1];
6174 save_offset3 = md->offset_vector[md->offset_end - number];
6175 save_capture_last = md->capture_last;
6176
6177 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6178 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6179
6180 do
6181 {
6182 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6183 match_isgroup);
6184 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6185 md->capture_last = save_capture_last;
6186 ecode += GET(ecode, 1);
6187 }
6188 while (*ecode == OP_ALT);
6189
6190 DPRINTF(("bracket %d failed\n", number));
6191
6192 md->offset_vector[offset] = save_offset1;
6193 md->offset_vector[offset+1] = save_offset2;
6194 md->offset_vector[md->offset_end - number] = save_offset3;
6195
6196 RRETURN(MATCH_NOMATCH);
6197 }
6198
6199 /* Insufficient room for saving captured contents */
6200
6201 else op = OP_BRA;
6202 }
6203
6204 /* Other types of node can be handled by a switch */
6205
6206 switch(op)
6207 {
6208 case OP_BRA: /* Non-capturing bracket: optimized */
6209 DPRINTF(("start bracket 0\n"));
6210 do
6211 {
6212 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6213 match_isgroup);
6214 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6215 ecode += GET(ecode, 1);
6216 }
6217 while (*ecode == OP_ALT);
6218 DPRINTF(("bracket 0 failed\n"));
6219 RRETURN(MATCH_NOMATCH);
6220
6221 /* Conditional group: compilation checked that there are no more than
6222 two branches. If the condition is false, skipping the first branch takes us
6223 past the end if there is only one branch, but that's OK because that is
6224 exactly what going to the ket would do. */
6225
6226 case OP_COND:
6227 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6228 {
6229 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6230 condition = (offset == CREF_RECURSE * 2)?
6231 (md->recursive != NULL) :
6232 (offset < offset_top && md->offset_vector[offset] >= 0);
6233 RMATCH(rrc, eptr, ecode + (condition?
6234 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6235 offset_top, md, ims, eptrb, match_isgroup);
6236 RRETURN(rrc);
6237 }
6238
6239 /* The condition is an assertion. Call match() to evaluate it - setting the
6240 match_condassert flag causes it to stop at the end of the assertion. */
6241
6242 else
6243 {
6244 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6245 match_condassert | match_isgroup);
6246 if (rrc == MATCH_MATCH)
6247 {
6248 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6249 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6250 }
6251 else if (rrc != MATCH_NOMATCH)
6252 {
6253 RRETURN(rrc); /* Need braces because of following else */
6254 }
6255 else ecode += GET(ecode, 1);
6256 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6257 match_isgroup);
6258 RRETURN(rrc);
6259 }
6260 /* Control never reaches here */
6261
6262 /* Skip over conditional reference or large extraction number data if
6263 encountered. */
6264
6265 case OP_CREF:
6266 case OP_BRANUMBER:
6267 ecode += 3;
6268 break;
6269
6270 /* End of the pattern. If we are in a recursion, we should restore the
6271 offsets appropriately and continue from after the call. */
6272
6273 case OP_END:
6274 if (md->recursive != NULL && md->recursive->group_num == 0)
6275 {
6276 recursion_info *rec = md->recursive;
6277 DPRINTF(("Hit the end in a (?0) recursion\n"));
6278 md->recursive = rec->prevrec;
6279 memmove(md->offset_vector, rec->offset_save,
6280 rec->saved_max * sizeof(int));
6281 md->start_match = rec->save_start;
6282 ims = original_ims;
6283 ecode = rec->after_call;
6284 break;
6285 }
6286
6287 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6288 string - backtracking will then try other alternatives, if any. */
6289
6290 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6291 md->end_match_ptr = eptr; /* Record where we ended */
6292 md->end_offset_top = offset_top; /* and how many extracts were taken */
6293 RRETURN(MATCH_MATCH);
6294
6295 /* Change option settings */
6296
6297 case OP_OPT:
6298 ims = ecode[1];
6299 ecode += 2;
6300 DPRINTF(("ims set to %02lx\n", ims));
6301 break;
6302
6303 /* Assertion brackets. Check the alternative branches in turn - the
6304 matching won't pass the KET for an assertion. If any one branch matches,
6305 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6306 start of each branch to move the current point backwards, so the code at
6307 this level is identical to the lookahead case. */
6308
6309 case OP_ASSERT:
6310 case OP_ASSERTBACK:
6311 do
6312 {
6313 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6314 match_isgroup);
6315 if (rrc == MATCH_MATCH) break;
6316 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6317 ecode += GET(ecode, 1);
6318 }
6319 while (*ecode == OP_ALT);
6320 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6321
6322 /* If checking an assertion for a condition, return MATCH_MATCH. */
6323
6324 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6325
6326 /* Continue from after the assertion, updating the offsets high water
6327 mark, since extracts may have been taken during the assertion. */
6328
6329 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6330 ecode += 1 + LINK_SIZE;
6331 offset_top = md->end_offset_top;
6332 continue;
6333
6334 /* Negative assertion: all branches must fail to match */
6335
6336 case OP_ASSERT_NOT:
6337 case OP_ASSERTBACK_NOT:
6338 do
6339 {
6340 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6341 match_isgroup);
6342 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6343 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6344 ecode += GET(ecode,1);
6345 }
6346 while (*ecode == OP_ALT);
6347
6348 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6349
6350 ecode += 1 + LINK_SIZE;
6351 continue;
6352
6353 /* Move the subject pointer back. This occurs only at the start of
6354 each branch of a lookbehind assertion. If we are too close to the start to
6355 move back, this match function fails. When working with UTF-8 we move
6356 back a number of characters, not bytes. */
6357
6358 case OP_REVERSE:
6359 #ifdef SUPPORT_UTF8
6360 if (md->utf8)
6361 {
6362 c = GET(ecode,1);
6363 for (i = 0; i < c; i++)
6364 {
6365 eptr--;
6366 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6367 BACKCHAR(eptr)
6368 }
6369 }
6370 else
6371 #endif
6372
6373 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6374
6375 {
6376 eptr -= GET(ecode,1);
6377 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6378 }
6379
6380 /* Skip to next op code */
6381
6382 ecode += 1 + LINK_SIZE;
6383 break;
6384
6385 /* The callout item calls an external function, if one is provided, passing
6386 details of the match so far. This is mainly for debugging, though the
6387 function is able to force a failure. */
6388
6389 case OP_CALLOUT:
6390 if (pcre_callout != NULL)
6391 {
6392 pcre_callout_block cb;
6393 cb.version = 1; /* Version 1 of the callout block */
6394 cb.callout_number = ecode[1];
6395 cb.offset_vector = md->offset_vector;
6396 cb.subject = (const char *)md->start_subject;
6397 cb.subject_length = md->end_subject - md->start_subject;
6398 cb.start_match = md->start_match - md->start_subject;
6399 cb.current_position = eptr - md->start_subject;
6400 cb.pattern_position = GET(ecode, 2);
6401 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6402 cb.capture_top = offset_top/2;
6403 cb.capture_last = md->capture_last;
6404 cb.callout_data = md->callout_data;
6405 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6406 if (rrc < 0) RRETURN(rrc);
6407 }
6408 ecode += 2 + 2*LINK_SIZE;
6409 break;
6410
6411 /* Recursion either matches the current regex, or some subexpression. The
6412 offset data is the offset to the starting bracket from the start of the
6413 whole pattern. (This is so that it works from duplicated subpatterns.)
6414
6415 If there are any capturing brackets started but not finished, we have to
6416 save their starting points and reinstate them after the recursion. However,
6417 we don't know how many such there are (offset_top records the completed
6418 total) so we just have to save all the potential data. There may be up to
6419 65535 such values, which is too large to put on the stack, but using malloc
6420 for small numbers seems expensive. As a compromise, the stack is used when
6421 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6422 is used. If the malloc fails, matching is abandoned at this point and
6423 PCRE_ERROR_NOMEMORY is returned, so no attempt is made to carry on with only
6424 part of the offset data saved.
6425
6426 There are also other values that have to be saved. We use a chained
6427 sequence of blocks that actually live on the stack. Thanks to Robin Houston
6428 for the original version of this logic. */
6429
6430 case OP_RECURSE:
6431 {
6432 callpat = md->start_code + GET(ecode, 1);
6433 new_recursive.group_num = *callpat - OP_BRA;
6434
6435 /* For extended extraction brackets (large number), we have to fish out
6436 the number from a dummy opcode at the start. */
6437
6438 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6439 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6440
6441 /* Add to "recursing stack" */
6442
6443 new_recursive.prevrec = md->recursive;
6444 md->recursive = &new_recursive;
6445
6446 /* Find where to continue from afterwards */
6447
6448 ecode += 1 + LINK_SIZE;
6449 new_recursive.after_call = ecode;
6450
6451 /* Now save the offset data. */
6452
6453 new_recursive.saved_max = md->offset_end;
6454 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6455 new_recursive.offset_save = stacksave;
6456 else
6457 {
6458 new_recursive.offset_save =
6459 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6460 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6461 }
6462
6463 memcpy(new_recursive.offset_save, md->offset_vector,
6464 new_recursive.saved_max * sizeof(int));
6465 new_recursive.save_start = md->start_match;
6466 md->start_match = eptr;
6467
6468 /* OK, now we can do the recursion. For each top-level alternative we
6469 restore the offset and recursion data. */
6470
6471 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6472 do
6473 {
6474 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6475 eptrb, match_isgroup);
6476 if (rrc == MATCH_MATCH)
6477 {
6478 md->recursive = new_recursive.prevrec;
6479 if (new_recursive.offset_save != stacksave)
6480 (pcre_free)(new_recursive.offset_save);
6481 RRETURN(MATCH_MATCH);
6482 }
6483 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6484
6485 md->recursive = &new_recursive;
6486 memcpy(md->offset_vector, new_recursive.offset_save,
6487 new_recursive.saved_max * sizeof(int));
6488 callpat += GET(callpat, 1);
6489 }
6490 while (*callpat == OP_ALT);
6491
6492 DPRINTF(("Recursion didn't match\n"));
6493 md->recursive = new_recursive.prevrec;
6494 if (new_recursive.offset_save != stacksave)
6495 (pcre_free)(new_recursive.offset_save);
6496 RRETURN(MATCH_NOMATCH);
6497 }
6498 /* Control never reaches here */
6499
6500 /* "Once" brackets are like assertion brackets except that after a match,
6501 the point in the subject string is not moved back. Thus there can never be
6502 a move back into the brackets. Friedl calls these "atomic" subpatterns.
6503 Check the alternative branches in turn - the matching won't pass the KET
6504 for this kind of subpattern. If any one branch matches, we carry on as at
6505 the end of a normal bracket, leaving the subject pointer. */
6506
6507 case OP_ONCE:
6508 {
6509 prev = ecode;
6510 saved_eptr = eptr;
6511
6512 do
6513 {
6514 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6515 eptrb, match_isgroup);
6516 if (rrc == MATCH_MATCH) break;
6517 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6518 ecode += GET(ecode,1);
6519 }
6520 while (*ecode == OP_ALT);
6521
6522 /* If we hit the end of the group (which could be repeated), fail. */
6523
6524 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6525
6526 /* Continue as from after the assertion, updating the offsets high water
6527 mark, since extracts may have been taken. */
6528
6529 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6530
6531 offset_top = md->end_offset_top;
6532 eptr = md->end_match_ptr;
6533
6534 /* For a non-repeating ket, just continue at this level. This also
6535 happens for a repeating ket if no characters were matched in the group.
6536 This is the forcible breaking of infinite loops as implemented in Perl
6537 5.005. If there is an options reset, it will get obeyed in the normal
6538 course of events. */
6539
6540 if (*ecode == OP_KET || eptr == saved_eptr)
6541 {
6542 ecode += 1+LINK_SIZE;
6543 break;
6544 }
6545
6546 /* The repeating kets try the rest of the pattern or restart from the
6547 preceding bracket, in the appropriate order. We need to reset any options
6548 that changed within the bracket before re-running it, so check the next
6549 opcode. */
6550
6551 if (ecode[1+LINK_SIZE] == OP_OPT)
6552 {
6553 ims = (ims & ~PCRE_IMS) | ecode[4];
6554 DPRINTF(("ims set to %02lx at group repeat\n", ims));
6555 }
6556
6557 if (*ecode == OP_KETRMIN)
6558 {
6559 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6561 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6562 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6563 }
6564 else /* OP_KETRMAX */
6565 {
6566 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6567 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6568 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6569 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6570 }
6571 }
6572 RRETURN(MATCH_NOMATCH);
6573
6574 /* An alternation is the end of a branch; scan along to find the end of the
6575 bracketed group and go to there. */
6576
6577 case OP_ALT:
6578 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6579 break;
6580
6581 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6582 that it may occur zero times. It may repeat infinitely, or not at all -
6583 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6584 repeat limits are compiled as a number of copies, with the optional ones
6585 preceded by BRAZERO or BRAMINZERO. */
6586
6587 case OP_BRAZERO:
6588 {
6589 next = ecode+1;
6590 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6592 do next += GET(next,1); while (*next == OP_ALT);
6593 ecode = next + 1+LINK_SIZE;
6594 }
6595 break;
6596
6597 case OP_BRAMINZERO:
6598 {
6599 next = ecode+1;
6600 do next += GET(next,1); while (*next == OP_ALT);
6601 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6602 match_isgroup);
6603 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6604 ecode++;
6605 }
6606 break;
6607
6608 /* End of a group, repeated or non-repeating. If we are at the end of
6609 an assertion "group", stop matching and return MATCH_MATCH, but record the
6610 current high water mark for use by positive assertions. Do this also
6611 for the "once" (not backing up) groups. */
6612
6613 case OP_KET:
6614 case OP_KETRMIN:
6615 case OP_KETRMAX:
6616 {
6617 prev = ecode - GET(ecode, 1);
6618 saved_eptr = eptrb->epb_saved_eptr;
6619
6620 /* Back up the stack of bracket start pointers. */
6621
6622 eptrb = eptrb->epb_prev;
6623
6624 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6625 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6626 *prev == OP_ONCE)
6627 {
6628 md->end_match_ptr = eptr; /* For ONCE */
6629 md->end_offset_top = offset_top;
6630 RRETURN(MATCH_MATCH);
6631 }
6632
6633 /* In all other cases except a conditional group we have to check the
6634 group number back at the start and if necessary complete handling an
6635 extraction by setting the offsets and bumping the high water mark. */
6636
6637 if (*prev != OP_COND)
6638 {
6639 number = *prev - OP_BRA;
6640
6641 /* For extended extraction brackets (large number), we have to fish out
6642 the number from a dummy opcode at the start. */
6643
6644 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6645 offset = number << 1;
6646
6647 #ifdef DEBUG
6648 printf("end bracket %d", number);
6649 printf("\n");
6650 #endif
6651
6652 /* Test for a numbered group. This includes groups called as a result
6653 of recursion. Note that whole-pattern recursion is coded as a recurse
6654 into group 0, so it won't be picked up here. Instead, we catch it when
6655 the OP_END is reached. */
6656
6657 if (number > 0)
6658 {
6659 md->capture_last = number;
6660 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6661 {
6662 md->offset_vector[offset] =
6663 md->offset_vector[md->offset_end - number];
6664 md->offset_vector[offset+1] = eptr - md->start_subject;
6665 if (offset_top <= offset) offset_top = offset + 2;
6666 }
6667
6668 /* Handle a recursively called group. Restore the offsets
6669 appropriately and continue from after the call. */
6670
6671 if (md->recursive != NULL && md->recursive->group_num == number)
6672 {
6673 recursion_info *rec = md->recursive;
6674 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6675 md->recursive = rec->prevrec;
6676 md->start_match = rec->save_start;
6677 memcpy(md->offset_vector, rec->offset_save,
6678 rec->saved_max * sizeof(int));
6679 ecode = rec->after_call;
6680 ims = original_ims;
6681 break;
6682 }
6683 }
6684 }
6685
6686 /* Reset the value of the ims flags, in case they got changed during
6687 the group. */
6688
6689 ims = original_ims;
6690 DPRINTF(("ims reset to %02lx\n", ims));
6691
6692 /* For a non-repeating ket, just continue at this level. This also
6693 happens for a repeating ket if no characters were matched in the group.
6694 This is the forcible breaking of infinite loops as implemented in Perl
6695 5.005. If there is an options reset, it will get obeyed in the normal
6696 course of events. */
6697
6698 if (*ecode == OP_KET || eptr == saved_eptr)
6699 {
6700 ecode += 1 + LINK_SIZE;
6701 break;
6702 }
6703
6704 /* The repeating kets try the rest of the pattern or restart from the
6705 preceding bracket, in the appropriate order. */
6706
6707 if (*ecode == OP_KETRMIN)
6708 {
6709 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6711 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6712 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6713 }
6714 else /* OP_KETRMAX */
6715 {
6716 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6717 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6718 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6720 }
6721 }
6722
6723 RRETURN(MATCH_NOMATCH);
6724
6725 /* Start of subject unless notbol, or after internal newline if multiline */
6726
6727 case OP_CIRC:
6728 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6729 if ((ims & PCRE_MULTILINE) != 0)
6730 {
6731 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6732 RRETURN(MATCH_NOMATCH);
6733 ecode++;
6734 break;
6735 }
6736 /* ... else fall through */
6737
6738 /* Start of subject assertion */
6739
6740 case OP_SOD:
6741 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6742 ecode++;
6743 break;
6744
6745 /* Start of match assertion */
6746
6747 case OP_SOM:
6748 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6749 ecode++;
6750 break;
6751
6752 /* Assert before internal newline if multiline, or before a terminating
6753 newline unless endonly is set, else end of subject unless noteol is set. */
6754
6755 case OP_DOLL:
6756 if ((ims & PCRE_MULTILINE) != 0)
6757 {
6758 if (eptr < md->end_subject)
6759 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6760 else
6761 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6762 ecode++;
6763 break;
6764 }
6765 else
6766 {
6767 if (md->noteol) RRETURN(MATCH_NOMATCH);
6768 if (!md->endonly)
6769 {
6770 if (eptr < md->end_subject - 1 ||
6771 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6772 RRETURN(MATCH_NOMATCH);
6773 ecode++;
6774 break;
6775 }
6776 }
6777 /* ... else fall through */
6778
6779 /* End of subject assertion (\z) */
6780
6781 case OP_EOD:
6782 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6783 ecode++;
6784 break;
6785
6786 /* End of subject or ending \n assertion (\Z) */
6787
6788 case OP_EODN:
6789 if (eptr < md->end_subject - 1 ||
6790 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6791 ecode++;
6792 break;
6793
6794 /* Word boundary assertions */
6795
6796 case OP_NOT_WORD_BOUNDARY:
6797 case OP_WORD_BOUNDARY:
6798 {
6799
6800 /* Find out if the previous and current characters are "word" characters.
6801 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6802 be "non-word" characters. */
6803
6804 #ifdef SUPPORT_UTF8
6805 if (md->utf8)
6806 {
6807 if (eptr == md->start_subject) prev_is_word = FALSE; else
6808 {
6809 const uschar *lastptr = eptr - 1;
6810 while((*lastptr & 0xc0) == 0x80) lastptr--;
6811 GETCHAR(c, lastptr);
6812 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6813 }
6814 if (eptr >= md->end_subject) cur_is_word = FALSE; else
6815 {
6816 GETCHAR(c, eptr);
6817 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6818 }
6819 }
6820 else
6821 #endif
6822
6823 /* More streamlined when not in UTF-8 mode */
6824
6825 {
6826 prev_is_word = (eptr != md->start_subject) &&
6827 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6828 cur_is_word = (eptr < md->end_subject) &&
6829 ((md->ctypes[*eptr] & ctype_word) != 0);
6830 }
6831
6832 /* Now see if the situation is what we want */
6833
6834 if ((*ecode++ == OP_WORD_BOUNDARY)?
6835 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6836 RRETURN(MATCH_NOMATCH);
6837 }
6838 break;
6839
6840 /* Match a single character type; inline for speed */
6841
6842 case OP_ANY:
6843 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6844 RRETURN(MATCH_NOMATCH);
6845 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6846 #ifdef SUPPORT_UTF8
6847 if (md->utf8)
6848 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6849 #endif
6850 ecode++;
6851 break;
6852
6853 /* Match a single byte, even in UTF-8 mode. This opcode really does match
6854 any byte, even newline, independent of the setting of PCRE_DOTALL. */
6855
6856 case OP_ANYBYTE:
6857 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6858 ecode++;
6859 break;
6860
6861 case OP_NOT_DIGIT:
6862 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6863 GETCHARINCTEST(c, eptr);
6864 if (
6865 #ifdef SUPPORT_UTF8
6866 c < 256 &&
6867 #endif
6868 (md->ctypes[c] & ctype_digit) != 0
6869 )
6870 RRETURN(MATCH_NOMATCH);
6871 ecode++;
6872 break;
6873
6874 case OP_DIGIT:
6875 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6876 GETCHARINCTEST(c, eptr);
6877 if (
6878 #ifdef SUPPORT_UTF8
6879 c >= 256 ||
6880 #endif
6881 (md->ctypes[c] & ctype_digit) == 0
6882 )
6883 RRETURN(MATCH_NOMATCH);
6884 ecode++;
6885 break;
6886
6887 case OP_NOT_WHITESPACE:
6888 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6889 GETCHARINCTEST(c, eptr);
6890 if (
6891 #ifdef SUPPORT_UTF8
6892 c < 256 &&
6893 #endif
6894 (md->ctypes[c] & ctype_space) != 0
6895 )
6896 RRETURN(MATCH_NOMATCH);
6897 ecode++;
6898 break;
6899
6900 case OP_WHITESPACE:
6901 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6902 GETCHARINCTEST(c, eptr);
6903 if (
6904 #ifdef SUPPORT_UTF8
6905 c >= 256 ||
6906 #endif
6907 (md->ctypes[c] & ctype_space) == 0
6908 )
6909 RRETURN(MATCH_NOMATCH);
6910 ecode++;
6911 break;
6912
6913 case OP_NOT_WORDCHAR:
6914 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6915 GETCHARINCTEST(c, eptr);
6916 if (
6917 #ifdef SUPPORT_UTF8
6918 c < 256 &&
6919 #endif
6920 (md->ctypes[c] & ctype_word) != 0
6921 )
6922 RRETURN(MATCH_NOMATCH);
6923 ecode++;
6924 break;
6925
6926 case OP_WORDCHAR:
6927 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6928 GETCHARINCTEST(c, eptr);
6929 if (
6930 #ifdef SUPPORT_UTF8
6931 c >= 256 ||
6932 #endif
6933 (md->ctypes[c] & ctype_word) == 0
6934 )
6935 RRETURN(MATCH_NOMATCH);
6936 ecode++;
6937 break;
6938
6939 #ifdef SUPPORT_UCP
6940 /* Check the next character by Unicode property. We will get here only
6941 if the support is in the binary; otherwise a compile-time error occurs. */
6942
6943 case OP_PROP:
6944 case OP_NOTPROP:
6945 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6946 GETCHARINCTEST(c, eptr);
6947 {
6948 int chartype, rqdtype;
6949 int othercase;
6950 int category = ucp_findchar(c, &chartype, &othercase);
6951
6952 rqdtype = *(++ecode);
6953 ecode++;
6954
6955 if (rqdtype >= 128)
6956 {
6957 if ((rqdtype - 128 != category) == (op == OP_PROP))
6958 RRETURN(MATCH_NOMATCH);
6959 }
6960 else
6961 {
6962 if ((rqdtype != chartype) == (op == OP_PROP))
6963 RRETURN(MATCH_NOMATCH);
6964 }
6965 }
6966 break;
6967
6968 /* Match an extended Unicode sequence. We will get here only if the support
6969 is in the binary; otherwise a compile-time error occurs. */
6970
6971 case OP_EXTUNI:
6972 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6973 GETCHARINCTEST(c, eptr);
6974 {
6975 int chartype;
6976 int othercase;
6977 int category = ucp_findchar(c, &chartype, &othercase);
6978 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6979 while (eptr < md->end_subject)
6980 {
6981 int len = 1;
6982 if (!md->utf8) c = *eptr; else
6983 {
6984 GETCHARLEN(c, eptr, len);
6985 }
6986 category = ucp_findchar(c, &chartype, &othercase);
6987 if (category != ucp_M) break;
6988 eptr += len;
6989 }
6990 }
6991 ecode++;
6992 break;
6993 #endif
6994
6995
6996 /* Match a back reference, possibly repeatedly. Look past the end of the
6997 item to see if there is repeat information following. The code is similar
6998 to that for character classes, but repeated for efficiency. Then obey
6999 similar code to character type repeats - written out again for speed.
7000 However, if the referenced string is the empty string, always treat
7001 it as matched, any number of times (otherwise there could be infinite
7002 loops). */
7003
7004 case OP_REF:
7005 {
7006 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
7007 ecode += 3; /* Advance past item */
7008
7009 /* If the reference is unset, set the length to be longer than the amount
7010 of subject left; this ensures that every attempt at a match fails. We
7011 can't just fail here, because of the possibility of quantifiers with zero
7012 minima. */
7013
7014 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7015 md->end_subject - eptr + 1 :
7016 md->offset_vector[offset+1] - md->offset_vector[offset];
7017
7018 /* Set up for repetition, or handle the non-repeated case */
7019
7020 switch (*ecode)
7021 {
7022 case OP_CRSTAR:
7023 case OP_CRMINSTAR:
7024 case OP_CRPLUS:
7025 case OP_CRMINPLUS:
7026 case OP_CRQUERY:
7027 case OP_CRMINQUERY:
7028 c = *ecode++ - OP_CRSTAR;
7029 minimize = (c & 1) != 0;
7030 min = rep_min[c]; /* Pick up values from tables; */
7031 max = rep_max[c]; /* zero for max => infinity */
7032 if (max == 0) max = INT_MAX;
7033 break;
7034
7035 case OP_CRRANGE:
7036 case OP_CRMINRANGE:
7037 minimize = (*ecode == OP_CRMINRANGE);
7038 min = GET2(ecode, 1);
7039 max = GET2(ecode, 3);
7040 if (max == 0) max = INT_MAX;
7041 ecode += 5;
7042 break;
7043
7044 default: /* No repeat follows */
7045 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7046 eptr += length;
7047 continue; /* With the main loop */
7048 }
7049
7050 /* If the length of the reference is zero, just continue with the
7051 main loop. */
7052
7053 if (length == 0) continue;
7054
7055 /* First, ensure the minimum number of matches are present. We get back
7056 the length of the reference string explicitly rather than passing the
7057 address of eptr, so that eptr can be a register variable. */
7058
7059 for (i = 1; i <= min; i++)
7060 {
7061 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7062 eptr += length;
7063 }
7064
7065 /* If min = max, continue at the same level without recursion.
7066 They are not both allowed to be zero. */
7067
7068 if (min == max) continue;
7069
7070 /* If minimizing, keep trying and advancing the pointer */
7071
7072 if (minimize)
7073 {
7074 for (fi = min;; fi++)
7075 {
7076 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7077 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7078 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7079 RRETURN(MATCH_NOMATCH);
7080 eptr += length;
7081 }
7082 /* Control never gets here */
7083 }
7084
7085 /* If maximizing, find the longest string and work backwards */
7086
7087 else
7088 {
7089 pp = eptr;
7090 for (i = min; i < max; i++)
7091 {
7092 if (!match_ref(offset, eptr, length, md, ims)) break;
7093 eptr += length;
7094 }
7095 while (eptr >= pp)
7096 {
7097 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7098 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7099 eptr -= length;
7100 }
7101 RRETURN(MATCH_NOMATCH);
7102 }
7103 }
7104 /* Control never gets here */
7105
7106
7107
7108 /* Match a bit-mapped character class, possibly repeatedly. This op code is
7109 used when all the characters in the class have values in the range 0-255,
7110 and either the matching is caseful, or the characters are in the range
7111 0-127 when UTF-8 processing is enabled. The only difference between
7112 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7113 encountered.
7114
7115 First, look past the end of the item to see if there is repeat information
7116 following. Then obey similar code to character type repeats - written out
7117 again for speed. */
7118
7119 case OP_NCLASS:
7120 case OP_CLASS:
7121 {
7122 data = ecode + 1; /* Save for matching */
7123 ecode += 33; /* Advance past the item */
7124
7125 switch (*ecode)
7126 {
7127 case OP_CRSTAR:
7128 case OP_CRMINSTAR:
7129 case OP_CRPLUS:
7130 case OP_CRMINPLUS:
7131 case OP_CRQUERY:
7132 case OP_CRMINQUERY:
7133 c = *ecode++ - OP_CRSTAR;
7134 minimize = (c & 1) != 0;
7135 min = rep_min[c]; /* Pick up values from tables; */
7136 max = rep_max[c]; /* zero for max => infinity */
7137 if (max == 0) max = INT_MAX;
7138 break;
7139
7140 case OP_CRRANGE:
7141 case OP_CRMINRANGE:
7142 minimize = (*ecode == OP_CRMINRANGE);
7143 min = GET2(ecode, 1);
7144 max = GET2(ecode, 3);
7145 if (max == 0) max = INT_MAX;
7146 ecode += 5;
7147 break;
7148
7149 default: /* No repeat follows */
7150 min = max = 1;
7151 break;
7152 }
7153
7154 /* First, ensure the minimum number of matches are present. */
7155
7156 #ifdef SUPPORT_UTF8
7157 /* UTF-8 mode */
7158 if (md->utf8)
7159 {
7160 for (i = 1; i <= min; i++)
7161 {
7162 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7163 GETCHARINC(c, eptr);
7164 if (c > 255)
7165 {
7166 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7167 }
7168 else
7169 {
7170 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7171 }
7172 }
7173 }
7174 else
7175 #endif
7176 /* Not UTF-8 mode */
7177 {
7178 for (i = 1; i <= min; i++)
7179 {
7180 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7181 c = *eptr++;
7182 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7183 }
7184 }
7185
7186 /* If max == min we can continue with the main loop without the
7187 need to recurse. */
7188
7189 if (min == max) continue;
7190
7191 /* If minimizing, keep testing the rest of the expression and advancing
7192 the pointer while it matches the class. */
7193
7194 if (minimize)
7195 {
7196 #ifdef SUPPORT_UTF8
7197 /* UTF-8 mode */
7198 if (md->utf8)
7199 {
7200 for (fi = min;; fi++)
7201 {
7202 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7203 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7204 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7205 GETCHARINC(c, eptr);
7206 if (c > 255)
7207 {
7208 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7209 }
7210 else
7211 {
7212 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7213 }
7214 }
7215 }
7216 else
7217 #endif
7218 /* Not UTF-8 mode */
7219 {
7220 for (fi = min;; fi++)
7221 {
7222 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7223 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7224 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7225 c = *eptr++;
7226 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7227 }
7228 }
7229 /* Control never gets here */
7230 }
7231
7232 /* If maximizing, find the longest possible run, then work backwards. */
7233
7234 else
7235 {
7236 pp = eptr;
7237
7238 #ifdef SUPPORT_UTF8
7239 /* UTF-8 mode */
7240 if (md->utf8)
7241 {
7242 for (i = min; i < max; i++)
7243 {
7244 int len = 1;
7245 if (eptr >= md->end_subject) break;
7246 GETCHARLEN(c, eptr, len);
7247 if (c > 255)
7248 {
7249 if (op == OP_CLASS) break;
7250 }
7251 else
7252 {
7253 if ((data[c/8] & (1 << (c&7))) == 0) break;
7254 }
7255 eptr += len;
7256 }
7257 for (;;)
7258 {
7259 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7260 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7261 if (eptr-- == pp) break; /* Stop if tried at original pos */
7262 BACKCHAR(eptr);
7263 }
7264 }
7265 else
7266 #endif
7267 /* Not UTF-8 mode */
7268 {
7269 for (i = min; i < max; i++)
7270 {
7271 if (eptr >= md->end_subject) break;
7272 c = *eptr;
7273 if ((data[c/8] & (1 << (c&7))) == 0) break;
7274 eptr++;
7275 }
7276 while (eptr >= pp)
7277 {
7278 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7279 eptr--;
7280 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7281 }
7282 }
7283
7284 RRETURN(MATCH_NOMATCH);
7285 }
7286 }
7287 /* Control never gets here */
7288
7289
7290 /* Match an extended character class. This opcode is encountered only
7291 in UTF-8 mode, because that's the only time it is compiled. */
7292
7293 #ifdef SUPPORT_UTF8
7294 case OP_XCLASS:
7295 {
7296 data = ecode + 1 + LINK_SIZE; /* Save for matching */
7297 ecode += GET(ecode, 1); /* Advance past the item */
7298
7299 switch (*ecode)
7300 {
7301 case OP_CRSTAR:
7302 case OP_CRMINSTAR:
7303 case OP_CRPLUS:
7304 case OP_CRMINPLUS:
7305 case OP_CRQUERY:
7306 case OP_CRMINQUERY:
7307 c = *ecode++ - OP_CRSTAR;
7308 minimize = (c & 1) != 0;
7309 min = rep_min[c]; /* Pick up values from tables; */
7310 max = rep_max[c]; /* zero for max => infinity */
7311 if (max == 0) max = INT_MAX;
7312 break;
7313
7314 case OP_CRRANGE:
7315 case OP_CRMINRANGE:
7316 minimize = (*ecode == OP_CRMINRANGE);
7317 min = GET2(ecode, 1);
7318 max = GET2(ecode, 3);
7319 if (max == 0) max = INT_MAX;
7320 ecode += 5;
7321 break;
7322
7323 default: /* No repeat follows */
7324 min = max = 1;
7325 break;
7326 }
7327
7328 /* First, ensure the minimum number of matches are present. */
7329
7330 for (i = 1; i <= min; i++)
7331 {
7332 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7333 GETCHARINC(c, eptr);
7334 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7335 }
7336
7337 /* If max == min we can continue with the main loop without the
7338 need to recurse. */
7339
7340 if (min == max) continue;
7341
7342 /* If minimizing, keep testing the rest of the expression and advancing
7343 the pointer while it matches the class. */
7344
7345 if (minimize)
7346 {
7347 for (fi = min;; fi++)
7348 {
7349 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7350 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7351 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7352 GETCHARINC(c, eptr);
7353 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7354 }
7355 /* Control never gets here */
7356 }
7357
7358 /* If maximizing, find the longest possible run, then work backwards. */
7359
7360 else
7361 {
7362 pp = eptr;
7363 for (i = min; i < max; i++)
7364 {
7365 int len = 1;
7366 if (eptr >= md->end_subject) break;
7367 GETCHARLEN(c, eptr, len);
7368 if (!match_xclass(c, data)) break;
7369 eptr += len;
7370 }
7371 for(;;)
7372 {
7373 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7374 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7375 if (eptr-- == pp) break; /* Stop if tried at original pos */
7376 BACKCHAR(eptr)
7377 }
7378 RRETURN(MATCH_NOMATCH);
7379 }
7380
7381 /* Control never gets here */
7382 }
7383 #endif /* End of XCLASS */
7384
7385 /* Match a single character, casefully */
7386
7387 case OP_CHAR:
7388 #ifdef SUPPORT_UTF8
7389 if (md->utf8)
7390 {
7391 length = 1;
7392 ecode++;
7393 GETCHARLEN(fc, ecode, length);
7394 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7395 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7396 }
7397 else
7398 #endif
7399
7400 /* Non-UTF-8 mode */
7401 {
7402 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7403 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7404 ecode += 2;
7405 }
7406 break;
7407
7408 /* Match a single character, caselessly */
7409
7410 case OP_CHARNC:
7411 #ifdef SUPPORT_UTF8
7412 if (md->utf8)
7413 {
7414 length = 1;
7415 ecode++;
7416 GETCHARLEN(fc, ecode, length);
7417
7418 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7419
7420 /* If the pattern character's value is < 128, we have only one byte, and
7421 can use the fast lookup table. */
7422
7423 if (fc < 128)
7424 {
7425 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7426 }
7427
7428 /* Otherwise we must pick up the subject character */
7429
7430 else
7431 {
7432 int dc;
7433 GETCHARINC(dc, eptr);
7434 ecode += length;
7435
7436 /* If we have Unicode property support, we can use it to test the other
7437 case of the character, if there is one. The result of ucp_findchar() is
7438 < 0 if the char isn't found, and othercase is returned as zero if there
7439 isn't one. */
7440
7441 if (fc != dc)
7442 {
7443 #ifdef SUPPORT_UCP
7444 int chartype;
7445 int othercase;
7446 if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7447 #endif
7448 RRETURN(MATCH_NOMATCH);
7449 }
7450 }
7451 }
7452 else
7453 #endif /* SUPPORT_UTF8 */
7454
7455 /* Non-UTF-8 mode */
7456 {
7457 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7458 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7459 ecode += 2;
7460 }
7461 break;
7462
7463 /* Match a single character repeatedly; different opcodes share code. */
7464
7465 case OP_EXACT:
7466 min = max = GET2(ecode, 1);
7467 ecode += 3;
7468 goto REPEATCHAR;
7469
7470 case OP_UPTO:
7471 case OP_MINUPTO:
7472 min = 0;
7473 max = GET2(ecode, 1);
7474 minimize = *ecode == OP_MINUPTO;
7475 ecode += 3;
7476 goto REPEATCHAR;
7477
7478 case OP_STAR:
7479 case OP_MINSTAR:
7480 case OP_PLUS:
7481 case OP_MINPLUS:
7482 case OP_QUERY:
7483 case OP_MINQUERY:
7484 c = *ecode++ - OP_STAR;
7485 minimize = (c & 1) != 0;
7486 min = rep_min[c]; /* Pick up values from tables; */
7487 max = rep_max[c]; /* zero for max => infinity */
7488 if (max == 0) max = INT_MAX;
7489
7490 /* Common code for all repeated single-character matches. We can give
7491 up quickly if there are fewer than the minimum number of characters left in
7492 the subject. */
7493
7494 REPEATCHAR:
7495 #ifdef SUPPORT_UTF8
7496 if (md->utf8)
7497 {
7498 length = 1;
7499 charptr = ecode;
7500 GETCHARLEN(fc, ecode, length);
7501 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7502 ecode += length;
7503
7504 /* Handle multibyte character matching specially here. There is
7505 support for caseless matching if UCP support is present. */
7506
7507 if (length > 1)
7508 {
7509 int oclength = 0;
7510 uschar occhars[8];
7511
7512 #ifdef SUPPORT_UCP
7513 int othercase;
7514 int chartype;
7515 if ((ims & PCRE_CASELESS) != 0 &&
7516 ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7517 othercase > 0)
7518 oclength = ord2utf8(othercase, occhars);
7519 #endif /* SUPPORT_UCP */
7520
7521 for (i = 1; i <= min; i++)
7522 {
7523 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7524 /* Need braces because of following else */
7525 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7526 else
7527 {
7528 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7529 eptr += oclength;
7530 }
7531 }
7532
7533 if (min == max) continue;
7534
7535 if (minimize)
7536 {
7537 for (fi = min;; fi++)
7538 {
7539 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7540 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7541 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7542 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7543 /* Need braces because of following else */
7544 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7545 else
7546 {
7547 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7548 eptr += oclength;
7549 }
7550 }
7551 /* Control never gets here */
7552 }
7553 else
7554 {
7555 pp = eptr;
7556 for (i = min; i < max; i++)
7557 {
7558 if (eptr > md->end_subject - length) break;
7559 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7560 else if (oclength == 0) break;
7561 else
7562 {
7563 if (memcmp(eptr, occhars, oclength) != 0) break;
7564 eptr += oclength;
7565 }
7566 }
7567 while (eptr >= pp)
7568 {
7569 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7571 eptr -= length;
7572 }
7573 RRETURN(MATCH_NOMATCH);
7574 }
7575 /* Control never gets here */
7576 }
7577
7578 /* If the length of a UTF-8 character is 1, we fall through here, and
7579 obey the code as for non-UTF-8 characters below, though in this case the
7580 value of fc will always be < 128. */
7581 }
7582 else
7583 #endif /* SUPPORT_UTF8 */
7584
7585 /* When not in UTF-8 mode, load a single-byte character. */
7586 {
7587 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7588 fc = *ecode++;
7589 }
7590
7591 /* The value of fc at this point is always less than 256, though we may or
7592 may not be in UTF-8 mode. The code is duplicated for the caseless and
7593 caseful cases, for speed, since matching characters is likely to be quite
7594 common. First, ensure the minimum number of matches are present. If min =
7595 max, continue at the same level without recursing. Otherwise, if
7596 minimizing, keep trying the rest of the expression and advancing one
7597 matching character if failing, up to the maximum. Alternatively, if
7598 maximizing, find the maximum number of characters and work backwards. */
7599
7600 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7601 max, eptr));
7602
7603 if ((ims & PCRE_CASELESS) != 0)
7604 {
7605 fc = md->lcc[fc];
7606 for (i = 1; i <= min; i++)
7607 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7608 if (min == max) continue;
7609 if (minimize)
7610 {
7611 for (fi = min;; fi++)
7612 {
7613 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7615 if (fi >= max || eptr >= md->end_subject ||
7616 fc != md->lcc[*eptr++])
7617 RRETURN(MATCH_NOMATCH);
7618 }
7619 /* Control never gets here */
7620 }
7621 else
7622 {
7623 pp = eptr;
7624 for (i = min; i < max; i++)
7625 {
7626 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7627 eptr++;
7628 }
7629 while (eptr >= pp)
7630 {
7631 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7632 eptr--;
7633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7634 }
7635 RRETURN(MATCH_NOMATCH);
7636 }
7637 /* Control never gets here */
7638 }
7639
7640 /* Caseful comparisons (includes all multi-byte characters) */
7641
7642 else
7643 {
7644 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7645 if (min == max) continue;
7646 if (minimize)
7647 {
7648 for (fi = min;; fi++)
7649 {
7650 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7652 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7653 RRETURN(MATCH_NOMATCH);
7654 }
7655 /* Control never gets here */
7656 }
7657 else
7658 {
7659 pp = eptr;
7660 for (i = min; i < max; i++)
7661 {
7662 if (eptr >= md->end_subject || fc != *eptr) break;
7663 eptr++;
7664 }
7665 while (eptr >= pp)
7666 {
7667 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7668 eptr--;
7669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7670 }
7671 RRETURN(MATCH_NOMATCH);
7672 }
7673 }
7674 /* Control never gets here */
7675
7676 /* Match a negated single one-byte character. The character we are
7677 checking can be multibyte. */
7678
7679 case OP_NOT:
7680 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7681 ecode++;
7682 GETCHARINCTEST(c, eptr);
7683 if ((ims & PCRE_CASELESS) != 0)
7684 {
7685 #ifdef SUPPORT_UTF8
7686 if (c < 256)
7687 #endif
7688 c = md->lcc[c];
7689 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7690 }
7691 else
7692 {
7693 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7694 }
7695 break;
7696
7697 /* Match a negated single one-byte character repeatedly. This is almost a
7698 repeat of the code for a repeated single character, but I haven't found a
7699 nice way of commoning these up that doesn't require a test of the
7700 positive/negative option for each character match. Maybe that wouldn't add
7701 very much to the time taken, but character matching *is* what this is all
7702 about... */
7703
7704 case OP_NOTEXACT:
7705 min = max = GET2(ecode, 1);
7706 ecode += 3;
7707 goto REPEATNOTCHAR;
7708
7709 case OP_NOTUPTO:
7710 case OP_NOTMINUPTO:
7711 min = 0;
7712 max = GET2(ecode, 1);
7713 minimize = *ecode == OP_NOTMINUPTO;
7714 ecode += 3;
7715 goto REPEATNOTCHAR;
7716
7717 case OP_NOTSTAR:
7718 case OP_NOTMINSTAR:
7719 case OP_NOTPLUS:
7720 case OP_NOTMINPLUS:
7721 case OP_NOTQUERY:
7722 case OP_NOTMINQUERY:
7723 c = *ecode++ - OP_NOTSTAR;
7724 minimize = (c & 1) != 0;
7725 min = rep_min[c]; /* Pick up values from tables; */
7726 max = rep_max[c]; /* zero for max => infinity */
7727 if (max == 0) max = INT_MAX;
7728
7729 /* Common code for all repeated single-byte matches. We can give up quickly
7730 if there are fewer than the minimum number of bytes left in the
7731 subject. */
7732
7733 REPEATNOTCHAR:
7734 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7735 fc = *ecode++;
7736
7737 /* The code is duplicated for the caseless and caseful cases, for speed,
7738 since matching characters is likely to be quite common. First, ensure the
7739 minimum number of matches are present. If min = max, continue at the same
7740 level without recursing. Otherwise, if minimizing, keep trying the rest of
7741 the expression and advancing one matching character if failing, up to the
7742 maximum. Alternatively, if maximizing, find the maximum number of
7743 characters and work backwards. */
7744
7745 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7746 max, eptr));
7747
7748 if ((ims & PCRE_CASELESS) != 0)
7749 {
7750 fc = md->lcc[fc];
7751
7752 #ifdef SUPPORT_UTF8
7753 /* UTF-8 mode */
7754 if (md->utf8)
7755 {
7756 register int d;
7757 for (i = 1; i <= min; i++)
7758 {
7759 GETCHARINC(d, eptr);
7760 if (d < 256) d = md->lcc[d];
7761 if (fc == d) RRETURN(MATCH_NOMATCH);
7762 }
7763 }
7764 else
7765 #endif
7766
7767 /* Not UTF-8 mode */
7768 {
7769 for (i = 1; i <= min; i++)
7770 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7771 }
7772
7773 if (min == max) continue;
7774
7775 if (minimize)
7776 {
7777 #ifdef SUPPORT_UTF8
7778 /* UTF-8 mode */
7779 if (md->utf8)
7780 {
7781 register int d;
7782 for (fi = min;; fi++)
7783 {
7784 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7786 GETCHARINC(d, eptr);
7787 if (d < 256) d = md->lcc[d];
7788 if (fi >= max || eptr >= md->end_subject || fc == d)
7789 RRETURN(MATCH_NOMATCH);
7790 }
7791 }
7792 else
7793 #endif
7794 /* Not UTF-8 mode */
7795 {
7796 for (fi = min;; fi++)
7797 {
7798 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7800 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7801 RRETURN(MATCH_NOMATCH);
7802 }
7803 }
7804 /* Control never gets here */
7805 }
7806
7807 /* Maximize case */
7808
7809 else
7810 {
7811 pp = eptr;
7812
7813 #ifdef SUPPORT_UTF8
7814 /* UTF-8 mode */
7815 if (md->utf8)
7816 {
7817 register int d;
7818 for (i = min; i < max; i++)
7819 {
7820 int len = 1;
7821 if (eptr >= md->end_subject) break;
7822 GETCHARLEN(d, eptr, len);
7823 if (d < 256) d = md->lcc[d];
7824 if (fc == d) break;
7825 eptr += len;
7826 }
7827 for(;;)
7828 {
7829 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7831 if (eptr-- == pp) break; /* Stop if tried at original pos */
7832 BACKCHAR(eptr);
7833 }
7834 }
7835 else
7836 #endif
7837 /* Not UTF-8 mode */
7838 {
7839 for (i = min; i < max; i++)
7840 {
7841 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7842 eptr++;
7843 }
7844 while (eptr >= pp)
7845 {
7846 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7848 eptr--;
7849 }
7850 }
7851
7852 RRETURN(MATCH_NOMATCH);
7853 }
7854 /* Control never gets here */
7855 }
7856
7857 /* Caseful comparisons */
7858
7859 else
7860 {
7861 #ifdef SUPPORT_UTF8
7862 /* UTF-8 mode */
7863 if (md->utf8)
7864 {
7865 register int d;
7866 for (i = 1; i <= min; i++)
7867 {
7868 GETCHARINC(d, eptr);
7869 if (fc == d) RRETURN(MATCH_NOMATCH);
7870 }
7871 }
7872 else
7873 #endif
7874 /* Not UTF-8 mode */
7875 {
7876 for (i = 1; i <= min; i++)
7877 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7878 }
7879
7880 if (min == max) continue;
7881
7882 if (minimize)
7883 {
7884 #ifdef SUPPORT_UTF8
7885 /* UTF-8 mode */
7886 if (md->utf8)
7887 {
7888 register int d;
7889 for (fi = min;; fi++)
7890 {
7891 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7892 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7893 GETCHARINC(d, eptr);
7894 if (fi >= max || eptr >= md->end_subject || fc == d)
7895 RRETURN(MATCH_NOMATCH);
7896 }
7897 }
7898 else
7899 #endif
7900 /* Not UTF-8 mode */
7901 {
7902 for (fi = min;; fi++)
7903 {
7904 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7905 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7906 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7907 RRETURN(MATCH_NOMATCH);
7908 }
7909 }
7910 /* Control never gets here */
7911 }
7912
7913 /* Maximize case */
7914
7915 else
7916 {
7917 pp = eptr;
7918
7919 #ifdef SUPPORT_UTF8
7920 /* UTF-8 mode */
7921 if (md->utf8)
7922 {
7923 register int d;
7924 for (i = min; i < max; i++)
7925 {
7926 int len = 1;
7927 if (eptr >= md->end_subject) break;
7928 GETCHARLEN(d, eptr, len);
7929 if (fc == d) break;
7930 eptr += len;
7931 }
7932 for(;;)
7933 {
7934 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7936 if (eptr-- == pp) break; /* Stop if tried at original pos */
7937 BACKCHAR(eptr);
7938 }
7939 }
7940 else
7941 #endif
7942 /* Not UTF-8 mode */
7943 {
7944 for (i = min; i < max; i++)
7945 {
7946 if (eptr >= md->end_subject || fc == *eptr) break;
7947 eptr++;
7948 }
7949 while (eptr >= pp)
7950 {
7951 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7953 eptr--;
7954 }
7955 }
7956
7957 RRETURN(MATCH_NOMATCH);
7958 }
7959 }
7960 /* Control never gets here */
7961
7962 /* Match a single character type repeatedly; several different opcodes
7963 share code. This is very similar to the code for single characters, but we
7964 repeat it in the interests of efficiency. */
7965
7966 case OP_TYPEEXACT:
7967 min = max = GET2(ecode, 1);
7968 minimize = TRUE;
7969 ecode += 3;
7970 goto REPEATTYPE;
7971
7972 case OP_TYPEUPTO:
7973 case OP_TYPEMINUPTO:
7974 min = 0;
7975 max = GET2(ecode, 1);
7976 minimize = *ecode == OP_TYPEMINUPTO;
7977 ecode += 3;
7978 goto REPEATTYPE;
7979
7980 case OP_TYPESTAR:
7981 case OP_TYPEMINSTAR:
7982 case OP_TYPEPLUS:
7983 case OP_TYPEMINPLUS:
7984 case OP_TYPEQUERY:
7985 case OP_TYPEMINQUERY:
7986 c = *ecode++ - OP_TYPESTAR;
7987 minimize = (c & 1) != 0;
7988 min = rep_min[c]; /* Pick up values from tables; */
7989 max = rep_max[c]; /* zero for max => infinity */
7990 if (max == 0) max = INT_MAX;
7991
7992 /* Common code for all repeated single character type matches. Note that
7993 in UTF-8 mode, '.' matches a character of any length, but for the other
7994 character types, the valid characters are all one-byte long. */
7995
7996 REPEATTYPE:
7997 ctype = *ecode++; /* Code for the character type */
7998
7999 #ifdef SUPPORT_UCP
8000 if (ctype == OP_PROP || ctype == OP_NOTPROP)
8001 {
8002 prop_fail_result = ctype == OP_NOTPROP;
8003 prop_type = *ecode++;
8004 if (prop_type >= 128)
8005 {
8006 prop_test_against = prop_type - 128;
8007 prop_test_variable = &prop_category;
8008 }
8009 else
8010 {
8011 prop_test_against = prop_type;
8012 prop_test_variable = &prop_chartype;
8013 }
8014 }
8015 else prop_type = -1;
8016 #endif
8017
8018 /* First, ensure the minimum number of matches are present. Use inline
8019 code for maximizing the speed, and do the type test once at the start
8020 (i.e. keep it out of the loop). Also we can test that there are at least
8021 the minimum number of bytes before we start. This isn't as effective in
8022 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8023 is tidier. Also separate the UCP code, which can be the same for both UTF-8
8024 and single-bytes. */
8025
8026 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8027 if (min > 0)
8028 {
8029 #ifdef SUPPORT_UCP
8030 if (prop_type > 0)
8031 {
8032 for (i = 1; i <= min; i++)
8033 {
8034 GETCHARINC(c, eptr);
8035 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8036 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8037 RRETURN(MATCH_NOMATCH);
8038 }
8039 }
8040
8041 /* Match extended Unicode sequences. We will get here only if the
8042 support is in the binary; otherwise a compile-time error occurs. */
8043
8044 else if (ctype == OP_EXTUNI)
8045 {
8046 for (i = 1; i <= min; i++)
8047 {
8048 GETCHARINCTEST(c, eptr);
8049 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8050 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8051 while (eptr < md->end_subject)
8052 {
8053 int len = 1;
8054 if (!md->utf8) c = *eptr; else
8055 {
8056 GETCHARLEN(c, eptr, len);
8057 }
8058 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8059 if (prop_category != ucp_M) break;
8060 eptr += len;
8061 }
8062 }
8063 }
8064
8065 else
8066 #endif /* SUPPORT_UCP */
8067
8068 /* Handle all other cases when the coding is UTF-8 */
8069
8070 #ifdef SUPPORT_UTF8
8071 if (md->utf8) switch(ctype)
8072 {
8073 case OP_ANY:
8074 for (i = 1; i <= min; i++)
8075 {
8076 if (eptr >= md->end_subject ||
8077 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8078 RRETURN(MATCH_NOMATCH);
8079 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8080 }
8081 break;
8082
8083 case OP_ANYBYTE:
8084 eptr += min;
8085 break;
8086
8087 case OP_NOT_DIGIT:
8088 for (i = 1; i <= min; i++)
8089 {
8090 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8091 GETCHARINC(c, eptr);
8092 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8093 RRETURN(MATCH_NOMATCH);
8094 }
8095 break;
8096
8097 case OP_DIGIT:
8098 for (i = 1; i <= min; i++)
8099 {
8100 if (eptr >= md->end_subject ||
8101 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8102 RRETURN(MATCH_NOMATCH);
8103 /* No need to skip more bytes - we know it's a 1-byte character */
8104 }
8105 break;
8106
8107 case OP_NOT_WHITESPACE:
8108 for (i = 1; i <= min; i++)
8109 {
8110 if (eptr >= md->end_subject ||
8111 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
8112 RRETURN(MATCH_NOMATCH);
8113 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8114 }
8115 break;
8116
8117 case OP_WHITESPACE:
8118 for (i = 1; i <= min; i++)
8119 {
8120 if (eptr >= md->end_subject ||
8121 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8122 RRETURN(MATCH_NOMATCH);
8123 /* No need to skip more bytes - we know it's a 1-byte character */
8124 }
8125 break;
8126
8127 case OP_NOT_WORDCHAR:
8128 for (i = 1; i <= min; i++)
8129 {
8130 if (eptr >= md->end_subject ||
8131 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
8132 RRETURN(MATCH_NOMATCH);
8133 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8134 }
8135 break;
8136
8137 case OP_WORDCHAR:
8138 for (i = 1; i <= min; i++)
8139 {
8140 if (eptr >= md->end_subject ||
8141 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8142 RRETURN(MATCH_NOMATCH);
8143 /* No need to skip more bytes - we know it's a 1-byte character */
8144 }
8145 break;
8146
8147 default:
8148 RRETURN(PCRE_ERROR_INTERNAL);
8149 } /* End switch(ctype) */
8150
8151 else
8152 #endif /* SUPPORT_UTF8 */
8153
8154 /* Code for the non-UTF-8 case for minimum matching of operators other
8155 than OP_PROP and OP_NOTPROP. */
8156
8157 switch(ctype)
8158 {
8159 case OP_ANY:
8160 if ((ims & PCRE_DOTALL) == 0)
8161 {
8162 for (i = 1; i <= min; i++)
8163 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8164 }
8165 else eptr += min;
8166 break;
8167
8168 case OP_ANYBYTE:
8169 eptr += min;
8170 break;
8171
8172 case OP_NOT_DIGIT:
8173 for (i = 1; i <= min; i++)
8174 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8175 break;
8176
8177 case OP_DIGIT:
8178 for (i = 1; i <= min; i++)
8179 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8180 break;
8181
8182 case OP_NOT_WHITESPACE:
8183 for (i = 1; i <= min; i++)
8184 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8185 break;
8186
8187 case OP_WHITESPACE:
8188 for (i = 1; i <= min; i++)
8189 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8190 break;
8191
8192 case OP_NOT_WORDCHAR:
8193 for (i = 1; i <= min; i++)
8194 if ((md->ctypes[*eptr++] & ctype_word) != 0)
8195 RRETURN(MATCH_NOMATCH);
8196 break;
8197
8198 case OP_WORDCHAR:
8199 for (i = 1; i <= min; i++)
8200 if ((md->ctypes[*eptr++] & ctype_word) == 0)
8201 RRETURN(MATCH_NOMATCH);
8202 break;
8203
8204 default:
8205 RRETURN(PCRE_ERROR_INTERNAL);
8206 }
8207 }
8208
8209 /* If min = max, continue at the same level without recursing */
8210
8211 if (min == max) continue;
8212
8213 /* If minimizing, we have to test the rest of the pattern before each
8214 subsequent match. Again, separate the UTF-8 case for speed, and also
8215 separate the UCP cases. */
8216
8217 if (minimize)
8218 {
8219 #ifdef SUPPORT_UCP
8220 if (prop_type > 0)
8221 {
8222 for (fi = min;; fi++)
8223 {
8224 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8225 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8226 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8227 GETCHARINC(c, eptr);
8228 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8229 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8230 RRETURN(MATCH_NOMATCH);
8231 }
8232 }
8233
8234 /* Match extended Unicode sequences. We will get here only if the
8235 support is in the binary; otherwise a compile-time error occurs. */
8236
8237 else if (ctype == OP_EXTUNI)
8238 {
8239 for (fi = min;; fi++)
8240 {
8241 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8242 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8243 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8244 GETCHARINCTEST(c, eptr);
8245 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8246 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8247 while (eptr < md->end_subject)
8248 {
8249 int len = 1;
8250 if (!md->utf8) c = *eptr; else
8251 {
8252 GETCHARLEN(c, eptr, len);
8253 }
8254 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8255 if (prop_category != ucp_M) break;
8256 eptr += len;
8257 }
8258 }
8259 }
8260
8261 else
8262 #endif /* SUPPORT_UCP */
8263
8264 #ifdef SUPPORT_UTF8
8265 /* UTF-8 mode */
8266 if (md->utf8)
8267 {
8268 for (fi = min;; fi++)
8269 {
8270 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8271 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8272 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8273
8274 GETCHARINC(c, eptr);
8275 switch(ctype)
8276 {
8277 case OP_ANY:
8278 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8279 break;
8280
8281 case OP_ANYBYTE:
8282 break;
8283
8284 case OP_NOT_DIGIT:
8285 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8286 RRETURN(MATCH_NOMATCH);
8287 break;
8288
8289 case OP_DIGIT:
8290 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8291 RRETURN(MATCH_NOMATCH);
8292 break;
8293
8294 case OP_NOT_WHITESPACE:
8295 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8296 RRETURN(MATCH_NOMATCH);
8297 break;
8298
8299 case OP_WHITESPACE:
8300 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8301 RRETURN(MATCH_NOMATCH);
8302 break;
8303
8304 case OP_NOT_WORDCHAR:
8305 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8306 RRETURN(MATCH_NOMATCH);
8307 break;
8308
8309 case OP_WORDCHAR:
8310 if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
8311 RRETURN(MATCH_NOMATCH);
8312 break;
8313
8314 default:
8315 RRETURN(PCRE_ERROR_INTERNAL);
8316 }
8317 }
8318 }
8319 else
8320 #endif
8321 /* Not UTF-8 mode */
8322 {
8323 for (fi = min;; fi++)
8324 {
8325 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8326 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8327 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8328 c = *eptr++;
8329 switch(ctype)
8330 {
8331 case OP_ANY:
8332 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8333 break;
8334
8335 case OP_ANYBYTE:
8336 break;
8337
8338 case OP_NOT_DIGIT:
8339 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8340 break;
8341
8342 case OP_DIGIT:
8343 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8344 break;
8345
8346 case OP_NOT_WHITESPACE:
8347 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8348 break;
8349
8350 case OP_WHITESPACE:
8351 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8352 break;
8353
8354 case OP_NOT_WORDCHAR:
8355 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8356 break;
8357
8358 case OP_WORDCHAR:
8359 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8360 break;
8361
8362 default:
8363 RRETURN(PCRE_ERROR_INTERNAL);
8364 }
8365 }
8366 }
8367 /* Control never gets here */
8368 }
8369
8370 /* If maximizing it is worth using inline code for speed, doing the type
8371 test once at the start (i.e. keep it out of the loop). Again, keep the
8372 UTF-8 and UCP stuff separate. */
8373
8374 else
8375 {
8376 pp = eptr; /* Remember where we started */
8377
8378 #ifdef SUPPORT_UCP
8379 if (prop_type > 0)
8380 {
8381 for (i = min; i < max; i++)
8382 {
8383 int len = 1;
8384 if (eptr >= md->end_subject) break;
8385 GETCHARLEN(c, eptr, len);
8386 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8387 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8388 break;
8389 eptr+= len;
8390 }
8391
8392 /* eptr is now past the end of the maximum run */
8393
8394 for(;;)
8395 {
8396 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8397 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8398 if (eptr-- == pp) break; /* Stop if tried at original pos */
8399 BACKCHAR(eptr);
8400 }
8401 }
8402
8403 /* Match extended Unicode sequences. We will get here only if the
8404 support is in the binary; otherwise a compile-time error occurs. */
8405
8406 else if (ctype == OP_EXTUNI)
8407 {
8408 for (i = min; i < max; i++)
8409 {
8410 if (eptr >= md->end_subject) break;
8411 GETCHARINCTEST(c, eptr);
8412 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8413 if (prop_category == ucp_M) break;
8414 while (eptr < md->end_subject)
8415 {
8416 int len = 1;
8417 if (!md->utf8) c = *eptr; else
8418 {
8419 GETCHARLEN(c, eptr, len);
8420 }
8421 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8422 if (prop_category != ucp_M) break;
8423 eptr += len;
8424 }
8425 }
8426
8427 /* eptr is now past the end of the maximum run */
8428
8429 for(;;)
8430 {
8431 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8432 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8433 if (eptr-- == pp) break; /* Stop if tried at original pos */
8434 for (;;) /* Move back over one extended */
8435 {
8436 int len = 1;
8437 BACKCHAR(eptr);
8438 if (!md->utf8) c = *eptr; else
8439 {
8440 GETCHARLEN(c, eptr, len);
8441 }
8442 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8443 if (prop_category != ucp_M) break;
8444 eptr--;
8445 }
8446 }
8447 }
8448
8449 else
8450 #endif /* SUPPORT_UCP */
8451
8452 #ifdef SUPPORT_UTF8
8453 /* UTF-8 mode */
8454
8455 if (md->utf8)
8456 {
8457 switch(ctype)
8458 {
8459 case OP_ANY:
8460
8461 /* Special code is required for UTF8, but when the maximum is unlimited
8462 we don't need it, so we repeat the non-UTF8 code. This is probably
8463 worth it, because .* is quite a common idiom. */
8464
8465 if (max < INT_MAX)
8466 {
8467 if ((ims & PCRE_DOTALL) == 0)
8468 {
8469 for (i = min; i < max; i++)
8470 {
8471 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8472 eptr++;
8473 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8474 }
8475 }
8476 else
8477 {
8478 for (i = min; i < max; i++)
8479 {
8480 eptr++;
8481 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8482 }
8483 }
8484 }
8485
8486 /* Handle unlimited UTF-8 repeat */
8487
8488 else
8489 {
8490 if ((ims & PCRE_DOTALL) == 0)
8491 {
8492 for (i = min; i < max; i++)
8493 {
8494 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8495 eptr++;
8496 }
8497 break;
8498 }
8499 else
8500 {
8501 c = max - min;
8502 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8503 eptr += c;
8504 }
8505 }
8506 break;
8507
8508 /* The byte case is the same as non-UTF8 */
8509
8510 case OP_ANYBYTE:
8511 c = max - min;
8512 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8513 eptr += c;
8514 break;
8515
8516 case OP_NOT_DIGIT:
8517 for (i = min; i < max; i++)
8518 {
8519 int len = 1;
8520 if (eptr >= md->end_subject) break;
8521 GETCHARLEN(c, eptr, len);
8522 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8523 eptr+= len;
8524 }
8525 break;
8526
8527 case OP_DIGIT:
8528 for (i = min; i < max; i++)
8529 {
8530 int len = 1;
8531 if (eptr >= md->end_subject) break;
8532 GETCHARLEN(c, eptr, len);
8533 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8534 eptr+= len;
8535 }
8536 break;
8537
8538 case OP_NOT_WHITESPACE:
8539 for (i = min; i < max; i++)
8540 {
8541 int len = 1;
8542 if (eptr >= md->end_subject) break;
8543 GETCHARLEN(c, eptr, len);
8544 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8545 eptr+= len;
8546 }
8547 break;
8548
8549 case OP_WHITESPACE:
8550 for (i = min; i < max; i++)
8551 {
8552 int len = 1;
8553 if (eptr >= md->end_subject) break;
8554 GETCHARLEN(c, eptr, len);
8555 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8556 eptr+= len;
8557 }
8558 break;
8559
8560 case OP_NOT_WORDCHAR:
8561 for (i = min; i < max; i++)
8562 {
8563 int len = 1;
8564 if (eptr >= md->end_subject) break;
8565 GETCHARLEN(c, eptr, len);
8566 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8567 eptr+= len;
8568 }
8569 break;
8570
8571 case OP_WORDCHAR:
8572 for (i = min; i < max; i++)
8573 {
8574 int len = 1;
8575 if (eptr >= md->end_subject) break;
8576 GETCHARLEN(c, eptr, len);
8577 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8578 eptr+= len;
8579 }
8580 break;
8581
8582 default:
8583 RRETURN(PCRE_ERROR_INTERNAL);
8584 }
8585
8586 /* eptr is now past the end of the maximum run */
8587
8588 for(;;)
8589 {
8590 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8592 if (eptr-- == pp) break; /* Stop if tried at original pos */
8593 BACKCHAR(eptr);
8594 }
8595 }
8596 else
8597 #endif
8598
8599 /* Not UTF-8 mode */
8600 {
8601 switch(ctype)
8602 {
8603 case OP_ANY:
8604 if ((ims & PCRE_DOTALL) == 0)
8605 {
8606 for (i = min; i < max; i++)
8607 {
8608 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8609 eptr++;
8610 }
8611 break;
8612 }
8613 /* For DOTALL case, fall through and treat as \C */
8614
8615 case OP_ANYBYTE:
8616 c = max - min;
8617 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8618 eptr += c;
8619 break;
8620
8621 case OP_NOT_DIGIT:
8622 for (i = min; i < max; i++)
8623 {
8624 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8625 break;
8626 eptr++;
8627 }
8628 break;
8629
8630 case OP_DIGIT:
8631 for (i = min; i < max; i++)
8632 {
8633 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8634 break;
8635 eptr++;
8636 }
8637 break;
8638
8639 case OP_NOT_WHITESPACE:
8640 for (i = min; i < max; i++)
8641 {
8642 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8643 break;
8644 eptr++;
8645 }
8646 break;
8647
8648 case OP_WHITESPACE:
8649 for (i = min; i < max; i++)
8650 {
8651 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8652 break;
8653 eptr++;
8654 }
8655 break;
8656
8657 case OP_NOT_WORDCHAR:
8658 for (i = min; i < max; i++)
8659 {
8660 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8661 break;
8662 eptr++;
8663 }
8664 break;
8665
8666 case OP_WORDCHAR:
8667 for (i = min; i < max; i++)
8668 {
8669 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8670 break;
8671 eptr++;
8672 }
8673 break;
8674
8675 default:
8676 RRETURN(PCRE_ERROR_INTERNAL);
8677 }
8678
8679 /* eptr is now past the end of the maximum run */
8680
8681 while (eptr >= pp)
8682 {
8683 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8684 eptr--;
8685 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8686 }
8687 }
8688
8689 /* Get here if we can't make it match with any permitted repetitions */
8690
8691 RRETURN(MATCH_NOMATCH);
8692 }
8693 /* Control never gets here */
8694
8695 /* There's been some horrible disaster. Since all codes > OP_BRA are
8696 for capturing brackets, and there shouldn't be any gaps between 0 and
8697 OP_BRA, arrival here can only mean there is something seriously wrong
8698 in the code above or the OP_xxx definitions. */
8699
8700 default:
8701 DPRINTF(("Unknown opcode %d\n", *ecode));
8702 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8703 }
8704
8705 /* Do not stick any code in here without much thought; it is assumed
8706 that "continue" in the code above comes out to here to repeat the main
8707 loop. */
8708
8709 } /* End of main loop */
8710 /* Control never reaches here */
8711 }
8712
8713
/***************************************************************************
****************************************************************************
                   RECURSION IN THE match() FUNCTION

The macros that were set up earlier in this file in connection with recursion
in match() are no longer wanted once that function is complete, so they are
all removed here. The names are listed alphabetically; the order in which
macros are undefined makes no difference. */

#if defined(NO_RECURSE)

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef length
#undef max
#undef min
#undef minimize
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef original_ims
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* The remaining two names are macros whether or not NO_RECURSE is set, so
they are removed unconditionally. */

#undef fi
#undef fc

/***************************************************************************
***************************************************************************/
8769
8770
8771
8772 /*************************************************
8773 * Execute a Regular Expression *
8774 *************************************************/
8775
8776 /* This function applies a compiled re to a subject string and picks out
8777 portions of the string if it matches. Two elements in the vector are set for
8778 each substring: the offsets to the start and end of the substring.
8779
8780 Arguments:
8781 argument_re points to the compiled expression
8782 extra_data points to extra data or is NULL
8783 subject points to the subject string
8784 length length of subject string (may contain binary zeros)
8785 start_offset where to start in the subject string
8786 options option bits
8787 offsets points to a vector of ints to be filled in with offsets
8788 offsetcount the number of elements in the vector
8789
8790 Returns: > 0 => success; value is the number of elements filled in
8791 = 0 => success, but offsets is not big enough
8792 -1 => failed to match
8793 < -1 => some kind of unexpected problem
8794 */
8795
8796 EXPORT int
8797 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8798 const char *subject, int length, int start_offset, int options, int *offsets,
8799 int offsetcount)
8800 {
8801 int rc, resetcount, ocount;
8802 int first_byte = -1;
8803 int req_byte = -1;
8804 int req_byte2 = -1;
8805 unsigned long int ims = 0;
8806 BOOL using_temporary_offsets = FALSE;
8807 BOOL anchored;
8808 BOOL startline;
8809 BOOL first_byte_caseless = FALSE;
8810 BOOL req_byte_caseless = FALSE;
8811 match_data match_block;
8812 const uschar *tables;
8813 const uschar *start_bits = NULL;
8814 const uschar *start_match = (const uschar *)subject + start_offset;
8815 const uschar *end_subject;
8816 const uschar *req_byte_ptr = start_match - 1;
8817
8818 pcre_study_data internal_study;
8819 const pcre_study_data *study;
8820
8821 real_pcre internal_re;
8822 const real_pcre *external_re = (const real_pcre *)argument_re;
8823 const real_pcre *re = external_re;
8824
8825 /* Plausibility checks */
8826
8827 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8828 if (re == NULL || subject == NULL ||
8829 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8830 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8831
8832 /* Fish out the optional data from the extra_data structure, first setting
8833 the default values. */
8834
8835 study = NULL;
8836 match_block.match_limit = MATCH_LIMIT;
8837 match_block.callout_data = NULL;
8838
8839 /* The table pointer is always in native byte order. */
8840
8841 tables = external_re->tables;
8842
8843 if (extra_data != NULL)
8844 {
8845 register unsigned int flags = extra_data->flags;
8846 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8847 study = (const pcre_study_data *)extra_data->study_data;
8848 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8849 match_block.match_limit = extra_data->match_limit;
8850 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8851 match_block.callout_data = extra_data->callout_data;
8852 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8853 }
8854
8855 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
8856 is a feature that makes it possible to save compiled regex and re-use them
8857 in other programs later. */
8858
8859 if (tables == NULL) tables = pcre_default_tables;
8860
8861 /* Check that the first field in the block is the magic number. If it is not,
8862 test for a regex that was compiled on a host of opposite endianness. If this is
8863 the case, flipped values are put in internal_re and internal_study if there was
8864 study data too. */
8865
8866 if (re->magic_number != MAGIC_NUMBER)
8867 {
8868 re = try_flipped(re, &internal_re, study, &internal_study);
8869 if (re == NULL) return PCRE_ERROR_BADMAGIC;
8870 if (study != NULL) study = &internal_study;
8871 }
8872
8873 /* Set up other data */
8874
8875 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8876 startline = (re->options & PCRE_STARTLINE) != 0;
8877
8878 /* The code starts after the real_pcre block and the capture name table. */
8879
8880 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8881 re->name_count * re->name_entry_size;
8882
8883 match_block.start_subject = (const uschar *)subject;
8884 match_block.start_offset = start_offset;
8885 match_block.end_subject = match_block.start_subject + length;
8886 end_subject = match_block.end_subject;
8887
8888 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8889 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8890
8891 match_block.notbol = (options & PCRE_NOTBOL) != 0;
8892 match_block.noteol = (options & PCRE_NOTEOL) != 0;
8893 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8894 match_block.partial = (options & PCRE_PARTIAL) != 0;
8895 match_block.hitend = FALSE;
8896
8897 match_block.recursive = NULL; /* No recursion at top level */
8898
8899 match_block.lcc = tables + lcc_offset;
8900 match_block.ctypes = tables + ctypes_offset;
8901
8902 /* Partial matching is supported only for a restricted set of regexes at the
8903 moment. */
8904
8905 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8906 return PCRE_ERROR_BADPARTIAL;
8907
8908 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
8909 back the character offset. */
8910
8911 #ifdef SUPPORT_UTF8
8912 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8913 {
8914 if (valid_utf8((uschar *)subject, length) >= 0)
8915 return PCRE_ERROR_BADUTF8;
8916 if (start_offset > 0 && start_offset < length)
8917 {
8918 int tb = ((uschar *)subject)[start_offset];
8919 if (tb > 127)
8920 {
8921 tb &= 0xc0;
8922 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8923 }
8924 }
8925 }
8926 #endif
8927
8928 /* The ims options can vary during the matching as a result of the presence
8929 of (?ims) items in the pattern. They are kept in a local variable so that
8930 restoring at the exit of a group is easy. */
8931
8932 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8933
8934 /* If the expression has got more back references than the offsets supplied can
8935 hold, we get a temporary chunk of working store to use during the matching.
8936 Otherwise, we can use the vector supplied, rounding down its size to a multiple
8937 of 3. */
8938
8939 ocount = offsetcount - (offsetcount % 3);
8940
8941 if (re->top_backref > 0 && re->top_backref >= ocount/3)
8942 {
8943 ocount = re->top_backref * 3 + 3;
8944 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8945 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8946 using_temporary_offsets = TRUE;
8947 DPRINTF(("Got memory to hold back references\n"));
8948 }
8949 else match_block.offset_vector = offsets;
8950
8951 match_block.offset_end = ocount;
8952 match_block.offset_max = (2*ocount)/3;
8953 match_block.offset_overflow = FALSE;
8954 match_block.capture_last = -1;
8955
8956 /* Compute the minimum number of offsets that we need to reset each time. Doing
8957 this makes a huge difference to execution time when there aren't many brackets
8958 in the pattern. */
8959
8960 resetcount = 2 + re->top_bracket * 2;
8961 if (resetcount > offsetcount) resetcount = ocount;
8962
8963 /* Reset the working variable associated with each extraction. These should
8964 never be used unless previously set, but they get saved and restored, and so we
8965 initialize them to avoid reading uninitialized locations. */
8966
8967 if (match_block.offset_vector != NULL)
8968 {
8969 register int *iptr = match_block.offset_vector + ocount;
8970 register int *iend = iptr - resetcount/2 + 1;
8971 while (--iptr >= iend) *iptr = -1;
8972 }
8973
8974 /* Set up the first character to match, if available. The first_byte value is
8975 never set for an anchored regular expression, but the anchoring may be forced
8976 at run time, so we have to test for anchoring. The first char may be unset for
8977 an unanchored pattern, of course. If there's no first char and the pattern was
8978 studied, there may be a bitmap of possible first characters. */
8979
8980 if (!anchored)
8981 {
8982 if ((re->options & PCRE_FIRSTSET) != 0)
8983 {
8984 first_byte = re->first_byte & 255;
8985 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8986 first_byte = match_block.lcc[first_byte];
8987 }
8988 else
8989 if (!startline && study != NULL &&
8990 (study->options & PCRE_STUDY_MAPPED) != 0)
8991 start_bits = study->start_bits;
8992 }
8993
8994 /* For anchored or unanchored matches, there may be a "last known required
8995 character" set. */
8996
8997 if ((re->options & PCRE_REQCHSET) != 0)
8998 {
8999 req_byte = re->req_byte & 255;
9000 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9001 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
9002 }
9003
9004 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
9005 the loop runs just once. */
9006
9007 do
9008 {
9009 /* Reset the maximum number of extractions we might see. */
9010
9011 if (match_block.offset_vector != NULL)
9012 {
9013 register int *iptr = match_block.offset_vector;
9014 register int *iend = iptr + resetcount;
9015 while (iptr < iend) *iptr++ = -1;
9016 }
9017
9018 /* Advance to a unique first char if possible */
9019
9020 if (first_byte >= 0)
9021 {
9022 if (first_byte_caseless)
9023 while (start_match < end_subject &&
9024 match_block.lcc[*start_match] != first_byte)
9025 start_match++;
9026 else
9027 while (start_match < end_subject && *start_match != first_byte)
9028 start_match++;
9029 }
9030
9031 /* Or to just after \n for a multiline match if possible */
9032
9033 else if (startline)
9034 {
9035 if (start_match > match_block.start_subject + start_offset)
9036 {
9037 while (start_match < end_subject && start_match[-1] != NEWLINE)
9038 start_match++;
9039 }
9040 }
9041
9042 /* Or to a non-unique first char after study */
9043
9044 else if (start_bits != NULL)
9045 {
9046 while (start_match < end_subject)
9047 {
9048 register unsigned int c = *start_match;
9049 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9050 }
9051 }
9052
9053 #ifdef DEBUG /* Sigh. Some compilers never learn. */
9054 printf(">>>> Match against: ");
9055 pchars(start_match, end_subject - start_match, TRUE, &match_block);
9056 printf("\n");
9057 #endif
9058
9059 /* If req_byte is set, we know that that character must appear in the subject
9060 for the match to succeed. If the first character is set, req_byte must be
9061 later in the subject; otherwise the test starts at the match point. This
9062 optimization can save a huge amount of backtracking in patterns with nested
9063 unlimited repeats that aren't going to match. Writing separate code for
9064 cased/caseless versions makes it go faster, as does using an autoincrement
9065 and backing off on a match.
9066
9067 HOWEVER: when the subject string is very, very long, searching to its end can
9068 take a long time, and give bad performance on quite ordinary patterns. This
9069 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9070 don't do this when the string is sufficiently long.
9071
9072 ALSO: this processing is disabled when partial matching is requested.
9073 */
9074
9075 if (req_byte >= 0 &&
9076 end_subject - start_match < REQ_BYTE_MAX &&
9077 !match_block.partial)
9078 {
9079 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9080
9081 /* We don't need to repeat the search if we haven't yet reached the
9082 place we found it at last time. */
9083
9084 if (p > req_byte_ptr)
9085 {
9086 if (req_byte_caseless)
9087 {
9088 while (p < end_subject)
9089 {
9090 register int pp = *p++;
9091 if (pp == req_byte || pp == req_byte2) { p--; break; }
9092 }
9093 }
9094 else
9095 {
9096 while (p < end_subject)
9097 {
9098 if (*p++ == req_byte) { p--; break; }
9099 }
9100 }
9101
9102 /* If we can't find the required character, break the matching loop */
9103
9104 if (p >= end_subject) break;
9105
9106 /* If we have found the required character, save the point where we
9107 found it, so that we don't search again next time round the loop if
9108 the start hasn't passed this character yet. */
9109
9110 req_byte_ptr = p;
9111 }
9112 }
9113
9114 /* When a match occurs, substrings will be set for all internal extractions;
9115 we just need to set up the whole thing as substring 0 before returning. If
9116 there were too many extractions, set the return code to zero. In the case
9117 where we had to get some local store to hold offsets for backreferences, copy
9118 those back references that we can. In this case there need not be overflow
9119 if certain parts of the pattern were not used. */
9120
9121 match_block.start_match = start_match;
9122 match_block.match_call_count = 0;
9123
9124 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9125 match_isgroup);
9126
9127 if (rc == MATCH_NOMATCH)
9128 {
9129 start_match++;
9130 #ifdef SUPPORT_UTF8
9131 if (match_block.utf8)
9132 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9133 start_match++;
9134 #endif
9135 continue;
9136 }
9137
9138 if (rc != MATCH_MATCH)
9139 {
9140 DPRINTF((">>>> error: returning %d\n", rc));
9141 return rc;
9142 }
9143
9144 /* We have a match! Copy the offset information from temporary store if
9145 necessary */
9146
9147 if (using_temporary_offsets)
9148 {
9149 if (offsetcount >= 4)
9150 {
9151 memcpy(offsets + 2, match_block.offset_vector + 2,
9152 (offsetcount - 2) * sizeof(int));
9153 DPRINTF(("Copied offsets from temporary memory\n"));
9154 }
9155 if (match_block.end_offset_top > offsetcount)
9156 match_block.offset_overflow = TRUE;
9157
9158 DPRINTF(("Freeing temporary memory\n"));
9159 (pcre_free)(match_block.offset_vector);
9160 }
9161
9162 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9163
9164 if (offsetcount < 2) rc = 0; else
9165 {
9166 offsets[0] = start_match - match_block.start_subject;
9167 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9168 }
9169
9170 DPRINTF((">>>> returning %d\n", rc));
9171 return rc;
9172 }
9173
9174 /* This "while" is the end of the "do" above */
9175
9176 while (!anchored && start_match <= end_subject);
9177
9178 if (using_temporary_offsets)
9179 {
9180 DPRINTF(("Freeing temporary memory\n"));
9181 (pcre_free)(match_block.offset_vector);
9182 }
9183
9184 if (match_block.partial && match_block.hitend)
9185 {
9186 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9187 return PCRE_ERROR_PARTIAL;
9188 }
9189 else
9190 {
9191 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9192 return PCRE_ERROR_NOMATCH;
9193 }
9194 }
9195
9196 /* End of pcre.c */