Added -dd for daemon debugging.
[exim.git] / src / src / pcre / pcre.c
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2004 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Redistribution and use in source and binary forms, with or without
16 modification, are permitted provided that the following conditions are met:
17
18 * Redistributions of source code must retain the above copyright notice,
19 this list of conditions and the following disclaimer.
20
21 * Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
24
25 * Neither the name of the University of Cambridge nor the names of its
26 contributors may be used to endorse or promote products derived from
27 this software without specific prior written permission.
28
29 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 POSSIBILITY OF SUCH DAMAGE.
40 -----------------------------------------------------------------------------
41 */
42
43
44 /* Define DEBUG to get debugging output on stdout. */
45 /* #define DEBUG */
46
47 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
48 inline, and there are *still* stupid compilers about that don't like indented
49 pre-processor statements. I suppose it's only been 10 years... */
50
51 #ifdef DEBUG
52 #define DPRINTF(p) printf p
53 #else
54 #define DPRINTF(p) /*nothing*/
55 #endif
56
57 /* Include the internals header, which itself includes "config.h", the Standard
58 C headers, and the external pcre header. */
59
60 #include "internal.h"
61
62 /* If Unicode Property support is wanted, include a private copy of the
63 function that does it, and the table that translates names to numbers. */
64
65 #ifdef SUPPORT_UCP
66 #include "ucp.c"
67 #include "ucptypetable.c"
68 #endif
69
70 /* Maximum number of items on the nested bracket stacks at compile time. This
71 applies to the nesting of all kinds of parentheses. It does not limit
72 un-nested, non-capturing parentheses. This number can be made bigger if
73 necessary - it is used to dimension one int and one unsigned char vector at
74 compile time. */
75
76 #define BRASTACK_SIZE 200
77
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
85
86 /* The maximum remaining length of subject we are prepared to search for a
87 req_byte match. */
88
89 #define REQ_BYTE_MAX 1000
90
91
92 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93 the definition is next to the definition of the opcodes in internal.h. */
94
95 static const uschar OP_lengths[] = { OP_LENGTHS };
96
97 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
98
99 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
105 is invalid. */
106
107 #if !EBCDIC /* This is the "normal" table for ASCII systems */
108 static const short int escapes[] = {
109 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
110 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
111 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
112 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
113 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
114 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
115 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
116 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
117 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
118 0, 0, -ESC_z /* x - z */
119 };
120
121 #else /* This is the "abnormal" table for EBCDIC systems */
122 static const short int escapes[] = {
123 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
124 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
125 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
126 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
127 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
128 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
129 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
130 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
131 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
132 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
133 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
134 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
135 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
136 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
137 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
138 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
139 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
140 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
141 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
142 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
143 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
144 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
145 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
146 };
147 #endif
148
149
150 /* Tables of names of POSIX character classes and their lengths. The list is
151 terminated by a zero length entry. The first three must be alpha, lower, upper,
152 as this is assumed for handling case independence. */
153
154 static const char *const posix_names[] = {
155 "alpha", "lower", "upper",
156 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
157 "print", "punct", "space", "word", "xdigit" };
158
159 static const uschar posix_name_lengths[] = {
160 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
161
162 /* Table of class bit maps for each POSIX class; up to three may be combined
163 to form the class. The table for [:blank:] is dynamically modified to remove
164 the vertical space characters. */
165
166 static const int posix_class_maps[] = {
167 cbit_lower, cbit_upper, -1, /* alpha */
168 cbit_lower, -1, -1, /* lower */
169 cbit_upper, -1, -1, /* upper */
170 cbit_digit, cbit_lower, cbit_upper, /* alnum */
171 cbit_print, cbit_cntrl, -1, /* ascii */
172 cbit_space, -1, -1, /* blank - a GNU extension */
173 cbit_cntrl, -1, -1, /* cntrl */
174 cbit_digit, -1, -1, /* digit */
175 cbit_graph, -1, -1, /* graph */
176 cbit_print, -1, -1, /* print */
177 cbit_punct, -1, -1, /* punct */
178 cbit_space, -1, -1, /* space */
179 cbit_word, -1, -1, /* word - a Perl extension */
180 cbit_xdigit,-1, -1 /* xdigit */
181 };
182
183 /* Table to identify digits and hex digits. This is used when compiling
184 patterns. Note that the tables in chartables are dependent on the locale, and
185 may mark arbitrary characters as digits - but the PCRE compiling code expects
186 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
187 a private table here. It costs 256 bytes, but it is a lot faster than doing
188 character value tests (at least in some simple cases I timed), and in some
189 applications one wants PCRE to compile efficiently as well as match
190 efficiently.
191
192 For convenience, we use the same bit definitions as in chartables:
193
194 0x04 decimal digit
195 0x08 hexadecimal digit
196
197 Then we can use ctype_digit and ctype_xdigit in the code. */
198
199 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
200 static const unsigned char digitab[] =
201 {
202 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
203 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
204 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
205 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
206 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
207 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
208 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
209 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
210 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
211 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
212 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
213 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
214 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
215 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
216 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
217 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
218 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
219 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
220 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
221 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
222 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
223 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
224 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
225 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
234
235 #else /* This is the "abnormal" case, for EBCDIC systems */
236 static const unsigned char digitab[] =
237 {
238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
254 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
261 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
262 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
268 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
269 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
270
271 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
272 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
273 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
274 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
276 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
280 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
281 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
283 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
285 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
288 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
289 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
290 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
291 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
292 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
293 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
294 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
295 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
296 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
297 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
298 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
299 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
300 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
301 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
302 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
303 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
304 #endif
305
306
307 /* Definition to allow mutual recursion */
308
309 static BOOL
310 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
311 BOOL, int, int *, int *, branch_chain *, compile_data *);
312
313 /* Structure for building a chain of data that actually lives on the
314 stack, for holding the values of the subject pointer at the start of each
315 subpattern, so as to detect when an empty string has been matched by a
316 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
317 are on the heap, not on the stack. */
318
319 typedef struct eptrblock {
320 struct eptrblock *epb_prev;
321 const uschar *epb_saved_eptr;
322 } eptrblock;
323
324 /* Flag bits for the match() function */
325
326 #define match_condassert 0x01 /* Called to check a condition assertion */
327 #define match_isgroup 0x02 /* Set if start of bracketed group */
328
329 /* Non-error returns from the match() function. Error returns are externally
330 defined PCRE_ERROR_xxx codes, which are all negative. */
331
332 #define MATCH_MATCH 1
333 #define MATCH_NOMATCH 0
334
335
336
337 /*************************************************
338 * Global variables *
339 *************************************************/
340
341 /* PCRE is thread-clean and doesn't use any global variables in the normal
342 sense. However, it calls memory allocation and free functions via the four
343 indirections below, and it can optionally do callouts. These values can be
344 changed by the caller, but are shared between all threads. However, when
345 compiling for Virtual Pascal, things are done differently (see pcre.in). */
346
347 #ifndef VPCOMPAT
348 #ifdef __cplusplus
349 extern "C" void *(*pcre_malloc)(size_t) = malloc;
350 extern "C" void (*pcre_free)(void *) = free;
351 extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
352 extern "C" void (*pcre_stack_free)(void *) = free;
353 extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
354 #else
355 void *(*pcre_malloc)(size_t) = malloc;
356 void (*pcre_free)(void *) = free;
357 void *(*pcre_stack_malloc)(size_t) = malloc;
358 void (*pcre_stack_free)(void *) = free;
359 int (*pcre_callout)(pcre_callout_block *) = NULL;
360 #endif
361 #endif
362
363
364 /*************************************************
365 * Macros and tables for character handling *
366 *************************************************/
367
368 /* When UTF-8 encoding is being used, a character is no longer just a single
369 byte. The macros for character handling generate simple sequences when used in
370 byte-mode, and more complicated ones for UTF-8 characters. */
371
372 #ifndef SUPPORT_UTF8
373 #define GETCHAR(c, eptr) c = *eptr;
374 #define GETCHARINC(c, eptr) c = *eptr++;
375 #define GETCHARINCTEST(c, eptr) c = *eptr++;
376 #define GETCHARLEN(c, eptr, len) c = *eptr;
377 #define BACKCHAR(eptr)
378
379 #else /* SUPPORT_UTF8 */
380
381 /* Get the next UTF-8 character, not advancing the pointer. This is called when
382 we know we are in UTF-8 mode. */
383
384 #define GETCHAR(c, eptr) \
385 c = *eptr; \
386 if ((c & 0xc0) == 0xc0) \
387 { \
388 int gcii; \
389 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
390 int gcss = 6*gcaa; \
391 c = (c & utf8_table3[gcaa]) << gcss; \
392 for (gcii = 1; gcii <= gcaa; gcii++) \
393 { \
394 gcss -= 6; \
395 c |= (eptr[gcii] & 0x3f) << gcss; \
396 } \
397 }
398
399 /* Get the next UTF-8 character, advancing the pointer. This is called when we
400 know we are in UTF-8 mode. */
401
402 #define GETCHARINC(c, eptr) \
403 c = *eptr++; \
404 if ((c & 0xc0) == 0xc0) \
405 { \
406 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
407 int gcss = 6*gcaa; \
408 c = (c & utf8_table3[gcaa]) << gcss; \
409 while (gcaa-- > 0) \
410 { \
411 gcss -= 6; \
412 c |= (*eptr++ & 0x3f) << gcss; \
413 } \
414 }
415
416 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
417
418 #define GETCHARINCTEST(c, eptr) \
419 c = *eptr++; \
420 if (md->utf8 && (c & 0xc0) == 0xc0) \
421 { \
422 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
423 int gcss = 6*gcaa; \
424 c = (c & utf8_table3[gcaa]) << gcss; \
425 while (gcaa-- > 0) \
426 { \
427 gcss -= 6; \
428 c |= (*eptr++ & 0x3f) << gcss; \
429 } \
430 }
431
432 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
433 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
434
435 #define GETCHARLEN(c, eptr, len) \
436 c = *eptr; \
437 if ((c & 0xc0) == 0xc0) \
438 { \
439 int gcii; \
440 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
441 int gcss = 6*gcaa; \
442 c = (c & utf8_table3[gcaa]) << gcss; \
443 for (gcii = 1; gcii <= gcaa; gcii++) \
444 { \
445 gcss -= 6; \
446 c |= (eptr[gcii] & 0x3f) << gcss; \
447 } \
448 len += gcaa; \
449 }
450
451 /* If the pointer is not at the start of a character, move it back until
452 it is. Called only in UTF-8 mode. */
453
454 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
455
456 #endif
457
458
459
460 /*************************************************
461 * Default character tables *
462 *************************************************/
463
464 /* A default set of character tables is included in the PCRE binary. Its source
465 is built by the maketables auxiliary program, which uses the default C ctypes
466 functions, and put in the file chartables.c. These tables are used by PCRE
467 whenever the caller of pcre_compile() does not provide an alternate set of
468 tables. */
469
470 #include "chartables.c"
471
472
473
474 #ifdef SUPPORT_UTF8
475 /*************************************************
476 * Tables for UTF-8 support *
477 *************************************************/
478
479 /* These are the breakpoints for different numbers of bytes in a UTF-8
480 character. */
481
482 static const int utf8_table1[] =
483 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
484
485 /* These are the indicator bits and the mask for the data bits to set in the
486 first byte of a character, indexed by the number of additional bytes. */
487
488 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
489 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
490
491 /* Table of the number of extra bytes, indexed by the first byte of the
492 character masked with 0x3f. The highest index that a valid UTF-8 character
493 can produce is in fact 0x3d. */
494
495 static const uschar utf8_table4[] = {
496 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
497 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
498 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
499 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
500
501
502 /*************************************************
503 * Convert character value to UTF-8 *
504 *************************************************/
505
506 /* This function takes an integer value in the range 0 - 0x7fffffff
507 and encodes it as a UTF-8 character in 0 to 6 bytes.
508
509 Arguments:
510 cvalue the character value
511 buffer pointer to buffer for result - at least 6 bytes long
512
513 Returns: number of characters placed in the buffer
514 */
515
516 static int
517 ord2utf8(int cvalue, uschar *buffer)
518 {
519 register int i, j;
520 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
521 if (cvalue <= utf8_table1[i]) break;
522 buffer += i;
523 for (j = i; j > 0; j--)
524 {
525 *buffer-- = 0x80 | (cvalue & 0x3f);
526 cvalue >>= 6;
527 }
528 *buffer = utf8_table2[i] | cvalue;
529 return i + 1;
530 }
531 #endif
532
533
534
535 /*************************************************
536 * Print compiled regex *
537 *************************************************/
538
539 /* The code for doing this is held in a separate file that is also included in
540 pcretest.c. It defines a function called print_internals(). */
541
542 #ifdef DEBUG
543 #include "printint.c"
544 #endif
545
546
547
548 /*************************************************
549 * Return version string *
550 *************************************************/
551
552 #define STRING(a) # a
553 #define XSTRING(s) STRING(s)
554
555 EXPORT const char *
556 pcre_version(void)
557 {
558 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
559 }
560
561
562
563
564 /*************************************************
565 * Flip bytes in an integer *
566 *************************************************/
567
/* This function is called when the magic number in a regex doesn't match, in
order to flip its bytes to see if we are dealing with a pattern that was
compiled on a host of different endianness. If so, this function is also used
to flip the other integer fields.

The work is done on an unsigned copy of the value: shifting a byte into the
sign bit of a signed long is undefined behaviour when long is 32 bits wide.
Only the low-order n bytes of the value contribute to the result.

Arguments:
  value        the number to flip
  n            the number of bytes to flip (assumed to be 2 or 4)

Returns:       the flipped value
*/

static long int
byteflip(long int value, int n)
{
  unsigned long int v = (unsigned long int)value;

  if (n == 2)
    return (long int)(((v & 0x00ffUL) << 8) | ((v & 0xff00UL) >> 8));

  return (long int)(((v & 0x000000ffUL) << 24) |
                    ((v & 0x0000ff00UL) <<  8) |
                    ((v & 0x00ff0000UL) >>  8) |
                    ((v & 0xff000000UL) >> 24));
}
589
590
591
592 /*************************************************
593 * Test for a byte-flipped compiled regex *
594 *************************************************/
595
596 /* This function is called from pce_exec() and also from pcre_fullinfo(). Its
597 job is to test whether the regex is byte-flipped - that is, it was compiled on
598 a system of opposite endianness. The function is called only when the native
599 MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
600 relevant values into a different data block, and return it.
601
602 Arguments:
603 re points to the regex
604 study points to study data, or NULL
605 internal_re points to a new regex block
606 internal_study points to a new study block
607
608 Returns: the new block if is is indeed a byte-flipped regex
609 NULL if it is not
610 */
611
612 static real_pcre *
613 try_flipped(const real_pcre *re, real_pcre *internal_re,
614 const pcre_study_data *study, pcre_study_data *internal_study)
615 {
616 if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
617 return NULL;
618
619 *internal_re = *re; /* To copy other fields */
620 internal_re->size = byteflip(re->size, sizeof(re->size));
621 internal_re->options = byteflip(re->options, sizeof(re->options));
622 internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
623 internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
624 internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
625 internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
626 internal_re->name_table_offset = byteflip(re->name_table_offset,
627 sizeof(re->name_table_offset));
628 internal_re->name_entry_size = byteflip(re->name_entry_size,
629 sizeof(re->name_entry_size));
630 internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
631
632 if (study != NULL)
633 {
634 *internal_study = *study; /* To copy other fields */
635 internal_study->size = byteflip(study->size, sizeof(study->size));
636 internal_study->options = byteflip(study->options, sizeof(study->options));
637 }
638
639 return internal_re;
640 }
641
642
643
644 /*************************************************
645 * (Obsolete) Return info about compiled pattern *
646 *************************************************/
647
648 /* This is the original "info" function. It picks potentially useful data out
649 of the private structure, but its interface was too rigid. It remains for
650 backwards compatibility. The public options are passed back in an int - though
651 the re->options field has been expanded to a long int, all the public options
652 at the low end of it, and so even on 16-bit systems this will still be OK.
653 Therefore, I haven't changed the API for pcre_info().
654
655 Arguments:
656 argument_re points to compiled code
657 optptr where to pass back the options
658 first_byte where to pass back the first character,
659 or -1 if multiline and all branches start ^,
660 or -2 otherwise
661
662 Returns: number of capturing subpatterns
663 or negative values on error
664 */
665
666 EXPORT int
667 pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
668 {
669 real_pcre internal_re;
670 const real_pcre *re = (const real_pcre *)argument_re;
671 if (re == NULL) return PCRE_ERROR_NULL;
672 if (re->magic_number != MAGIC_NUMBER)
673 {
674 re = try_flipped(re, &internal_re, NULL, NULL);
675 if (re == NULL) return PCRE_ERROR_BADMAGIC;
676 }
677 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
678 if (first_byte != NULL)
679 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
680 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
681 return re->top_bracket;
682 }
683
684
685
686 /*************************************************
687 * Return info about compiled pattern *
688 *************************************************/
689
690 /* This is a newer "info" function which has an extensible interface so
691 that additional items can be added compatibly.
692
693 Arguments:
694 argument_re points to compiled code
695 extra_data points extra data, or NULL
696 what what information is required
697 where where to put the information
698
699 Returns: 0 if data returned, negative on error
700 */
701
702 EXPORT int
703 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
704 void *where)
705 {
706 real_pcre internal_re;
707 pcre_study_data internal_study;
708 const real_pcre *re = (const real_pcre *)argument_re;
709 const pcre_study_data *study = NULL;
710
711 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
712
713 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
714 study = (const pcre_study_data *)extra_data->study_data;
715
716 if (re->magic_number != MAGIC_NUMBER)
717 {
718 re = try_flipped(re, &internal_re, study, &internal_study);
719 if (re == NULL) return PCRE_ERROR_BADMAGIC;
720 if (study != NULL) study = &internal_study;
721 }
722
723 switch (what)
724 {
725 case PCRE_INFO_OPTIONS:
726 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
727 break;
728
729 case PCRE_INFO_SIZE:
730 *((size_t *)where) = re->size;
731 break;
732
733 case PCRE_INFO_STUDYSIZE:
734 *((size_t *)where) = (study == NULL)? 0 : study->size;
735 break;
736
737 case PCRE_INFO_CAPTURECOUNT:
738 *((int *)where) = re->top_bracket;
739 break;
740
741 case PCRE_INFO_BACKREFMAX:
742 *((int *)where) = re->top_backref;
743 break;
744
745 case PCRE_INFO_FIRSTBYTE:
746 *((int *)where) =
747 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
748 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
749 break;
750
751 /* Make sure we pass back the pointer to the bit vector in the external
752 block, not the internal copy (with flipped integer fields). */
753
754 case PCRE_INFO_FIRSTTABLE:
755 *((const uschar **)where) =
756 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
757 ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
758 break;
759
760 case PCRE_INFO_LASTLITERAL:
761 *((int *)where) =
762 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
763 break;
764
765 case PCRE_INFO_NAMEENTRYSIZE:
766 *((int *)where) = re->name_entry_size;
767 break;
768
769 case PCRE_INFO_NAMECOUNT:
770 *((int *)where) = re->name_count;
771 break;
772
773 case PCRE_INFO_NAMETABLE:
774 *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
775 break;
776
777 case PCRE_INFO_DEFAULT_TABLES:
778 *((const uschar **)where) = (const uschar *)pcre_default_tables;
779 break;
780
781 default: return PCRE_ERROR_BADOPTION;
782 }
783
784 return 0;
785 }
786
787
788
789 /*************************************************
790 * Return info about what features are configured *
791 *************************************************/
792
793 /* This is function which has an extensible interface so that additional items
794 can be added compatibly.
795
796 Arguments:
797 what what information is required
798 where where to put the information
799
800 Returns: 0 if data returned, negative on error
801 */
802
803 EXPORT int
804 pcre_config(int what, void *where)
805 {
806 switch (what)
807 {
808 case PCRE_CONFIG_UTF8:
809 #ifdef SUPPORT_UTF8
810 *((int *)where) = 1;
811 #else
812 *((int *)where) = 0;
813 #endif
814 break;
815
816 case PCRE_CONFIG_UNICODE_PROPERTIES:
817 #ifdef SUPPORT_UCP
818 *((int *)where) = 1;
819 #else
820 *((int *)where) = 0;
821 #endif
822 break;
823
824 case PCRE_CONFIG_NEWLINE:
825 *((int *)where) = NEWLINE;
826 break;
827
828 case PCRE_CONFIG_LINK_SIZE:
829 *((int *)where) = LINK_SIZE;
830 break;
831
832 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
833 *((int *)where) = POSIX_MALLOC_THRESHOLD;
834 break;
835
836 case PCRE_CONFIG_MATCH_LIMIT:
837 *((unsigned int *)where) = MATCH_LIMIT;
838 break;
839
840 case PCRE_CONFIG_STACKRECURSE:
841 #ifdef NO_RECURSE
842 *((int *)where) = 0;
843 #else
844 *((int *)where) = 1;
845 #endif
846 break;
847
848 default: return PCRE_ERROR_BADOPTION;
849 }
850
851 return 0;
852 }
853
854
855
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format. Characters for which
isprint() is false are shown as \x followed by two hex digits. When printing
from the subject string, the output stops at the end of the subject even if
more characters were requested.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
int remaining = length;

/* md is consulted only when we are inside the subject. */

if (is_subject)
  {
  if (remaining > md->end_subject - p) remaining = md->end_subject - p;
  }

for (; remaining > 0; remaining--, p++)
  {
  const int ch = *p;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
882
883
884
885
886 /*************************************************
887 * Handle escapes *
888 *************************************************/
889
890 /* This function is called when a \ has been encountered. It either returns a
891 positive value for a simple escape such as \n, or a negative value which
892 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
893 a positive value greater than 255 may be returned. On entry, ptr is pointing at
894 the \. On exit, it is on the final character of the escape sequence.
895
896 Arguments:
897 ptrptr points to the pattern position pointer
898 errorptr points to the pointer to the error message
899 bracount number of previous extracting brackets
900 options the options bits
901 isclass TRUE if inside a character class
902
903 Returns: zero or positive => a data character
904 negative => a special escape sequence
905 on error, errorptr is set
906 */
907
908 static int
909 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
910 int options, BOOL isclass)
911 {
912 const uschar *ptr = *ptrptr;
913 int c, i;
914
915 /* If backslash is at the end of the pattern, it's an error. */
916
917 c = *(++ptr);
918 if (c == 0) *errorptr = ERR1;
919
920 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
921 a table. A non-zero result is something that can be returned immediately.
922 Otherwise further processing may be required. */
923
924 #if !EBCDIC /* ASCII coding */
925 else if (c < '0' || c > 'z') {} /* Not alphameric */
926 else if ((i = escapes[c - '0']) != 0) c = i;
927
928 #else /* EBCDIC coding */
929 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
930 else if ((i = escapes[c - 0x48]) != 0) c = i;
931 #endif
932
933 /* Escapes that need further processing, or are illegal. */
934
935 else
936 {
937 const uschar *oldptr;
938 switch (c)
939 {
940 /* A number of Perl escapes are not handled by PCRE. We give an explicit
941 error. */
942
943 case 'l':
944 case 'L':
945 case 'N':
946 case 'u':
947 case 'U':
948 *errorptr = ERR37;
949 break;
950
951 /* The handling of escape sequences consisting of a string of digits
952 starting with one that is not zero is not straightforward. By experiment,
953 the way Perl works seems to be as follows:
954
955 Outside a character class, the digits are read as a decimal number. If the
956 number is less than 10, or if there are that many previous extracting
957 left brackets, then it is a back reference. Otherwise, up to three octal
958 digits are read to form an escaped byte. Thus \123 is likely to be octal
959 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
960 value is greater than 377, the least significant 8 bits are taken. Inside a
961 character class, \ followed by a digit is always an octal number. */
962
963 case '1': case '2': case '3': case '4': case '5':
964 case '6': case '7': case '8': case '9':
965
966 if (!isclass)
967 {
968 oldptr = ptr;
969 c -= '0';
970 while ((digitab[ptr[1]] & ctype_digit) != 0)
971 c = c * 10 + *(++ptr) - '0';
972 if (c < 10 || c <= bracount)
973 {
974 c = -(ESC_REF + c);
975 break;
976 }
977 ptr = oldptr; /* Put the pointer back and fall through */
978 }
979
980 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
981 generates a binary zero byte and treats the digit as a following literal.
982 Thus we have to pull back the pointer by one. */
983
984 if ((c = *ptr) >= '8')
985 {
986 ptr--;
987 c = 0;
988 break;
989 }
990
991 /* \0 always starts an octal number, but we may drop through to here with a
992 larger first octal digit. */
993
994 case '0':
995 c -= '0';
996 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
997 c = c * 8 + *(++ptr) - '0';
998 c &= 255; /* Take least significant 8 bits */
999 break;
1000
1001 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1002 which can be greater than 0xff, but only if the ddd are hex digits. */
1003
1004 case 'x':
1005 #ifdef SUPPORT_UTF8
1006 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1007 {
1008 const uschar *pt = ptr + 2;
1009 register int count = 0;
1010 c = 0;
1011 while ((digitab[*pt] & ctype_xdigit) != 0)
1012 {
1013 int cc = *pt++;
1014 count++;
1015 #if !EBCDIC /* ASCII coding */
1016 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1017 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1018 #else /* EBCDIC coding */
1019 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
1020 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1021 #endif
1022 }
1023 if (*pt == '}')
1024 {
1025 if (c < 0 || count > 8) *errorptr = ERR34;
1026 ptr = pt;
1027 break;
1028 }
1029 /* If the sequence of hex digits does not end with '}', then we don't
1030 recognize this construct; fall through to the normal \x handling. */
1031 }
1032 #endif
1033
1034 /* Read just a single hex char */
1035
1036 c = 0;
1037 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1038 {
1039 int cc; /* Some compilers don't like ++ */
1040 cc = *(++ptr); /* in initializers */
1041 #if !EBCDIC /* ASCII coding */
1042 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1043 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1044 #else /* EBCDIC coding */
1045 if (cc <= 'z') cc += 64; /* Convert to upper case */
1046 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1047 #endif
1048 }
1049 break;
1050
1051 /* Other special escapes not starting with a digit are straightforward */
1052
1053 case 'c':
1054 c = *(++ptr);
1055 if (c == 0)
1056 {
1057 *errorptr = ERR2;
1058 return 0;
1059 }
1060
1061 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1062 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1063 (However, an EBCDIC equivalent has now been added.) */
1064
1065 #if !EBCDIC /* ASCII coding */
1066 if (c >= 'a' && c <= 'z') c -= 32;
1067 c ^= 0x40;
1068 #else /* EBCDIC coding */
1069 if (c >= 'a' && c <= 'z') c += 64;
1070 c ^= 0xC0;
1071 #endif
1072 break;
1073
1074 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1075 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1076 for Perl compatibility, it is a literal. This code looks a bit odd, but
1077 there used to be some cases other than the default, and there may be again
1078 in future, so I haven't "optimized" it. */
1079
1080 default:
1081 if ((options & PCRE_EXTRA) != 0) switch(c)
1082 {
1083 default:
1084 *errorptr = ERR3;
1085 break;
1086 }
1087 break;
1088 }
1089 }
1090
1091 *ptrptr = ptr;
1092 return c;
1093 }
1094
1095
1096
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either a single character, or up to three characters
enclosed in {}, optionally preceded by ^ for negation. The name is looked up
in the utt table by binary chop, so that table must be sorted by name.

Argument:
  ptrptr     points to the pattern position pointer
  negptr     points to a boolean that is set TRUE for negation else FALSE
               (left untouched if the pattern ends directly after the P or p)
  errorptr   points to the pointer to the error message

Returns:     value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
{
const uschar *ptr = *ptrptr;
char name[4];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The simple case: just one following character */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* Otherwise collect the name from inside the braces. At most three
characters are stored, which leaves room for the terminating zero. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  do
    {
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;
    if (ch == '}') break;
    name[len++] = ch;
    }
  while (len <= 2);

  /* No closing brace yet: the name is too long. Look ahead to distinguish
  an unknown (but well-formed) name from a missing terminator. */

  if (ch != '}')
    {
    do ptr++; while (*ptr != 0 && *ptr != '}');
    if (*ptr == '}') goto UNKNOWN; else goto MALFORMED;
    }

  name[len] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

lo = 0;
hi = sizeof(utt)/sizeof(ucp_type_table);

while (lo < hi)
  {
  const int mid = (lo + hi)/2;
  const int cmp = strcmp(name, utt[mid].name);
  if (cmp == 0) return utt[mid].value;
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

/* Not found: drop through to report an unknown property name */

UNKNOWN:
*errorptr = ERR47;
*ptrptr = ptr;
return -1;

MALFORMED:
*errorptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
1186
1187
1188
1189
1190 /*************************************************
1191 * Check for counted repeat *
1192 *************************************************/
1193
1194 /* This function is called when a '{' is encountered in a place where it might
1195 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1196 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1197 where the ddds are digits.
1198
1199 Arguments:
1200 p pointer to the first char after '{'
1201
1202 Returns: TRUE or FALSE
1203 */
1204
1205 static BOOL
1206 is_counted_repeat(const uschar *p)
1207 {
1208 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1209 while ((digitab[*p] & ctype_digit) != 0) p++;
1210 if (*p == '}') return TRUE;
1211
1212 if (*p++ != ',') return FALSE;
1213 if (*p == '}') return TRUE;
1214
1215 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1216 while ((digitab[*p] & ctype_digit) != 0) p++;
1217
1218 return (*p == '}');
1219 }
1220
1221
1222
1223 /*************************************************
1224 * Read repeat counts *
1225 *************************************************/
1226
1227 /* Read an item of the form {n,m} and return the values. This is called only
1228 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1229 so the syntax is guaranteed to be correct, but we need to check the values.
1230
1231 Arguments:
1232 p pointer to first char after '{'
1233 minp pointer to int for min
1234 maxp pointer to int for max
1235 returned as -1 if no max
1236 errorptr points to pointer to error message
1237
1238 Returns: pointer to '}' on success;
1239 current ptr on error, with errorptr set
1240 */
1241
1242 static const uschar *
1243 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1244 {
1245 int min = 0;
1246 int max = -1;
1247
1248 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1249
1250 if (*p == '}') max = min; else
1251 {
1252 if (*(++p) != '}')
1253 {
1254 max = 0;
1255 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1256 if (max < min)
1257 {
1258 *errorptr = ERR4;
1259 return p;
1260 }
1261 }
1262 }
1263
1264 /* Do paranoid checks, then fill in the required variables, and pass back the
1265 pointer to the terminating '}'. */
1266
1267 if (min > 65535 || max > 65535)
1268 *errorptr = ERR5;
1269 else
1270 {
1271 *minp = min;
1272 *maxp = max;
1273 }
1274 return p;
1275 }
1276
1277
1278
1279 /*************************************************
1280 * Find first significant op code *
1281 *************************************************/
1282
1283 /* This is called by several functions that scan a compiled expression looking
1284 for a fixed first character, or an anchoring op code etc. It skips over things
1285 that do not influence this. For some calls, a change of option is important.
1286 For some calls, it makes sense to skip negative forward and all backward
1287 assertions, and also the \b assertion; for others it does not.
1288
1289 Arguments:
1290 code pointer to the start of the group
1291 options pointer to external options
1292 optbit the option bit whose changing is significant, or
1293 zero if none are
1294 skipassert TRUE if certain assertions are to be skipped
1295
1296 Returns: pointer to the first significant opcode
1297 */
1298
1299 static const uschar*
1300 first_significant_code(const uschar *code, int *options, int optbit,
1301 BOOL skipassert)
1302 {
1303 for (;;)
1304 {
1305 switch ((int)*code)
1306 {
1307 case OP_OPT:
1308 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309 *options = (int)code[1];
1310 code += 2;
1311 break;
1312
1313 case OP_ASSERT_NOT:
1314 case OP_ASSERTBACK:
1315 case OP_ASSERTBACK_NOT:
1316 if (!skipassert) return code;
1317 do code += GET(code, 1); while (*code == OP_ALT);
1318 code += OP_lengths[*code];
1319 break;
1320
1321 case OP_WORD_BOUNDARY:
1322 case OP_NOT_WORD_BOUNDARY:
1323 if (!skipassert) return code;
1324 /* Fall through */
1325
1326 case OP_CALLOUT:
1327 case OP_CREF:
1328 case OP_BRANUMBER:
1329 code += OP_lengths[*code];
1330 break;
1331
1332 default:
1333 return code;
1334 }
1335 }
1336 /* Control never reaches here */
1337 }
1338
1339
1340
1341
1342 /*************************************************
1343 * Find the fixed length of a pattern *
1344 *************************************************/
1345
1346 /* Scan a pattern and compute the fixed length of subject that will match it,
1347 if the length is fixed. This is needed for dealing with backward assertions.
1348 In UTF8 mode, the result is in characters rather than bytes.
1349
1350 Arguments:
1351 code points to the start of the pattern (the bracket)
1352 options the compiling options
1353
1354 Returns: the fixed length, or -1 if there is no fixed length,
1355 or -2 if \C was encountered
1356 */
1357
1358 static int
1359 find_fixedlength(uschar *code, int options)
1360 {
1361 int length = -1;
1362
1363 register int branchlength = 0;
1364 register uschar *cc = code + 1 + LINK_SIZE;
1365
1366 /* Scan along the opcodes for this branch. If we get to the end of the
1367 branch, check the length against that of the other branches. */
1368
1369 for (;;)
1370 {
1371 int d;
1372 register int op = *cc;
1373 if (op >= OP_BRA) op = OP_BRA;
1374
1375 switch (op)
1376 {
1377 case OP_BRA:
1378 case OP_ONCE:
1379 case OP_COND:
1380 d = find_fixedlength(cc, options);
1381 if (d < 0) return d;
1382 branchlength += d;
1383 do cc += GET(cc, 1); while (*cc == OP_ALT);
1384 cc += 1 + LINK_SIZE;
1385 break;
1386
1387 /* Reached end of a branch; if it's a ket it is the end of a nested
1388 call. If it's ALT it is an alternation in a nested call. If it is
1389 END it's the end of the outer call. All can be handled by the same code. */
1390
1391 case OP_ALT:
1392 case OP_KET:
1393 case OP_KETRMAX:
1394 case OP_KETRMIN:
1395 case OP_END:
1396 if (length < 0) length = branchlength;
1397 else if (length != branchlength) return -1;
1398 if (*cc != OP_ALT) return length;
1399 cc += 1 + LINK_SIZE;
1400 branchlength = 0;
1401 break;
1402
1403 /* Skip over assertive subpatterns */
1404
1405 case OP_ASSERT:
1406 case OP_ASSERT_NOT:
1407 case OP_ASSERTBACK:
1408 case OP_ASSERTBACK_NOT:
1409 do cc += GET(cc, 1); while (*cc == OP_ALT);
1410 /* Fall through */
1411
1412 /* Skip over things that don't match chars */
1413
1414 case OP_REVERSE:
1415 case OP_BRANUMBER:
1416 case OP_CREF:
1417 case OP_OPT:
1418 case OP_CALLOUT:
1419 case OP_SOD:
1420 case OP_SOM:
1421 case OP_EOD:
1422 case OP_EODN:
1423 case OP_CIRC:
1424 case OP_DOLL:
1425 case OP_NOT_WORD_BOUNDARY:
1426 case OP_WORD_BOUNDARY:
1427 cc += OP_lengths[*cc];
1428 break;
1429
1430 /* Handle literal characters */
1431
1432 case OP_CHAR:
1433 case OP_CHARNC:
1434 branchlength++;
1435 cc += 2;
1436 #ifdef SUPPORT_UTF8
1437 if ((options & PCRE_UTF8) != 0)
1438 {
1439 while ((*cc & 0xc0) == 0x80) cc++;
1440 }
1441 #endif
1442 break;
1443
1444 /* Handle exact repetitions. The count is already in characters, but we
1445 need to skip over a multibyte character in UTF8 mode. */
1446
1447 case OP_EXACT:
1448 branchlength += GET2(cc,1);
1449 cc += 4;
1450 #ifdef SUPPORT_UTF8
1451 if ((options & PCRE_UTF8) != 0)
1452 {
1453 while((*cc & 0x80) == 0x80) cc++;
1454 }
1455 #endif
1456 break;
1457
1458 case OP_TYPEEXACT:
1459 branchlength += GET2(cc,1);
1460 cc += 4;
1461 break;
1462
1463 /* Handle single-char matchers */
1464
1465 case OP_PROP:
1466 case OP_NOTPROP:
1467 cc++;
1468 /* Fall through */
1469
1470 case OP_NOT_DIGIT:
1471 case OP_DIGIT:
1472 case OP_NOT_WHITESPACE:
1473 case OP_WHITESPACE:
1474 case OP_NOT_WORDCHAR:
1475 case OP_WORDCHAR:
1476 case OP_ANY:
1477 branchlength++;
1478 cc++;
1479 break;
1480
1481 /* The single-byte matcher isn't allowed */
1482
1483 case OP_ANYBYTE:
1484 return -2;
1485
1486 /* Check a class for variable quantification */
1487
1488 #ifdef SUPPORT_UTF8
1489 case OP_XCLASS:
1490 cc += GET(cc, 1) - 33;
1491 /* Fall through */
1492 #endif
1493
1494 case OP_CLASS:
1495 case OP_NCLASS:
1496 cc += 33;
1497
1498 switch (*cc)
1499 {
1500 case OP_CRSTAR:
1501 case OP_CRMINSTAR:
1502 case OP_CRQUERY:
1503 case OP_CRMINQUERY:
1504 return -1;
1505
1506 case OP_CRRANGE:
1507 case OP_CRMINRANGE:
1508 if (GET2(cc,1) != GET2(cc,3)) return -1;
1509 branchlength += GET2(cc,1);
1510 cc += 5;
1511 break;
1512
1513 default:
1514 branchlength++;
1515 }
1516 break;
1517
1518 /* Anything else is variable length */
1519
1520 default:
1521 return -1;
1522 }
1523 }
1524 /* Control never gets here */
1525 }
1526
1527
1528
1529
1530 /*************************************************
1531 * Scan compiled regex for numbered bracket *
1532 *************************************************/
1533
1534 /* This little function scans through a compiled pattern until it finds a
1535 capturing bracket with the given number.
1536
1537 Arguments:
1538 code points to start of expression
1539 utf8 TRUE in UTF-8 mode
1540 number the required bracket number
1541
1542 Returns: pointer to the opcode for the bracket, or NULL if not found
1543 */
1544
1545 static const uschar *
1546 find_bracket(const uschar *code, BOOL utf8, int number)
1547 {
1548 #ifndef SUPPORT_UTF8
1549 utf8 = utf8; /* Stop pedantic compilers complaining */
1550 #endif
1551
1552 for (;;)
1553 {
1554 register int c = *code;
1555 if (c == OP_END) return NULL;
1556 else if (c > OP_BRA)
1557 {
1558 int n = c - OP_BRA;
1559 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1560 if (n == number) return (uschar *)code;
1561 code += OP_lengths[OP_BRA];
1562 }
1563 else
1564 {
1565 code += OP_lengths[c];
1566
1567 #ifdef SUPPORT_UTF8
1568
1569 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1570 by a multi-byte character. The length in the table is a minimum, so we have
1571 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1572 can use relatively efficient code. */
1573
1574 if (utf8) switch(c)
1575 {
1576 case OP_CHAR:
1577 case OP_CHARNC:
1578 case OP_EXACT:
1579 case OP_UPTO:
1580 case OP_MINUPTO:
1581 case OP_STAR:
1582 case OP_MINSTAR:
1583 case OP_PLUS:
1584 case OP_MINPLUS:
1585 case OP_QUERY:
1586 case OP_MINQUERY:
1587 while ((*code & 0xc0) == 0x80) code++;
1588 break;
1589
1590 /* XCLASS is used for classes that cannot be represented just by a bit
1591 map. This includes negated single high-valued characters. The length in
1592 the table is zero; the actual length is stored in the compiled code. */
1593
1594 case OP_XCLASS:
1595 code += GET(code, 1) + 1;
1596 break;
1597 }
1598 #endif
1599 }
1600 }
1601 }
1602
1603
1604
1605 /*************************************************
1606 * Scan compiled regex for recursion reference *
1607 *************************************************/
1608
1609 /* This little function scans through a compiled pattern until it finds an
1610 instance of OP_RECURSE.
1611
1612 Arguments:
1613 code points to start of expression
1614 utf8 TRUE in UTF-8 mode
1615
1616 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1617 */
1618
1619 static const uschar *
1620 find_recurse(const uschar *code, BOOL utf8)
1621 {
1622 #ifndef SUPPORT_UTF8
1623 utf8 = utf8; /* Stop pedantic compilers complaining */
1624 #endif
1625
1626 for (;;)
1627 {
1628 register int c = *code;
1629 if (c == OP_END) return NULL;
1630 else if (c == OP_RECURSE) return code;
1631 else if (c > OP_BRA)
1632 {
1633 code += OP_lengths[OP_BRA];
1634 }
1635 else
1636 {
1637 code += OP_lengths[c];
1638
1639 #ifdef SUPPORT_UTF8
1640
1641 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1642 by a multi-byte character. The length in the table is a minimum, so we have
1643 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1644 can use relatively efficient code. */
1645
1646 if (utf8) switch(c)
1647 {
1648 case OP_CHAR:
1649 case OP_CHARNC:
1650 case OP_EXACT:
1651 case OP_UPTO:
1652 case OP_MINUPTO:
1653 case OP_STAR:
1654 case OP_MINSTAR:
1655 case OP_PLUS:
1656 case OP_MINPLUS:
1657 case OP_QUERY:
1658 case OP_MINQUERY:
1659 while ((*code & 0xc0) == 0x80) code++;
1660 break;
1661
1662 /* XCLASS is used for classes that cannot be represented just by a bit
1663 map. This includes negated single high-valued characters. The length in
1664 the table is zero; the actual length is stored in the compiled code. */
1665
1666 case OP_XCLASS:
1667 code += GET(code, 1) + 1;
1668 break;
1669 }
1670 #endif
1671 }
1672 }
1673 }
1674
1675
1676
1677 /*************************************************
1678 * Scan compiled branch for non-emptiness *
1679 *************************************************/
1680
1681 /* This function scans through a branch of a compiled pattern to see whether it
1682 can match the empty string or not. It is called only from could_be_empty()
1683 below. Note that first_significant_code() skips over assertions. If we hit an
1684 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1685 whose current branch will already have been scanned.
1686
1687 Arguments:
1688 code points to start of search
1689 endcode points to where to stop
1690 utf8 TRUE if in UTF8 mode
1691
1692 Returns: TRUE if what is matched could be empty
1693 */
1694
1695 static BOOL
1696 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1697 {
1698 register int c;
1699 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1700 code < endcode;
1701 code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1702 {
1703 const uschar *ccode;
1704
1705 c = *code;
1706
1707 if (c >= OP_BRA)
1708 {
1709 BOOL empty_branch;
1710 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1711
1712 /* Scan a closed bracket */
1713
1714 empty_branch = FALSE;
1715 do
1716 {
1717 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1718 empty_branch = TRUE;
1719 code += GET(code, 1);
1720 }
1721 while (*code == OP_ALT);
1722 if (!empty_branch) return FALSE; /* All branches are non-empty */
1723 code += 1 + LINK_SIZE;
1724 c = *code;
1725 }
1726
1727 else switch (c)
1728 {
1729 /* Check for quantifiers after a class */
1730
1731 #ifdef SUPPORT_UTF8
1732 case OP_XCLASS:
1733 ccode = code + GET(code, 1);
1734 goto CHECK_CLASS_REPEAT;
1735 #endif
1736
1737 case OP_CLASS:
1738 case OP_NCLASS:
1739 ccode = code + 33;
1740
1741 #ifdef SUPPORT_UTF8
1742 CHECK_CLASS_REPEAT:
1743 #endif
1744
1745 switch (*ccode)
1746 {
1747 case OP_CRSTAR: /* These could be empty; continue */
1748 case OP_CRMINSTAR:
1749 case OP_CRQUERY:
1750 case OP_CRMINQUERY:
1751 break;
1752
1753 default: /* Non-repeat => class must match */
1754 case OP_CRPLUS: /* These repeats aren't empty */
1755 case OP_CRMINPLUS:
1756 return FALSE;
1757
1758 case OP_CRRANGE:
1759 case OP_CRMINRANGE:
1760 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1761 break;
1762 }
1763 break;
1764
1765 /* Opcodes that must match a character */
1766
1767 case OP_PROP:
1768 case OP_NOTPROP:
1769 case OP_EXTUNI:
1770 case OP_NOT_DIGIT:
1771 case OP_DIGIT:
1772 case OP_NOT_WHITESPACE:
1773 case OP_WHITESPACE:
1774 case OP_NOT_WORDCHAR:
1775 case OP_WORDCHAR:
1776 case OP_ANY:
1777 case OP_ANYBYTE:
1778 case OP_CHAR:
1779 case OP_CHARNC:
1780 case OP_NOT:
1781 case OP_PLUS:
1782 case OP_MINPLUS:
1783 case OP_EXACT:
1784 case OP_NOTPLUS:
1785 case OP_NOTMINPLUS:
1786 case OP_NOTEXACT:
1787 case OP_TYPEPLUS:
1788 case OP_TYPEMINPLUS:
1789 case OP_TYPEEXACT:
1790 return FALSE;
1791
1792 /* End of branch */
1793
1794 case OP_KET:
1795 case OP_KETRMAX:
1796 case OP_KETRMIN:
1797 case OP_ALT:
1798 return TRUE;
1799
1800 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1801 followed by a multibyte character */
1802
1803 #ifdef SUPPORT_UTF8
1804 case OP_STAR:
1805 case OP_MINSTAR:
1806 case OP_QUERY:
1807 case OP_MINQUERY:
1808 case OP_UPTO:
1809 case OP_MINUPTO:
1810 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1811 break;
1812 #endif
1813 }
1814 }
1815
1816 return TRUE;
1817 }
1818
1819
1820
1821 /*************************************************
1822 * Scan compiled regex for non-emptiness *
1823 *************************************************/
1824
1825 /* This function is called to check for left recursive calls. We want to check
1826 the current branch of the current pattern to see if it could match the empty
1827 string. If it could, we must look outwards for branches at other levels,
1828 stopping when we pass beyond the bracket which is the subject of the recursion.
1829
1830 Arguments:
1831 code points to start of the recursion
1832 endcode points to where to stop (current RECURSE item)
1833 bcptr points to the chain of current (unclosed) branch starts
1834 utf8 TRUE if in UTF-8 mode
1835
1836 Returns: TRUE if what is matched could be empty
1837 */
1838
1839 static BOOL
1840 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1841 BOOL utf8)
1842 {
1843 while (bcptr != NULL && bcptr->current >= code)
1844 {
1845 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1846 bcptr = bcptr->outer;
1847 }
1848 return TRUE;
1849 }
1850
1851
1852
1853 /*************************************************
1854 * Check for POSIX class syntax *
1855 *************************************************/
1856
1857 /* This function is called when the sequence "[:" or "[." or "[=" is
1858 encountered in a character class. It checks whether this is followed by an
1859 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1860 ".]" or "=]".
1861
1862 Argument:
1863 ptr pointer to the initial [
1864 endptr where to return the end pointer
1865 cd pointer to compile data
1866
1867 Returns: TRUE or FALSE
1868 */
1869
1870 static BOOL
1871 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1872 {
1873 int terminator; /* Don't combine these lines; the Solaris cc */
1874 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1875 if (*(++ptr) == '^') ptr++;
1876 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1877 if (*ptr == terminator && ptr[1] == ']')
1878 {
1879 *endptr = ptr;
1880 return TRUE;
1881 }
1882 return FALSE;
1883 }
1884
1885
1886
1887
1888 /*************************************************
1889 * Check POSIX class name *
1890 *************************************************/
1891
1892 /* This function is called to check the name given in a POSIX-style class entry
1893 such as [:alnum:].
1894
1895 Arguments:
1896 ptr points to the first letter
1897 len the length of the name
1898
1899 Returns: a value representing the name, or -1 if unknown
1900 */
1901
1902 static int
1903 check_posix_name(const uschar *ptr, int len)
1904 {
1905 register int yield = 0;
1906 while (posix_name_lengths[yield] != 0)
1907 {
1908 if (len == posix_name_lengths[yield] &&
1909 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1910 yield++;
1911 }
1912 return -1;
1913 }
1914
1915
1916 /*************************************************
1917 * Adjust OP_RECURSE items in repeated group *
1918 *************************************************/
1919
1920 /* OP_RECURSE items contain an offset from the start of the regex to the group
1921 that is referenced. This means that groups can be replicated for fixed
1922 repetition simply by copying (because the recursion is allowed to refer to
1923 earlier groups that are outside the current group). However, when a group is
1924 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1925 it, after it has been compiled. This means that any OP_RECURSE items within it
1926 that refer to the group itself or any contained groups have to have their
1927 offsets adjusted. That is the job of this function. Before it is called, the
1928 partially compiled regex must be temporarily terminated with OP_END.
1929
1930 Arguments:
1931 group points to the start of the group
1932 adjust the amount by which the group is to be moved
1933 utf8 TRUE in UTF-8 mode
1934 cd contains pointers to tables etc.
1935
1936 Returns: nothing
1937 */
1938
1939 static void
1940 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1941 {
1942 uschar *ptr = group;
1943 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1944 {
1945 int offset = GET(ptr, 1);
1946 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1947 ptr += 1 + LINK_SIZE;
1948 }
1949 }
1950
1951
1952
1953 /*************************************************
1954 * Insert an automatic callout point *
1955 *************************************************/
1956
1957 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1958 callout points before each pattern item.
1959
1960 Arguments:
1961 code current code pointer
1962 ptr current pattern pointer
1963 cd pointers to tables etc
1964
1965 Returns: new code pointer
1966 */
1967
1968 static uschar *
1969 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1970 {
1971 *code++ = OP_CALLOUT;
1972 *code++ = 255;
1973 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1974 PUT(code, LINK_SIZE, 0); /* Default length */
1975 return code + 2*LINK_SIZE;
1976 }
1977
1978
1979
1980 /*************************************************
1981 * Complete a callout item *
1982 *************************************************/
1983
1984 /* A callout item contains the length of the next item in the pattern, which
1985 we can't fill in till after we have reached the relevant point. This is used
1986 for both automatic and manual callouts.
1987
1988 Arguments:
1989 previous_callout points to previous callout item
1990 ptr current pattern pointer
1991 cd pointers to tables etc
1992
1993 Returns: nothing
1994 */
1995
1996 static void
1997 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1998 {
1999 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2000 PUT(previous_callout, 2 + LINK_SIZE, length);
2001 }
2002
2003
2004
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int ch = *cptr;
int chartype, othercase, expected;

/* Move up to the first character that is a letter with an other case */

while (ch <= d &&
       (ucp_findchar(ch, &chartype, &othercase) != ucp_L || othercase == 0))
  ch++;

if (ch > d) return FALSE;

/* That character's other case starts the range. Extend the range for as long
as each following character is a letter whose other case is the next value
in sequence. */

*ocptr = othercase;
expected = othercase + 1;
ch++;

while (ch <= d &&
       ucp_findchar(ch, &chartype, &othercase) == ucp_L &&
       othercase == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;             /* Where the next call is to resume */

return TRUE;
}
#endif  /* SUPPORT_UCP */
2052
2053
2054 /*************************************************
2055 * Compile one branch *
2056 *************************************************/
2057
2058 /* Scan the pattern, compiling it into the code vector. If the options are
2059 changed during the branch, the pointer is used to change the external options
2060 bits.
2061
2062 Arguments:
2063 optionsptr pointer to the option bits
2064 brackets points to number of extracting brackets used
2065 codeptr points to the pointer to the current code point
2066 ptrptr points to the current pattern pointer
2067 errorptr points to pointer to error message
2068 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2069 reqbyteptr set to the last literal character required, else < 0
2070 bcptr points to current branch chain
2071 cd contains pointers to tables etc.
2072
2073 Returns: TRUE on success
2074 FALSE, with *errorptr set on error
2075 */
2076
2077 static BOOL
2078 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2079 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2080 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2081 {
2082 int repeat_type, op_type;
2083 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2084 int bravalue = 0;
2085 int greedy_default, greedy_non_default;
2086 int firstbyte, reqbyte;
2087 int zeroreqbyte, zerofirstbyte;
2088 int req_caseopt, reqvary, tempreqvary;
2089 int condcount = 0;
2090 int options = *optionsptr;
2091 int after_manual_callout = 0;
2092 register int c;
2093 register uschar *code = *codeptr;
2094 uschar *tempcode;
2095 BOOL inescq = FALSE;
2096 BOOL groupsetfirstbyte = FALSE;
2097 const uschar *ptr = *ptrptr;
2098 const uschar *tempptr;
2099 uschar *previous = NULL;
2100 uschar *previous_callout = NULL;
2101 uschar classbits[32];
2102
2103 #ifdef SUPPORT_UTF8
2104 BOOL class_utf8;
2105 BOOL utf8 = (options & PCRE_UTF8) != 0;
2106 uschar *class_utf8data;
2107 uschar utf8_char[6];
2108 #else
2109 BOOL utf8 = FALSE;
2110 #endif
2111
2112 /* Set up the default and non-default settings for greediness */
2113
2114 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2115 greedy_non_default = greedy_default ^ 1;
2116
2117 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2118 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2119 matches a non-fixed char first char; reqbyte just remains unset if we never
2120 find one.
2121
2122 When we hit a repeat whose minimum is zero, we may have to adjust these values
2123 to take the zero repeat into account. This is implemented by setting them to
2124 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2125 item types that can be repeated set these backoff variables appropriately. */
2126
2127 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2128
2129 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2130 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2131 value > 255. It is added into the firstbyte or reqbyte variables to record the
2132 case status of the value. This is used only for ASCII characters. */
2133
2134 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2135
2136 /* Switch on next character until the end of the branch */
2137
2138 for (;; ptr++)
2139 {
2140 BOOL negate_class;
2141 BOOL possessive_quantifier;
2142 BOOL is_quantifier;
2143 int class_charcount;
2144 int class_lastchar;
2145 int newoptions;
2146 int recno;
2147 int skipbytes;
2148 int subreqbyte;
2149 int subfirstbyte;
2150 int mclength;
2151 uschar mcbuffer[8];
2152
2153 /* Next byte in the pattern */
2154
2155 c = *ptr;
2156
2157 /* If in \Q...\E, check for the end; if not, we have a literal */
2158
2159 if (inescq && c != 0)
2160 {
2161 if (c == '\\' && ptr[1] == 'E')
2162 {
2163 inescq = FALSE;
2164 ptr++;
2165 continue;
2166 }
2167 else
2168 {
2169 if (previous_callout != NULL)
2170 {
2171 complete_callout(previous_callout, ptr, cd);
2172 previous_callout = NULL;
2173 }
2174 if ((options & PCRE_AUTO_CALLOUT) != 0)
2175 {
2176 previous_callout = code;
2177 code = auto_callout(code, ptr, cd);
2178 }
2179 goto NORMAL_CHAR;
2180 }
2181 }
2182
2183 /* Fill in length of a previous callout, except when the next thing is
2184 a quantifier. */
2185
2186 is_quantifier = c == '*' || c == '+' || c == '?' ||
2187 (c == '{' && is_counted_repeat(ptr+1));
2188
2189 if (!is_quantifier && previous_callout != NULL &&
2190 after_manual_callout-- <= 0)
2191 {
2192 complete_callout(previous_callout, ptr, cd);
2193 previous_callout = NULL;
2194 }
2195
2196 /* In extended mode, skip white space and comments */
2197
2198 if ((options & PCRE_EXTENDED) != 0)
2199 {
2200 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2201 if (c == '#')
2202 {
2203 /* The space before the ; is to avoid a warning on a silly compiler
2204 on the Macintosh. */
2205 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2206 if (c != 0) continue; /* Else fall through to handle end of string */
2207 }
2208 }
2209
2210 /* No auto callout for quantifiers. */
2211
2212 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2213 {
2214 previous_callout = code;
2215 code = auto_callout(code, ptr, cd);
2216 }
2217
2218 switch(c)
2219 {
2220 /* The branch terminates at end of string, |, or ). */
2221
2222 case 0:
2223 case '|':
2224 case ')':
2225 *firstbyteptr = firstbyte;
2226 *reqbyteptr = reqbyte;
2227 *codeptr = code;
2228 *ptrptr = ptr;
2229 return TRUE;
2230
2231 /* Handle single-character metacharacters. In multiline mode, ^ disables
2232 the setting of any following char as a first character. */
2233
2234 case '^':
2235 if ((options & PCRE_MULTILINE) != 0)
2236 {
2237 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2238 }
2239 previous = NULL;
2240 *code++ = OP_CIRC;
2241 break;
2242
2243 case '$':
2244 previous = NULL;
2245 *code++ = OP_DOLL;
2246 break;
2247
2248 /* There can never be a first char if '.' is first, whatever happens about
2249 repeats. The value of reqbyte doesn't change either. */
2250
2251 case '.':
2252 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2253 zerofirstbyte = firstbyte;
2254 zeroreqbyte = reqbyte;
2255 previous = code;
2256 *code++ = OP_ANY;
2257 break;
2258
2259 /* Character classes. If the included characters are all < 256 in value, we
2260 build a 32-byte bitmap of the permitted characters, except in the special
2261 case where there is only one such character. For negated classes, we build
2262 the map as usual, then invert it at the end. However, we use a different
2263 opcode so that data characters > 255 can be handled correctly.
2264
2265 If the class contains characters outside the 0-255 range, a different
2266 opcode is compiled. It may optionally have a bit map for characters < 256,
2267 but those above are explicitly listed afterwards. A flag byte tells
2268 whether the bitmap is present, and whether this is a negated class or not.
2269 */
2270
2271 case '[':
2272 previous = code;
2273
2274 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2275 they are encountered at the top level, so we'll do that too. */
2276
2277 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2278 check_posix_syntax(ptr, &tempptr, cd))
2279 {
2280 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2281 goto FAILED;
2282 }
2283
2284 /* If the first character is '^', set the negation flag and skip it. */
2285
2286 if ((c = *(++ptr)) == '^')
2287 {
2288 negate_class = TRUE;
2289 c = *(++ptr);
2290 }
2291 else
2292 {
2293 negate_class = FALSE;
2294 }
2295
2296 /* Keep a count of chars with values < 256 so that we can optimize the case
2297 of just a single character (as long as it's < 256). For higher valued UTF-8
2298 characters, we don't yet do any optimization. */
2299
2300 class_charcount = 0;
2301 class_lastchar = -1;
2302
2303 #ifdef SUPPORT_UTF8
2304 class_utf8 = FALSE; /* No chars >= 256 */
2305 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
2306 #endif
2307
2308 /* Initialize the 32-char bit map to all zeros. We have to build the
2309 map in a temporary bit of store, in case the class contains only 1
2310 character (< 256), because in that case the compiled code doesn't use the
2311 bit map. */
2312
2313 memset(classbits, 0, 32 * sizeof(uschar));
2314
2315 /* Process characters until ] is reached. By writing this as a "do" it
2316 means that an initial ] is taken as a data character. The first pass
2317 through the regex checked the overall syntax, so we don't need to be very
2318 strict here. At the start of the loop, c contains the first byte of the
2319 character. */
2320
2321 do
2322 {
2323 #ifdef SUPPORT_UTF8
2324 if (utf8 && c > 127)
2325 { /* Braces are required because the */
2326 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2327 }
2328 #endif
2329
2330 /* Inside \Q...\E everything is literal except \E */
2331
2332 if (inescq)
2333 {
2334 if (c == '\\' && ptr[1] == 'E')
2335 {
2336 inescq = FALSE;
2337 ptr++;
2338 continue;
2339 }
2340 else goto LONE_SINGLE_CHARACTER;
2341 }
2342
2343 /* Handle POSIX class names. Perl allows a negation extension of the
2344 form [:^name:]. A square bracket that doesn't match the syntax is
2345 treated as a literal. We also recognize the POSIX constructions
2346 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2347 5.6 and 5.8 do. */
2348
2349 if (c == '[' &&
2350 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2351 check_posix_syntax(ptr, &tempptr, cd))
2352 {
2353 BOOL local_negate = FALSE;
2354 int posix_class, i;
2355 register const uschar *cbits = cd->cbits;
2356
2357 if (ptr[1] != ':')
2358 {
2359 *errorptr = ERR31;
2360 goto FAILED;
2361 }
2362
2363 ptr += 2;
2364 if (*ptr == '^')
2365 {
2366 local_negate = TRUE;
2367 ptr++;
2368 }
2369
2370 posix_class = check_posix_name(ptr, tempptr - ptr);
2371 if (posix_class < 0)
2372 {
2373 *errorptr = ERR30;
2374 goto FAILED;
2375 }
2376
2377 /* If matching is caseless, upper and lower are converted to
2378 alpha. This relies on the fact that the class table starts with
2379 alpha, lower, upper as the first 3 entries. */
2380
2381 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2382 posix_class = 0;
2383
2384 /* Or into the map we are building up to 3 of the static class
2385 tables, or their negations. The [:blank:] class sets up the same
2386 chars as the [:space:] class (all white space). We remove the vertical
2387 white space chars afterwards. */
2388
2389 posix_class *= 3;
2390 for (i = 0; i < 3; i++)
2391 {
2392 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2393 int taboffset = posix_class_maps[posix_class + i];
2394 if (taboffset < 0) break;
2395 if (local_negate)
2396 {
2397 if (i == 0)
2398 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2399 else
2400 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2401 if (blankclass) classbits[1] |= 0x3c;
2402 }
2403 else
2404 {
2405 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2406 if (blankclass) classbits[1] &= ~0x3c;
2407 }
2408 }
2409
2410 ptr = tempptr + 1;
2411 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2412 continue; /* End of POSIX syntax handling */
2413 }
2414
2415 /* Backslash may introduce a single character, or it may introduce one
2416 of the specials, which just set a flag. Escaped items are checked for
2417 validity in the pre-compiling pass. The sequence \b is a special case.
2418 Inside a class (and only there) it is treated as backspace. Elsewhere
2419 it marks a word boundary. Other escapes have preset maps ready to
2420 or into the one we are building. We assume they have more than one
2421 character in them, so set class_charcount bigger than one. */
2422
2423 if (c == '\\')
2424 {
2425 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2426
2427 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2428 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2429 else if (-c == ESC_Q) /* Handle start of quoted string */
2430 {
2431 if (ptr[1] == '\\' && ptr[2] == 'E')
2432 {
2433 ptr += 2; /* avoid empty string */
2434 }
2435 else inescq = TRUE;
2436 continue;
2437 }
2438
2439 if (c < 0)
2440 {
2441 register const uschar *cbits = cd->cbits;
2442 class_charcount += 2; /* Greater than 1 is what matters */
2443 switch (-c)
2444 {
2445 case ESC_d:
2446 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2447 continue;
2448
2449 case ESC_D:
2450 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2451 continue;
2452
2453 case ESC_w:
2454 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2455 continue;
2456
2457 case ESC_W:
2458 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2459 continue;
2460
2461 case ESC_s:
2462 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2463 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2464 continue;
2465
2466 case ESC_S:
2467 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2468 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2469 continue;
2470
2471 #ifdef SUPPORT_UCP
2472 case ESC_p:
2473 case ESC_P:
2474 {
2475 BOOL negated;
2476 int property = get_ucp(&ptr, &negated, errorptr);
2477 if (property < 0) goto FAILED;
2478 class_utf8 = TRUE;
2479 *class_utf8data++ = ((-c == ESC_p) != negated)?
2480 XCL_PROP : XCL_NOTPROP;
2481 *class_utf8data++ = property;
2482 class_charcount -= 2; /* Not a < 256 character */
2483 }
2484 continue;
2485 #endif
2486
2487 /* Unrecognized escapes are faulted if PCRE is running in its
2488 strict mode. By default, for compatibility with Perl, they are
2489 treated as literals. */
2490
2491 default:
2492 if ((options & PCRE_EXTRA) != 0)
2493 {
2494 *errorptr = ERR7;
2495 goto FAILED;
2496 }
2497 c = *ptr; /* The final character */
2498 class_charcount -= 2; /* Undo the default count from above */
2499 }
2500 }
2501
2502 /* Fall through if we have a single character (c >= 0). This may be
2503 > 256 in UTF-8 mode. */
2504
2505 } /* End of backslash handling */
2506
2507 /* A single character may be followed by '-' to form a range. However,
2508 Perl does not permit ']' to be the end of the range. A '-' character
2509 here is treated as a literal. */
2510
2511 if (ptr[1] == '-' && ptr[2] != ']')
2512 {
2513 int d;
2514 ptr += 2;
2515
2516 #ifdef SUPPORT_UTF8
2517 if (utf8)
2518 { /* Braces are required because the */
2519 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2520 }
2521 else
2522 #endif
2523 d = *ptr; /* Not UTF-8 mode */
2524
2525 /* The second part of a range can be a single-character escape, but
2526 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2527 in such circumstances. */
2528
2529 if (d == '\\')
2530 {
2531 const uschar *oldptr = ptr;
2532 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2533
2534 /* \b is backspace; \X is literal X; any other special means the '-'
2535 was literal */
2536
2537 if (d < 0)
2538 {
2539 if (d == -ESC_b) d = '\b';
2540 else if (d == -ESC_X) d = 'X'; else
2541 {
2542 ptr = oldptr - 2;
2543 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2544 }
2545 }
2546 }
2547
2548 /* The check that the two values are in the correct order happens in
2549 the pre-pass. Optimize one-character ranges */
2550
2551 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2552
2553 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2554 matching, we have to use an XCLASS with extra data items. Caseless
2555 matching for characters > 127 is available only if UCP support is
2556 available. */
2557
2558 #ifdef SUPPORT_UTF8
2559 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2560 {
2561 class_utf8 = TRUE;
2562
2563 /* With UCP support, we can find the other case equivalents of
2564 the relevant characters. There may be several ranges. Optimize how
2565 they fit with the basic range. */
2566
2567 #ifdef SUPPORT_UCP
2568 if ((options & PCRE_CASELESS) != 0)
2569 {
2570 int occ, ocd;
2571 int cc = c;
2572 int origd = d;
2573 while (get_othercase_range(&cc, origd, &occ, &ocd))
2574 {
2575 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2576
2577 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2578 { /* if there is overlap, */
2579 c = occ; /* noting that if occ < c */
2580 continue; /* we can't have ocd > d */
2581 } /* because a subrange is */
2582 if (ocd > d && occ <= d + 1) /* always shorter than */
2583 { /* the basic range. */
2584 d = ocd;
2585 continue;
2586 }
2587
2588 if (occ == ocd)
2589 {
2590 *class_utf8data++ = XCL_SINGLE;
2591 }
2592 else
2593 {
2594 *class_utf8data++ = XCL_RANGE;
2595 class_utf8data += ord2utf8(occ, class_utf8data);
2596 }
2597 class_utf8data += ord2utf8(ocd, class_utf8data);
2598 }
2599 }
2600 #endif /* SUPPORT_UCP */
2601
2602 /* Now record the original range, possibly modified for UCP caseless
2603 overlapping ranges. */
2604
2605 *class_utf8data++ = XCL_RANGE;
2606 class_utf8data += ord2utf8(c, class_utf8data);
2607 class_utf8data += ord2utf8(d, class_utf8data);
2608
2609 /* With UCP support, we are done. Without UCP support, there is no
2610 caseless matching for UTF-8 characters > 127; we can use the bit map
2611 for the smaller ones. */
2612
2613 #ifdef SUPPORT_UCP
2614 continue; /* With next character in the class */
2615 #else
2616 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2617
2618 /* Adjust upper limit and fall through to set up the map */
2619
2620 d = 127;
2621
2622 #endif /* SUPPORT_UCP */
2623 }
2624 #endif /* SUPPORT_UTF8 */
2625
2626 /* We use the bit map for all cases when not in UTF-8 mode; else
2627 ranges that lie entirely within 0-127 when there is UCP support; else
2628 for partial ranges without UCP support. */
2629
2630 for (; c <= d; c++)
2631 {
2632 classbits[c/8] |= (1 << (c&7));
2633 if ((options & PCRE_CASELESS) != 0)
2634 {
2635 int uc = cd->fcc[c]; /* flip case */
2636 classbits[uc/8] |= (1 << (uc&7));
2637 }
2638 class_charcount++; /* in case a one-char range */
2639 class_lastchar = c;
2640 }
2641
2642 continue; /* Go get the next char in the class */
2643 }
2644
2645 /* Handle a lone single character - we can get here for a normal
2646 non-escape char, or after \ that introduces a single character or for an
2647 apparent range that isn't. */
2648
2649 LONE_SINGLE_CHARACTER:
2650
2651 /* Handle a character that cannot go in the bit map */
2652
2653 #ifdef SUPPORT_UTF8
2654 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2655 {
2656 class_utf8 = TRUE;
2657 *class_utf8data++ = XCL_SINGLE;
2658 class_utf8data += ord2utf8(c, class_utf8data);
2659
2660 #ifdef SUPPORT_UCP
2661 if ((options & PCRE_CASELESS) != 0)
2662 {
2663 int chartype;
2664 int othercase;
2665 if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2666 {
2667 *class_utf8data++ = XCL_SINGLE;
2668 class_utf8data += ord2utf8(othercase, class_utf8data);
2669 }
2670 }
2671 #endif /* SUPPORT_UCP */
2672
2673 }
2674 else
2675 #endif /* SUPPORT_UTF8 */
2676
2677 /* Handle a single-byte character */
2678 {
2679 classbits[c/8] |= (1 << (c&7));
2680 if ((options & PCRE_CASELESS) != 0)
2681 {
2682 c = cd->fcc[c]; /* flip case */
2683 classbits[c/8] |= (1 << (c&7));
2684 }
2685 class_charcount++;
2686 class_lastchar = c;
2687 }
2688 }
2689
2690 /* Loop until ']' reached; the check for end of string happens inside the
2691 loop. This "while" is the end of the "do" above. */
2692
2693 while ((c = *(++ptr)) != ']' || inescq);
2694
2695 /* If class_charcount is 1, we saw precisely one character whose value is
2696 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2697 can optimize the negative case only if there were no characters >= 128
2698 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2699 single-bytes only. This is an historical hangover. Maybe one day we can
2700 tidy these opcodes to handle multi-byte characters.
2701
2702 The optimization throws away the bit map. We turn the item into a
2703 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2704 that OP_NOT does not support multibyte characters. In the positive case, it
2705 can cause firstbyte to be set. Otherwise, there can be no first char if
2706 this item is first, whatever repeat count may follow. In the case of
2707 reqbyte, save the previous value for reinstating. */
2708
2709 #ifdef SUPPORT_UTF8
2710 if (class_charcount == 1 &&
2711 (!utf8 ||
2712 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2713
2714 #else
2715 if (class_charcount == 1)
2716 #endif
2717 {
2718 zeroreqbyte = reqbyte;
2719
2720 /* The OP_NOT opcode works on one-byte characters only. */
2721
2722 if (negate_class)
2723 {
2724 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2725 zerofirstbyte = firstbyte;
2726 *code++ = OP_NOT;
2727 *code++ = class_lastchar;
2728 break;
2729 }
2730
2731 /* For a single, positive character, get the value into mcbuffer, and
2732 then we can handle this with the normal one-character code. */
2733
2734 #ifdef SUPPORT_UTF8
2735 if (utf8 && class_lastchar > 127)
2736 mclength = ord2utf8(class_lastchar, mcbuffer);
2737 else
2738 #endif
2739 {
2740 mcbuffer[0] = class_lastchar;
2741 mclength = 1;
2742 }
2743 goto ONE_CHAR;
2744 } /* End of 1-char optimization */
2745
2746 /* The general case - not the one-char optimization. If this is the first
2747 thing in the branch, there can be no first char setting, whatever the
2748 repeat count. Any reqbyte setting must remain unchanged after any kind of
2749 repeat. */
2750
2751 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2752 zerofirstbyte = firstbyte;
2753 zeroreqbyte = reqbyte;
2754
2755 /* If there are characters with values > 255, we have to compile an
2756 extended class, with its own opcode. If there are no characters < 256,
2757 we can omit the bitmap. */
2758
2759 #ifdef SUPPORT_UTF8
2760 if (class_utf8)
2761 {
2762 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2763 *code++ = OP_XCLASS;
2764 code += LINK_SIZE;
2765 *code = negate_class? XCL_NOT : 0;
2766
2767 /* If the map is required, install it, and move on to the end of
2768 the extra data */
2769
2770 if (class_charcount > 0)
2771 {
2772 *code++ |= XCL_MAP;
2773 memcpy(code, classbits, 32);
2774 code = class_utf8data;
2775 }
2776
2777 /* If the map is not required, slide down the extra data. */
2778
2779 else
2780 {
2781 int len = class_utf8data - (code + 33);
2782 memmove(code + 1, code + 33, len);
2783 code += len + 1;
2784 }
2785
2786 /* Now fill in the complete length of the item */
2787
2788 PUT(previous, 1, code - previous);
2789 break; /* End of class handling */
2790 }
2791 #endif
2792
2793 /* If there are no characters > 255, negate the 32-byte map if necessary,
2794 and copy it into the code vector. If this is the first thing in the branch,
2795 there can be no first char setting, whatever the repeat count. Any reqbyte
2796 setting must remain unchanged after any kind of repeat. */
2797
2798 if (negate_class)
2799 {
2800 *code++ = OP_NCLASS;
2801 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2802 }
2803 else
2804 {
2805 *code++ = OP_CLASS;
2806 memcpy(code, classbits, 32);
2807 }
2808 code += 32;
2809 break;
2810
2811 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2812 has been tested above. */
2813
2814 case '{':
2815 if (!is_quantifier) goto NORMAL_CHAR;
2816 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2817 if (*errorptr != NULL) goto FAILED;
2818 goto REPEAT;
2819
2820 case '*':
2821 repeat_min = 0;
2822 repeat_max = -1;
2823 goto REPEAT;
2824
2825 case '+':
2826 repeat_min = 1;
2827 repeat_max = -1;
2828 goto REPEAT;
2829
2830 case '?':
2831 repeat_min = 0;
2832 repeat_max = 1;
2833
2834 REPEAT:
2835 if (previous == NULL)
2836 {
2837 *errorptr = ERR9;
2838 goto FAILED;
2839 }
2840
2841 if (repeat_min == 0)
2842 {
2843 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2844 reqbyte = zeroreqbyte; /* Ditto */
2845 }
2846
2847 /* Remember whether this is a variable length repeat */
2848
2849 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2850
2851 op_type = 0; /* Default single-char op codes */
2852 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2853
2854 /* Save start of previous item, in case we have to move it up to make space
2855 for an inserted OP_ONCE for the additional '+' extension. */
2856
2857 tempcode = previous;
2858
2859 /* If the next character is '+', we have a possessive quantifier. This
2860 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2861 If the next character is '?' this is a minimizing repeat, by default,
2862 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2863 repeat type to the non-default. */
2864
2865 if (ptr[1] == '+')
2866 {
2867 repeat_type = 0; /* Force greedy */
2868 possessive_quantifier = TRUE;
2869 ptr++;
2870 }
2871 else if (ptr[1] == '?')
2872 {
2873 repeat_type = greedy_non_default;
2874 ptr++;
2875 }
2876 else repeat_type = greedy_default;
2877
2878 /* If previous was a recursion, we need to wrap it inside brackets so that
2879 it can be replicated if necessary. */
2880
2881 if (*previous == OP_RECURSE)
2882 {
2883 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2884 code += 1 + LINK_SIZE;
2885 *previous = OP_BRA;
2886 PUT(previous, 1, code - previous);
2887 *code = OP_KET;
2888 PUT(code, 1, code - previous);
2889 code += 1 + LINK_SIZE;
2890 }
2891
2892 /* If previous was a character match, abolish the item and generate a
2893 repeat item instead. If a char item has a minimum of more than one, ensure
2894 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2895 the first thing in a branch because the x will have gone into firstbyte
2896 instead. */
2897
2898 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2899 {
2900 /* Deal with UTF-8 characters that take up more than one byte. It's
2901 easier to write this out separately than try to macrify it. Use c to
2902 hold the length of the character in bytes, plus 0x80 to flag that it's a
2903 length rather than a small character. */
2904
2905 #ifdef SUPPORT_UTF8
2906 if (utf8 && (code[-1] & 0x80) != 0)
2907 {
2908 uschar *lastchar = code - 1;
2909 while((*lastchar & 0xc0) == 0x80) lastchar--;
2910 c = code - lastchar; /* Length of UTF-8 character */
2911 memcpy(utf8_char, lastchar, c); /* Save the char */
2912 c |= 0x80; /* Flag c as a length */
2913 }
2914 else
2915 #endif
2916
2917 /* Handle the case of a single byte - either with no UTF8 support, or
2918 with UTF-8 disabled, or for a UTF-8 character < 128. */
2919
2920 {
2921 c = code[-1];
2922 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2923 }
2924
2925 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2926 }
2927
2928 /* If previous was a single negated character ([^a] or similar), we use
2929 one of the special opcodes, replacing it. The code is shared with single-
2930 character repeats by setting op_type to add a suitable offset into
2931 repeat_type. OP_NOT is currently used only for single-byte chars. */
2932
2933 else if (*previous == OP_NOT)
2934 {
2935 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2936 c = previous[1];
2937 goto OUTPUT_SINGLE_REPEAT;
2938 }
2939
2940 /* If previous was a character type match (\d or similar), abolish it and
2941 create a suitable repeat item. The code is shared with single-character
2942 repeats by setting op_type to add a suitable offset into repeat_type. Note
2943 that the Unicode property types will be present only when SUPPORT_UCP is
2944 defined, but we don't wrap the little bits of code here because it just
2945 makes it horribly messy. */
2946
2947 else if (*previous < OP_EODN)
2948 {
2949 uschar *oldcode;
2950 int prop_type;
2951 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2952 c = *previous;
2953
2954 OUTPUT_SINGLE_REPEAT:
2955 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2956 previous[1] : -1;
2957
2958 oldcode = code;
2959 code = previous; /* Usually overwrite previous item */
2960
2961 /* If the maximum is zero then the minimum must also be zero; Perl allows
2962 this case, so we do too - by simply omitting the item altogether. */
2963
2964 if (repeat_max == 0) goto END_REPEAT;
2965
2966 /* All real repeats make it impossible to handle partial matching (maybe
2967 one day we will be able to remove this restriction). */
2968
2969 if (repeat_max != 1) cd->nopartial = TRUE;
2970
2971 /* Combine the op_type with the repeat_type */
2972
2973 repeat_type += op_type;
2974
2975 /* A minimum of zero is handled either as the special case * or ?, or as
2976 an UPTO, with the maximum given. */
2977
2978 if (repeat_min == 0)
2979 {
2980 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2981 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2982 else
2983 {
2984 *code++ = OP_UPTO + repeat_type;
2985 PUT2INC(code, 0, repeat_max);
2986 }
2987 }
2988
2989 /* A repeat minimum of 1 is optimized into some special cases. If the
2990 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2991 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2992 one less than the maximum. */
2993
2994 else if (repeat_min == 1)
2995 {
2996 if (repeat_max == -1)
2997 *code++ = OP_PLUS + repeat_type;
2998 else
2999 {
3000 code = oldcode; /* leave previous item in place */
3001 if (repeat_max == 1) goto END_REPEAT;
3002 *code++ = OP_UPTO + repeat_type;
3003 PUT2INC(code, 0, repeat_max - 1);
3004 }
3005 }
3006
3007 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3008 handled as an EXACT followed by an UPTO. */
3009
3010 else
3011 {
3012 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3013 PUT2INC(code, 0, repeat_min);
3014
3015 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3016 we have to insert the character for the previous code. For a repeated
3017 Unicode property match, there is an extra byte that defines the
3018 required property. In UTF-8 mode, long characters have their length in
3019 c, with the 0x80 bit as a flag. */
3020
3021 if (repeat_max < 0)
3022 {
3023 #ifdef SUPPORT_UTF8
3024 if (utf8 && c >= 128)
3025 {
3026 memcpy(code, utf8_char, c & 7);
3027 code += c & 7;
3028 }
3029 else
3030 #endif
3031 {
3032 *code++ = c;
3033 if (prop_type >= 0) *code++ = prop_type;
3034 }
3035 *code++ = OP_STAR + repeat_type;
3036 }
3037
3038 /* Else insert an UPTO if the max is greater than the min, again
3039 preceded by the character, for the previously inserted code. */
3040
3041 else if (repeat_max != repeat_min)
3042 {
3043 #ifdef SUPPORT_UTF8
3044 if (utf8 && c >= 128)
3045 {
3046 memcpy(code, utf8_char, c & 7);
3047 code += c & 7;
3048 }
3049 else
3050 #endif
3051 *code++ = c;
3052 if (prop_type >= 0) *code++ = prop_type;
3053 repeat_max -= repeat_min;
3054 *code++ = OP_UPTO + repeat_type;
3055 PUT2INC(code, 0, repeat_max);
3056 }
3057 }
3058
3059 /* The character or character type itself comes last in all cases. */
3060
3061 #ifdef SUPPORT_UTF8
3062 if (utf8 && c >= 128)
3063 {
3064 memcpy(code, utf8_char, c & 7);
3065 code += c & 7;
3066 }
3067 else
3068 #endif
3069 *code++ = c;
3070
3071 /* For a repeated Unicode property match, there is an extra byte that
3072 defines the required property. */
3073
3074 #ifdef SUPPORT_UCP
3075 if (prop_type >= 0) *code++ = prop_type;
3076 #endif
3077 }
3078
3079 /* If previous was a character class or a back reference, we put the repeat
3080 stuff after it, but just skip the item if the repeat was {0,0}. */
3081
3082 else if (*previous == OP_CLASS ||
3083 *previous == OP_NCLASS ||
3084 #ifdef SUPPORT_UTF8
3085 *previous == OP_XCLASS ||
3086 #endif
3087 *previous == OP_REF)
3088 {
3089 if (repeat_max == 0)
3090 {
3091 code = previous;
3092 goto END_REPEAT;
3093 }
3094
3095 /* All real repeats make it impossible to handle partial matching (maybe
3096 one day we will be able to remove this restriction). */
3097
3098 if (repeat_max != 1) cd->nopartial = TRUE;
3099
3100 if (repeat_min == 0 && repeat_max == -1)
3101 *code++ = OP_CRSTAR + repeat_type;
3102 else if (repeat_min == 1 && repeat_max == -1)
3103 *code++ = OP_CRPLUS + repeat_type;
3104 else if (repeat_min == 0 && repeat_max == 1)
3105 *code++ = OP_CRQUERY + repeat_type;
3106 else
3107 {
3108 *code++ = OP_CRRANGE + repeat_type;
3109 PUT2INC(code, 0, repeat_min);
3110 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3111 PUT2INC(code, 0, repeat_max);
3112 }
3113 }
3114
3115 /* If previous was a bracket group, we may have to replicate it in certain
3116 cases. */
3117
3118 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3119 *previous == OP_COND)
3120 {
3121 register int i;
3122 int ketoffset = 0;
3123 int len = code - previous;
3124 uschar *bralink = NULL;
3125
3126 /* If the maximum repeat count is unlimited, find the end of the bracket
3127 by scanning through from the start, and compute the offset back to it
3128 from the current code pointer. There may be an OP_OPT setting following
3129 the final KET, so we can't find the end just by going back from the code
3130 pointer. */
3131
3132 if (repeat_max == -1)
3133 {
3134 register uschar *ket = previous;
3135 do ket += GET(ket, 1); while (*ket != OP_KET);
3136 ketoffset = code - ket;
3137 }
3138
3139 /* The case of a zero minimum is special because of the need to stick
3140 OP_BRAZERO in front of it, and because the group appears once in the
3141 data, whereas in other cases it appears the minimum number of times. For
3142 this reason, it is simplest to treat this case separately, as otherwise
3143 the code gets far too messy. There are several special subcases when the
3144 minimum is zero. */
3145
3146 if (repeat_min == 0)
3147 {
3148 /* If the maximum is also zero, we just omit the group from the output
3149 altogether. */
3150
3151 if (repeat_max == 0)
3152 {
3153 code = previous;
3154 goto END_REPEAT;
3155 }
3156
3157 /* If the maximum is 1 or unlimited, we just have to stick in the
3158 BRAZERO and do no more at this point. However, we do need to adjust
3159 any OP_RECURSE calls inside the group that refer to the group itself or
3160 any internal group, because the offset is from the start of the whole
3161 regex. Temporarily terminate the pattern while doing this. */
3162
3163 if (repeat_max <= 1)
3164 {
3165 *code = OP_END;
3166 adjust_recurse(previous, 1, utf8, cd);
3167 memmove(previous+1, previous, len);
3168 code++;
3169 *previous++ = OP_BRAZERO + repeat_type;
3170 }
3171
3172 /* If the maximum is greater than 1 and limited, we have to replicate
3173 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3174 The first one has to be handled carefully because it's the original
3175 copy, which has to be moved up. The remainder can be handled by code
3176 that is common with the non-zero minimum case below. We have to
3177 adjust the value or repeat_max, since one less copy is required. Once
3178 again, we may have to adjust any OP_RECURSE calls inside the group. */
3179
3180 else
3181 {
3182 int offset;
3183 *code = OP_END;
3184 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3185 memmove(previous + 2 + LINK_SIZE, previous, len);
3186 code += 2 + LINK_SIZE;
3187 *previous++ = OP_BRAZERO + repeat_type;
3188 *previous++ = OP_BRA;
3189
3190 /* We chain together the bracket offset fields that have to be
3191 filled in later when the ends of the brackets are reached. */
3192
3193 offset = (bralink == NULL)? 0 : previous - bralink;
3194 bralink = previous;
3195 PUTINC(previous, 0, offset);
3196 }
3197
3198 repeat_max--;
3199 }
3200
3201 /* If the minimum is greater than zero, replicate the group as many
3202 times as necessary, and adjust the maximum to the number of subsequent
3203 copies that we need. If we set a first char from the group, and didn't
3204 set a required char, copy the latter from the former. */
3205
3206 else
3207 {
3208 if (repeat_min > 1)
3209 {
3210 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3211 for (i = 1; i < repeat_min; i++)
3212 {
3213 memcpy(code, previous, len);
3214 code += len;
3215 }
3216 }
3217 if (repeat_max > 0) repeat_max -= repeat_min;
3218 }
3219
3220 /* This code is common to both the zero and non-zero minimum cases. If
3221 the maximum is limited, it replicates the group in a nested fashion,
3222 remembering the bracket starts on a stack. In the case of a zero minimum,
3223 the first one was set up above. In all cases the repeat_max now specifies
3224 the number of additional copies needed. */
3225
3226 if (repeat_max >= 0)
3227 {
3228 for (i = repeat_max - 1; i >= 0; i--)
3229 {
3230 *code++ = OP_BRAZERO + repeat_type;
3231
3232 /* All but the final copy start a new nesting, maintaining the
3233 chain of brackets outstanding. */
3234
3235 if (i != 0)
3236 {
3237 int offset;
3238 *code++ = OP_BRA;
3239 offset = (bralink == NULL)? 0 : code - bralink;
3240 bralink = code;
3241 PUTINC(code, 0, offset);
3242 }
3243
3244 memcpy(code, previous, len);
3245 code += len;
3246 }
3247
3248 /* Now chain through the pending brackets, and fill in their length
3249 fields (which are holding the chain links pro tem). */
3250
3251 while (bralink != NULL)
3252 {
3253 int oldlinkoffset;
3254 int offset = code - bralink + 1;
3255 uschar *bra = code - offset;
3256 oldlinkoffset = GET(bra, 1);
3257 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3258 *code++ = OP_KET;
3259 PUTINC(code, 0, offset);
3260 PUT(bra, 1, offset);
3261 }
3262 }
3263
3264 /* If the maximum is unlimited, set a repeater in the final copy. We
3265 can't just offset backwards from the current code point, because we
3266 don't know if there's been an options resetting after the ket. The
3267 correct offset was computed above. */
3268
3269 else code[-ketoffset] = OP_KETRMAX + repeat_type;
3270 }
3271
3272 /* Else there's some kind of shambles */
3273
3274 else
3275 {
3276 *errorptr = ERR11;
3277 goto FAILED;
3278 }
3279
3280 /* If the character following a repeat is '+', we wrap the entire repeated
3281 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3282 Sun's Java package. The repeated item starts at tempcode, not at previous,
3283 which might be the first part of a string whose (former) last char we
3284 repeated. However, we don't support '+' after a greediness '?'. */
3285
3286 if (possessive_quantifier)
3287 {
3288 int len = code - tempcode;
3289 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3290 code += 1 + LINK_SIZE;
3291 len += 1 + LINK_SIZE;
3292 tempcode[0] = OP_ONCE;
3293 *code++ = OP_KET;
3294 PUTINC(code, 0, len);
3295 PUT(tempcode, 1, len);
3296 }
3297
3298 /* In all case we no longer have a previous item. We also set the
3299 "follows varying string" flag for subsequently encountered reqbytes if
3300 it isn't already set and we have just passed a varying length item. */
3301
3302 END_REPEAT:
3303 previous = NULL;
3304 cd->req_varyopt |= reqvary;
3305 break;
3306
3307
3308 /* Start of nested bracket sub-expression, or comment or lookahead or
3309 lookbehind or option setting or condition. First deal with special things
3310 that can come after a bracket; all are introduced by ?, and the appearance
3311 of any of them means that this is not a referencing group. They were
3312 checked for validity in the first pass over the string, so we don't have to
3313 check for syntax errors here. */
3314
3315 case '(':
3316 newoptions = options;
3317 skipbytes = 0;
3318
3319 if (*(++ptr) == '?')
3320 {
3321 int set, unset;
3322 int *optset;
3323
3324 switch (*(++ptr))
3325 {
3326 case '#': /* Comment; skip to ket */
3327 ptr++;
3328 while (*ptr != ')') ptr++;
3329 continue;
3330
3331 case ':': /* Non-extracting bracket */
3332 bravalue = OP_BRA;
3333 ptr++;
3334 break;
3335
3336 case '(':
3337 bravalue = OP_COND; /* Conditional group */
3338
3339 /* Condition to test for recursion */
3340
3341 if (ptr[1] == 'R')
3342 {
3343 code[1+LINK_SIZE] = OP_CREF;
3344 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3345 skipbytes = 3;
3346 ptr += 3;
3347 }
3348
3349 /* Condition to test for a numbered subpattern match. We know that
3350 if a digit follows ( then there will just be digits until ) because
3351 the syntax was checked in the first pass. */
3352
3353 else if ((digitab[ptr[1]] && ctype_digit) != 0)
3354 {
3355 int condref; /* Don't amalgamate; some compilers */
3356 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
3357 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3358 if (condref == 0)
3359 {
3360 *errorptr = ERR35;
3361 goto FAILED;
3362 }
3363 ptr++;
3364 code[1+LINK_SIZE] = OP_CREF;
3365 PUT2(code, 2+LINK_SIZE, condref);
3366 skipbytes = 3;
3367 }
3368 /* For conditions that are assertions, we just fall through, having
3369 set bravalue above. */
3370 break;
3371
3372 case '=': /* Positive lookahead */
3373 bravalue = OP_ASSERT;
3374 ptr++;
3375 break;
3376
3377 case '!': /* Negative lookahead */
3378 bravalue = OP_ASSERT_NOT;
3379 ptr++;
3380 break;
3381
3382 case '<': /* Lookbehinds */
3383 switch (*(++ptr))
3384 {
3385 case '=': /* Positive lookbehind */
3386 bravalue = OP_ASSERTBACK;
3387 ptr++;
3388 break;
3389
3390 case '!': /* Negative lookbehind */
3391 bravalue = OP_ASSERTBACK_NOT;
3392 ptr++;
3393 break;
3394 }
3395 break;
3396
3397 case '>': /* One-time brackets */
3398 bravalue = OP_ONCE;
3399 ptr++;
3400 break;
3401
3402 case 'C': /* Callout - may be followed by digits; */
3403 previous_callout = code; /* Save for later completion */
3404 after_manual_callout = 1; /* Skip one item before completing */
3405 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3406 { /* closing parenthesis is present. */
3407 int n = 0;
3408 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3409 n = n * 10 + *ptr - '0';
3410 if (n > 255)
3411 {
3412 *errorptr = ERR38;
3413 goto FAILED;
3414 }
3415 *code++ = n;
3416 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3417 PUT(code, LINK_SIZE, 0); /* Default length */
3418 code += 2 * LINK_SIZE;
3419 }
3420 previous = NULL;
3421 continue;
3422
3423 case 'P': /* Named subpattern handling */
3424 if (*(++ptr) == '<') /* Definition */
3425 {
3426 int i, namelen;
3427 uschar *slot = cd->name_table;
3428 const uschar *name; /* Don't amalgamate; some compilers */
3429 name = ++ptr; /* grumble at autoincrement in declaration */
3430
3431 while (*ptr++ != '>');
3432 namelen = ptr - name - 1;
3433
3434 for (i = 0; i < cd->names_found; i++)
3435 {
3436 int crc = memcmp(name, slot+2, namelen);
3437 if (crc == 0)
3438 {
3439 if (slot[2+namelen] == 0)
3440 {
3441 *errorptr = ERR43;
3442 goto FAILED;
3443 }
3444 crc = -1; /* Current name is substring */
3445 }
3446 if (crc < 0)
3447 {
3448 memmove(slot + cd->name_entry_size, slot,
3449 (cd->names_found - i) * cd->name_entry_size);
3450 break;
3451 }
3452 slot += cd->name_entry_size;
3453 }
3454
3455 PUT2(slot, 0, *brackets + 1);
3456 memcpy(slot + 2, name, namelen);
3457 slot[2+namelen] = 0;
3458 cd->names_found++;
3459 goto NUMBERED_GROUP;
3460 }
3461
3462 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3463 {
3464 int i, namelen;
3465 int type = *ptr++;
3466 const uschar *name = ptr;
3467 uschar *slot = cd->name_table;
3468
3469 while (*ptr != ')') ptr++;
3470 namelen = ptr - name;
3471
3472 for (i = 0; i < cd->names_found; i++)
3473 {
3474 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3475 slot += cd->name_entry_size;
3476 }
3477 if (i >= cd->names_found)
3478 {
3479 *errorptr = ERR15;
3480 goto FAILED;
3481 }
3482
3483 recno = GET2(slot, 0);
3484
3485 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3486
3487 /* Back reference */
3488
3489 previous = code;
3490 *code++ = OP_REF;
3491 PUT2INC(code, 0, recno);
3492 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3493 if (recno > cd->top_backref) cd->top_backref = recno;
3494 continue;
3495 }
3496
3497 /* Should never happen */
3498 break;
3499
3500 case 'R': /* Pattern recursion */
3501 ptr++; /* Same as (?0) */
3502 /* Fall through */
3503
3504 /* Recursion or "subroutine" call */
3505
3506 case '0': case '1': case '2': case '3': case '4':
3507 case '5': case '6': case '7': case '8': case '9':
3508 {
3509 const uschar *called;
3510 recno = 0;
3511 while((digitab[*ptr] & ctype_digit) != 0)
3512 recno = recno * 10 + *ptr++ - '0';
3513
3514 /* Come here from code above that handles a named recursion */
3515
3516 HANDLE_RECURSION:
3517
3518 previous = code;
3519
3520 /* Find the bracket that is being referenced. Temporarily end the
3521 regex in case it doesn't exist. */
3522
3523 *code = OP_END;
3524 called = (recno == 0)?
3525 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3526
3527 if (called == NULL)
3528 {
3529 *errorptr = ERR15;
3530 goto FAILED;
3531 }
3532
3533 /* If the subpattern is still open, this is a recursive call. We
3534 check to see if this is a left recursion that could loop for ever,
3535 and diagnose that case. */
3536
3537 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3538 {
3539 *errorptr = ERR40;
3540 goto FAILED;
3541 }
3542
3543 /* Insert the recursion/subroutine item */
3544
3545 *code = OP_RECURSE;
3546 PUT(code, 1, called - cd->start_code);
3547 code += 1 + LINK_SIZE;
3548 }
3549 continue;
3550
3551 /* Character after (? not specially recognized */
3552
3553 default: /* Option setting */
3554 set = unset = 0;
3555 optset = &set;
3556
3557 while (*ptr != ')' && *ptr != ':')
3558 {
3559 switch (*ptr++)
3560 {
3561 case '-': optset = &unset; break;
3562
3563 case 'i': *optset |= PCRE_CASELESS; break;
3564 case 'm': *optset |= PCRE_MULTILINE; break;
3565 case 's': *optset |= PCRE_DOTALL; break;
3566 case 'x': *optset |= PCRE_EXTENDED; break;
3567 case 'U': *optset |= PCRE_UNGREEDY; break;
3568 case 'X': *optset |= PCRE_EXTRA; break;
3569 }
3570 }
3571
3572 /* Set up the changed option bits, but don't change anything yet. */
3573
3574 newoptions = (options | set) & (~unset);
3575
3576 /* If the options ended with ')' this is not the start of a nested
3577 group with option changes, so the options change at this level. Compile
3578 code to change the ims options if this setting actually changes any of
3579 them. We also pass the new setting back so that it can be put at the
3580 start of any following branches, and when this group ends (if we are in
3581 a group), a resetting item can be compiled.
3582
3583 Note that if this item is right at the start of the pattern, the
3584 options will have been abstracted and made global, so there will be no
3585 change to compile. */
3586
3587 if (*ptr == ')')
3588 {
3589 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3590 {
3591 *code++ = OP_OPT;
3592 *code++ = newoptions & PCRE_IMS;
3593 }
3594
3595 /* Change options at this level, and pass them back for use
3596 in subsequent branches. Reset the greedy defaults and the case
3597 value for firstbyte and reqbyte. */
3598
3599 *optionsptr = options = newoptions;
3600 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3601 greedy_non_default = greedy_default ^ 1;
3602 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3603
3604 previous = NULL; /* This item can't be repeated */
3605 continue; /* It is complete */
3606 }
3607
3608 /* If the options ended with ':' we are heading into a nested group
3609 with possible change of options. Such groups are non-capturing and are
3610 not assertions of any kind. All we need to do is skip over the ':';
3611 the newoptions value is handled below. */
3612
3613 bravalue = OP_BRA;
3614 ptr++;
3615 }
3616 }
3617
3618 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3619 non-capturing and behave like (?:...) brackets */
3620
3621 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3622 {
3623 bravalue = OP_BRA;
3624 }
3625
3626 /* Else we have a referencing group; adjust the opcode. If the bracket
3627 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3628 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3629
3630 else
3631 {
3632 NUMBERED_GROUP:
3633 if (++(*brackets) > EXTRACT_BASIC_MAX)
3634 {
3635 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3636 code[1+LINK_SIZE] = OP_BRANUMBER;
3637 PUT2(code, 2+LINK_SIZE, *brackets);
3638 skipbytes = 3;
3639 }
3640 else bravalue = OP_BRA + *brackets;
3641 }
3642
3643 /* Process nested bracketed re. Assertions may not be repeated, but other
3644 kinds can be. We copy code into a non-register variable in order to be able
3645 to pass its address because some compilers complain otherwise. Pass in a
3646 new setting for the ims options if they have changed. */
3647
3648 previous = (bravalue >= OP_ONCE)? code : NULL;
3649 *code = bravalue;
3650 tempcode = code;
3651 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3652
3653 if (!compile_regex(
3654 newoptions, /* The complete new option state */
3655 options & PCRE_IMS, /* The previous ims option state */
3656 brackets, /* Extracting bracket count */
3657 &tempcode, /* Where to put code (updated) */
3658 &ptr, /* Input pointer (updated) */
3659 errorptr, /* Where to put an error message */
3660 (bravalue == OP_ASSERTBACK ||
3661 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3662 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3663 &subfirstbyte, /* For possible first char */
3664 &subreqbyte, /* For possible last char */
3665 bcptr, /* Current branch chain */
3666 cd)) /* Tables block */
3667 goto FAILED;
3668
3669 /* At the end of compiling, code is still pointing to the start of the
3670 group, while tempcode has been updated to point past the end of the group
3671 and any option resetting that may follow it. The pattern pointer (ptr)
3672 is on the bracket. */
3673
3674 /* If this is a conditional bracket, check that there are no more than
3675 two branches in the group. */
3676
3677 else if (bravalue == OP_COND)
3678 {
3679 uschar *tc = code;
3680 condcount = 0;
3681
3682 do {
3683 condcount++;
3684 tc += GET(tc,1);
3685 }
3686 while (*tc != OP_KET);
3687
3688 if (condcount > 2)
3689 {
3690 *errorptr = ERR27;
3691 goto FAILED;
3692 }
3693
3694 /* If there is just one branch, we must not make use of its firstbyte or
3695 reqbyte, because this is equivalent to an empty second branch. */
3696
3697 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3698 }
3699
3700 /* Handle updating of the required and first characters. Update for normal
3701 brackets of all kinds, and conditions with two branches (see code above).
3702 If the bracket is followed by a quantifier with zero repeat, we have to
3703 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3704 main loop so that they can be accessed for the back off. */
3705
3706 zeroreqbyte = reqbyte;
3707 zerofirstbyte = firstbyte;
3708 groupsetfirstbyte = FALSE;
3709
3710 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3711 {
3712 /* If we have not yet set a firstbyte in this branch, take it from the
3713 subpattern, remembering that it was set here so that a repeat of more
3714 than one can replicate it as reqbyte if necessary. If the subpattern has
3715 no firstbyte, set "none" for the whole branch. In both cases, a zero
3716 repeat forces firstbyte to "none". */
3717
3718 if (firstbyte == REQ_UNSET)
3719 {
3720 if (subfirstbyte >= 0)
3721 {
3722 firstbyte = subfirstbyte;
3723 groupsetfirstbyte = TRUE;
3724 }
3725 else firstbyte = REQ_NONE;
3726 zerofirstbyte = REQ_NONE;
3727 }
3728
3729 /* If firstbyte was previously set, convert the subpattern's firstbyte
3730 into reqbyte if there wasn't one, using the vary flag that was in
3731 existence beforehand. */
3732
3733 else if (subfirstbyte >= 0 && subreqbyte < 0)
3734 subreqbyte = subfirstbyte | tempreqvary;
3735
3736 /* If the subpattern set a required byte (or set a first byte that isn't
3737 really the first byte - see above), set it. */
3738
3739 if (subreqbyte >= 0) reqbyte = subreqbyte;
3740 }
3741
3742 /* For a forward assertion, we take the reqbyte, if set. This can be
3743 helpful if the pattern that follows the assertion doesn't set a different
3744 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3745 for an assertion, however because it leads to incorrect effect for patterns
3746 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3747 of a firstbyte. This is overcome by a scan at the end if there's no
3748 firstbyte, looking for an asserted first char. */
3749
3750 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3751
3752 /* Now update the main code pointer to the end of the group. */
3753
3754 code = tempcode;
3755
3756 /* Error if hit end of pattern */
3757
3758 if (*ptr != ')')
3759 {
3760 *errorptr = ERR14;
3761 goto FAILED;
3762 }
3763 break;
3764
3765 /* Check \ for being a real metacharacter; if not, fall through and handle
3766 it as a data character at the start of a string. Escape items are checked
3767 for validity in the pre-compiling pass. */
3768
3769 case '\\':
3770 tempptr = ptr;
3771 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3772
3773 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3774 are arranged to be the negation of the corresponding OP_values. For the
3775 back references, the values are ESC_REF plus the reference number. Only
3776 back references and those types that consume a character may be repeated.
3777 We can test for values between ESC_b and ESC_Z for the latter; this may
3778 have to change if any new ones are ever created. */
3779
3780 if (c < 0)
3781 {
3782 if (-c == ESC_Q) /* Handle start of quoted string */
3783 {
3784 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3785 else inescq = TRUE;
3786 continue;
3787 }
3788
3789 /* For metasequences that actually match a character, we disable the
3790 setting of a first character if it hasn't already been set. */
3791
3792 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3793 firstbyte = REQ_NONE;
3794
3795 /* Set values to reset to if this is followed by a zero repeat. */
3796
3797 zerofirstbyte = firstbyte;
3798 zeroreqbyte = reqbyte;
3799
3800 /* Back references are handled specially */
3801
3802 if (-c >= ESC_REF)
3803 {
3804 int number = -c - ESC_REF;
3805 previous = code;
3806 *code++ = OP_REF;
3807 PUT2INC(code, 0, number);
3808 }
3809
3810 /* So are Unicode property matches, if supported. We know that get_ucp
3811 won't fail because it was tested in the pre-pass. */
3812
3813 #ifdef SUPPORT_UCP
3814 else if (-c == ESC_P || -c == ESC_p)
3815 {
3816 BOOL negated;
3817 int value = get_ucp(&ptr, &negated, errorptr);
3818 previous = code;
3819 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3820 *code++ = value;
3821 }
3822 #endif
3823
3824 /* For the rest, we can obtain the OP value by negating the escape
3825 value */
3826
3827 else
3828 {
3829 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3830 *code++ = -c;
3831 }
3832 continue;
3833 }
3834
3835 /* We have a data character whose value is in c. In UTF-8 mode it may have
3836 a value > 127. We set its representation in the length/buffer, and then
3837 handle it as a data character. */
3838
3839 #ifdef SUPPORT_UTF8
3840 if (utf8 && c > 127)
3841 mclength = ord2utf8(c, mcbuffer);
3842 else
3843 #endif
3844
3845 {
3846 mcbuffer[0] = c;
3847 mclength = 1;
3848 }
3849
3850 goto ONE_CHAR;
3851
3852 /* Handle a literal character. It is guaranteed not to be whitespace or #
3853 when the extended flag is set. If we are in UTF-8 mode, it may be a
3854 multi-byte literal character. */
3855
3856 default:
3857 NORMAL_CHAR:
3858 mclength = 1;
3859 mcbuffer[0] = c;
3860
3861 #ifdef SUPPORT_UTF8
3862 if (utf8 && (c & 0xc0) == 0xc0)
3863 {
3864 while ((ptr[1] & 0xc0) == 0x80)
3865 mcbuffer[mclength++] = *(++ptr);
3866 }
3867 #endif
3868
3869 /* At this point we have the character's bytes in mcbuffer, and the length
3870 in mclength. When not in UTF-8 mode, the length is always 1. */
3871
3872 ONE_CHAR:
3873 previous = code;
3874 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3875 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3876
3877 /* Set the first and required bytes appropriately. If no previous first
3878 byte, set it from this character, but revert to none on a zero repeat.
3879 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3880 repeat. */
3881
3882 if (firstbyte == REQ_UNSET)
3883 {
3884 zerofirstbyte = REQ_NONE;
3885 zeroreqbyte = reqbyte;
3886
3887 /* If the character is more than one byte long, we can set firstbyte
3888 only if it is not to be matched caselessly. */
3889
3890 if (mclength == 1 || req_caseopt == 0)
3891 {
3892 firstbyte = mcbuffer[0] | req_caseopt;
3893 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3894 }
3895 else firstbyte = reqbyte = REQ_NONE;
3896 }
3897
3898 /* firstbyte was previously set; we can set reqbyte only the length is
3899 1 or the matching is caseful. */
3900
3901 else
3902 {
3903 zerofirstbyte = firstbyte;
3904 zeroreqbyte = reqbyte;
3905 if (mclength == 1 || req_caseopt == 0)
3906 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3907 }
3908
3909 break; /* End of literal character handling */
3910 }
3911 } /* end of big loop */
3912
3913 /* Control never reaches here by falling through, only by a goto for all the
3914 error states. Pass back the position in the pattern so that it can be displayed
3915 to the user for diagnosing the error. */
3916
3917 FAILED:
3918 *ptrptr = ptr;
3919 return FALSE;
3920 }
3921
3922
3923
3924
3925 /*************************************************
3926 * Compile sequence of alternatives *
3927 *************************************************/
3928
3929 /* On entry, ptr is pointing past the bracket character, but on return
3930 it points to the closing bracket, or vertical bar, or end of string.
3931 The code variable is pointing at the byte into which the BRA operator has been
3932 stored. If the ims options are changed at the start (for a (?ims: group)) or
3933 during any branch, we need to insert an OP_OPT item at the start of every
3934 following branch to ensure they get set correctly at run time, and also pass
3935 the new options into every subsequent branch compile.
3936
3937 Arguments:
3938 options option bits, including any changes for this subpattern
3939 oldims previous settings of ims option bits
3940 brackets -> int containing the number of extracting brackets used
3941 codeptr -> the address of the current code pointer
3942 ptrptr -> the address of the current pattern pointer
3943 errorptr -> pointer to error message
3944 lookbehind TRUE if this is a lookbehind assertion
3945 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3946 firstbyteptr place to put the first required character, or a negative number
3947 reqbyteptr place to put the last required character, or a negative number
3948 bcptr pointer to the chain of currently open branches
3949 cd points to the data block with tables pointers etc.
3950
3951 Returns: TRUE on success
3952 */
3953
3954 static BOOL
3955 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3956 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3957 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3958 {
3959 const uschar *ptr = *ptrptr;
3960 uschar *code = *codeptr;
3961 uschar *last_branch = code;
3962 uschar *start_bracket = code;
3963 uschar *reverse_count = NULL;
3964 int firstbyte, reqbyte;
3965 int branchfirstbyte, branchreqbyte;
3966 branch_chain bc;
3967
3968 bc.outer = bcptr;
3969 bc.current = code;
3970
3971 firstbyte = reqbyte = REQ_UNSET;
3972
3973 /* Offset is set zero to mark that this bracket is still open */
3974
3975 PUT(code, 1, 0);
3976 code += 1 + LINK_SIZE + skipbytes;
3977
3978 /* Loop for each alternative branch */
3979
3980 for (;;)
3981 {
3982 /* Handle a change of ims options at the start of the branch */
3983
3984 if ((options & PCRE_IMS) != oldims)
3985 {
3986 *code++ = OP_OPT;
3987 *code++ = options & PCRE_IMS;
3988 }
3989
3990 /* Set up dummy OP_REVERSE if lookbehind assertion */
3991
3992 if (lookbehind)
3993 {
3994 *code++ = OP_REVERSE;
3995 reverse_count = code;
3996 PUTINC(code, 0, 0);
3997 }
3998
3999 /* Now compile the branch */
4000
4001 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4002 &branchfirstbyte, &branchreqbyte, &bc, cd))
4003 {
4004 *ptrptr = ptr;
4005 return FALSE;
4006 }
4007
4008 /* If this is the first branch, the firstbyte and reqbyte values for the
4009 branch become the values for the regex. */
4010
4011 if (*last_branch != OP_ALT)
4012 {
4013 firstbyte = branchfirstbyte;
4014 reqbyte = branchreqbyte;
4015 }
4016
4017 /* If this is not the first branch, the first char and reqbyte have to
4018 match the values from all the previous branches, except that if the previous
4019 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4020 REQ_VARY for the regex. */
4021
4022 else
4023 {
4024 /* If we previously had a firstbyte, but it doesn't match the new branch,
4025 we have to abandon the firstbyte for the regex, but if there was previously
4026 no reqbyte, it takes on the value of the old firstbyte. */
4027
4028 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4029 {
4030 if (reqbyte < 0) reqbyte = firstbyte;
4031 firstbyte = REQ_NONE;
4032 }
4033
4034 /* If we (now or from before) have no firstbyte, a firstbyte from the
4035 branch becomes a reqbyte if there isn't a branch reqbyte. */
4036
4037 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4038 branchreqbyte = branchfirstbyte;
4039
4040 /* Now ensure that the reqbytes match */
4041
4042 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4043 reqbyte = REQ_NONE;
4044 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4045 }
4046
4047 /* If lookbehind, check that this branch matches a fixed-length string,
4048 and put the length into the OP_REVERSE item. Temporarily mark the end of
4049 the branch with OP_END. */
4050
4051 if (lookbehind)
4052 {
4053 int length;
4054 *code = OP_END;
4055 length = find_fixedlength(last_branch, options);
4056 DPRINTF(("fixed length = %d\n", length));
4057 if (length < 0)
4058 {
4059 *errorptr = (length == -2)? ERR36 : ERR25;
4060 *ptrptr = ptr;
4061 return FALSE;
4062 }
4063 PUT(reverse_count, 0, length);
4064 }
4065
4066 /* Reached end of expression, either ')' or end of pattern. Go back through
4067 the alternative branches and reverse the chain of offsets, with the field in
4068 the BRA item now becoming an offset to the first alternative. If there are
4069 no alternatives, it points to the end of the group. The length in the
4070 terminating ket is always the length of the whole bracketed item. If any of
4071 the ims options were changed inside the group, compile a resetting op-code
4072 following, except at the very end of the pattern. Return leaving the pointer
4073 at the terminating char. */
4074
4075 if (*ptr != '|')
4076 {
4077 int length = code - last_branch;
4078 do
4079 {
4080 int prev_length = GET(last_branch, 1);
4081 PUT(last_branch, 1, length);
4082 length = prev_length;
4083 last_branch -= length;
4084 }
4085 while (length > 0);
4086
4087 /* Fill in the ket */
4088
4089 *code = OP_KET;
4090 PUT(code, 1, code - start_bracket);
4091 code += 1 + LINK_SIZE;
4092
4093 /* Resetting option if needed */
4094
4095 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4096 {
4097 *code++ = OP_OPT;
4098 *code++ = oldims;
4099 }
4100
4101 /* Set values to pass back */
4102
4103 *codeptr = code;
4104 *ptrptr = ptr;
4105 *firstbyteptr = firstbyte;
4106 *reqbyteptr = reqbyte;
4107 return TRUE;
4108 }
4109
4110 /* Another branch follows; insert an "or" node. Its length field points back
4111 to the previous branch while the bracket remains open. At the end the chain
4112 is reversed. It's done like this so that the start of the bracket has a
4113 zero offset until it is closed, making it possible to detect recursion. */
4114
4115 *code = OP_ALT;
4116 PUT(code, 1, code - last_branch);
4117 bc.current = last_branch = code;
4118 code += 1 + LINK_SIZE;
4119 ptr++;
4120 }
4121 /* Control never reaches here */
4122 }
4123
4124
4125
4126
4127 /*************************************************
4128 * Check for anchored expression *
4129 *************************************************/
4130
4131 /* Try to find out if this is an anchored regular expression. Consider each
4132 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4133 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4134 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4135 counts, since OP_CIRC can match in the middle.
4136
4137 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4138 This is the code for \G, which means "match at start of match position, taking
4139 into account the match offset".
4140
4141 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4142 because that will try the rest of the pattern at all possible matching points,
4143 so there is no point trying again.... er ....
4144
4145 .... except when the .* appears inside capturing parentheses, and there is a
4146 subsequent back reference to those parentheses. We haven't enough information
4147 to catch that case precisely.
4148
4149 At first, the best we could do was to detect when .* was in capturing brackets
4150 and the highest back reference was greater than or equal to that level.
4151 However, by keeping a bitmap of the first 31 back references, we can catch some
4152 of the more common cases more precisely.
4153
4154 Arguments:
4155 code points to start of expression (the bracket)
4156 options points to the options setting
4157 bracket_map a bitmap of which brackets we are inside while testing; this
4158 handles up to substring 31; after that we just have to take
4159 the less precise approach
4160 backref_map the back reference bitmap
4161
4162 Returns: TRUE or FALSE
4163 */
4164
4165 static BOOL
4166 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4167 unsigned int backref_map)
4168 {
4169 do {
4170 const uschar *scode =
4171 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4172 register int op = *scode;
4173
4174 /* Capturing brackets */
4175
4176 if (op > OP_BRA)
4177 {
4178 int new_map;
4179 op -= OP_BRA;
4180 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4181 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4182 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4183 }
4184
4185 /* Other brackets */
4186
4187 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4188 {
4189 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4190 }
4191
4192 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4193 are or may be referenced. */
4194
4195 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4196 (*options & PCRE_DOTALL) != 0)
4197 {
4198 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4199 }
4200
4201 /* Check for explicit anchoring */
4202
4203 else if (op != OP_SOD && op != OP_SOM &&
4204 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4205 return FALSE;
4206 code += GET(code, 1);
4207 }
4208 while (*code == OP_ALT); /* Loop for each alternative */
4209 return TRUE;
4210 }
4211
4212
4213
4214 /*************************************************
4215 * Check for starting with ^ or .* *
4216 *************************************************/
4217
4218 /* This is called to find out if every branch starts with ^ or .* so that
4219 "first char" processing can be done to speed things up in multiline
4220 matching and for non-DOTALL patterns that start with .* (which must start at
4221 the beginning or after \n). As in the case of is_anchored() (see above), we
4222 have to take account of back references to capturing brackets that contain .*
4223 because in that case we can't make the assumption.
4224
4225 Arguments:
4226 code points to start of expression (the bracket)
4227 bracket_map a bitmap of which brackets we are inside while testing; this
4228 handles up to substring 31; after that we just have to take
4229 the less precise approach
4230 backref_map the back reference bitmap
4231
4232 Returns: TRUE or FALSE
4233 */
4234
4235 static BOOL
4236 is_startline(const uschar *code, unsigned int bracket_map,
4237 unsigned int backref_map)
4238 {
4239 do {
4240 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4241 FALSE);
4242 register int op = *scode;
4243
4244 /* Capturing brackets */
4245
4246 if (op > OP_BRA)
4247 {
4248 int new_map;
4249 op -= OP_BRA;
4250 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4251 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4252 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4253 }
4254
4255 /* Other brackets */
4256
4257 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4258 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4259
4260 /* .* means "start at start or after \n" if it isn't in brackets that
4261 may be referenced. */
4262
4263 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4264 {
4265 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4266 }
4267
4268 /* Check for explicit circumflex */
4269
4270 else if (op != OP_CIRC) return FALSE;
4271
4272 /* Move on to the next alternative */
4273
4274 code += GET(code, 1);
4275 }
4276 while (*code == OP_ALT); /* Loop for each alternative */
4277 return TRUE;
4278 }
4279
4280
4281
4282 /*************************************************
4283 * Check for asserted fixed first char *
4284 *************************************************/
4285
4286 /* During compilation, the "first char" settings from forward assertions are
4287 discarded, because they can cause conflicts with actual literals that follow.
4288 However, if we end up without a first char setting for an unanchored pattern,
4289 it is worth scanning the regex to see if there is an initial asserted first
4290 char. If all branches start with the same asserted char, or with a bracket all
4291 of whose alternatives start with the same asserted char (recurse ad lib), then
4292 we return that char, otherwise -1.
4293
4294 Arguments:
4295 code points to start of expression (the bracket)
4296 options pointer to the options (used to check casing changes)
4297 inassert TRUE if in an assertion
4298
4299 Returns: -1 or the fixed first char
4300 */
4301
4302 static int
4303 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4304 {
4305 register int c = -1;
4306 do {
4307 int d;
4308 const uschar *scode =
4309 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4310 register int op = *scode;
4311
4312 if (op >= OP_BRA) op = OP_BRA;
4313
4314 switch(op)
4315 {
4316 default:
4317 return -1;
4318
4319 case OP_BRA:
4320 case OP_ASSERT:
4321 case OP_ONCE:
4322 case OP_COND:
4323 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4324 return -1;
4325 if (c < 0) c = d; else if (c != d) return -1;
4326 break;
4327
4328 case OP_EXACT: /* Fall through */
4329 scode += 2;
4330
4331 case OP_CHAR:
4332 case OP_CHARNC:
4333 case OP_PLUS:
4334 case OP_MINPLUS:
4335 if (!inassert) return -1;
4336 if (c < 0)
4337 {
4338 c = scode[1];
4339 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4340 }
4341 else if (c != scode[1]) return -1;
4342 break;
4343 }
4344
4345 code += GET(code, 1);
4346 }
4347 while (*code == OP_ALT);
4348 return c;
4349 }
4350
4351
4352
4353
#ifdef SUPPORT_UTF8
/*************************************************
*           Validate a UTF-8 string              *
*************************************************/

/* This function is called (optionally) at the start of compile or match, to
validate that a supposed UTF-8 string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying
an invalid string are then undefined.

The checks made for each character that is not a single byte below 128 are:
the lead byte must have its top two bits set; the number of following bytes
(taken from utf8_table4) must be available in the string; every following
byte must have the form 10xxxxxx; and the sequence must not be an overlong
encoding. Lead bytes 0xfe and 0xff are rejected.

Arguments:
  string       points to the string
  length       length of string, or -1 if the string is zero-terminated

Returns:       < 0    if the string is a valid UTF-8 string
               >= 0   otherwise; the value is the offset of the bad byte
                      (the lead byte when the lead byte itself is wrong or
                      the sequence is truncated; otherwise the following
                      byte at which the fault was detected)
*/

static int
valid_utf8(const uschar *string, int length)
{
  const uschar *cur;
  int remaining = length;

  /* A negative length means the string is zero-terminated: measure it */

  if (remaining < 0) {
    cur = string;
    while (*cur != 0) cur++;
    remaining = cur - string;
  }

  cur = string;
  while (remaining > 0) {
    int lead = *cur;
    remaining--;                    /* Bytes left after the current one */

    if (lead >= 128) {
      int extra;                    /* Number of additional bytes */
      int second;

      if ((lead & 0xc0) != 0xc0) return cur - string;
      extra = utf8_table4[lead & 0x3f];
      if (remaining < extra) return cur - string;
      remaining -= extra;

      /* Check top bits in the second byte */

      cur++;
      second = *cur;
      if ((second & 0xc0) != 0x80) return cur - string;

      /* Check for overlong sequences for each different length. A two-byte
      character has nothing further to check after this. */

      if (extra == 1) {
        /* Check for xx00 000x */
        if ((lead & 0x3e) == 0) return cur - string;
      }
      else {
        if (extra == 2) {
          /* Check for 1110 0000, xx0x xxxx */
          if (lead == 0xe0 && (second & 0x20) == 0) return cur - string;
        }
        else if (extra == 3) {
          /* Check for 1111 0000, xx00 xxxx */
          if (lead == 0xf0 && (second & 0x30) == 0) return cur - string;
        }
        else if (extra == 4) {
          /* Check for 1111 1000, xx00 0xxx */
          if (lead == 0xf8 && (second & 0x38) == 0) return cur - string;
        }
        else if (extra == 5) {
          /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
          if (lead == 0xfe || lead == 0xff ||
              (lead == 0xfc && (second & 0x3c) == 0)) return cur - string;
        }

        /* Check for valid bytes after the 2nd, if any; all must start 10 */

        for (; extra > 1; extra--) {
          cur++;
          if ((*cur & 0xc0) != 0x80) return cur - string;
        }
      }
    }

    cur++;
  }

  return -1;
}
#endif
4437
4438
4439
4440 /*************************************************
4441 * Compile a Regular Expression *
4442 *************************************************/
4443
4444 /* This function takes a string and returns a pointer to a block of store
4445 holding a compiled version of the expression.
4446
4447 Arguments:
4448 pattern the regular expression
4449 options various option bits
4450 errorptr pointer to pointer to error text
4451 erroroffset ptr offset in pattern where error was detected
4452 tables pointer to character tables or NULL
4453
4454 Returns: pointer to compiled data block, or NULL on error,
4455 with errorptr and erroroffset set
4456 */
4457
4458 EXPORT pcre *
4459 pcre_compile(const char *pattern, int options, const char **errorptr,
4460 int *erroroffset, const unsigned char *tables)
4461 {
4462 real_pcre *re;
4463 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4464 int c, firstbyte, reqbyte;
4465 int bracount = 0;
4466 int branch_extra = 0;
4467 int branch_newextra;
4468 int item_count = -1;
4469 int name_count = 0;
4470 int max_name_size = 0;
4471 int lastitemlength = 0;
4472 #ifdef SUPPORT_UTF8
4473 BOOL utf8;
4474 BOOL class_utf8;
4475 #endif
4476 BOOL inescq = FALSE;
4477 unsigned int brastackptr = 0;
4478 size_t size;
4479 uschar *code;
4480 const uschar *codestart;
4481 const uschar *ptr;
4482 compile_data compile_block;
4483 int brastack[BRASTACK_SIZE];
4484 uschar bralenstack[BRASTACK_SIZE];
4485
4486 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4487 can do is just return NULL. */
4488
4489 if (errorptr == NULL) return NULL;
4490 *errorptr = NULL;
4491
4492 /* However, we can give a message for this error */
4493
4494 if (erroroffset == NULL)
4495 {
4496 *errorptr = ERR16;
4497 return NULL;
4498 }
4499 *erroroffset = 0;
4500
4501 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4502
4503 #ifdef SUPPORT_UTF8
4504 utf8 = (options & PCRE_UTF8) != 0;
4505 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4506 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4507 {
4508 *errorptr = ERR44;
4509 return NULL;
4510 }
4511 #else
4512 if ((options & PCRE_UTF8) != 0)
4513 {
4514 *errorptr = ERR32;
4515 return NULL;
4516 }
4517 #endif
4518
4519 if ((options & ~PUBLIC_OPTIONS) != 0)
4520 {
4521 *errorptr = ERR17;
4522 return NULL;
4523 }
4524
4525 /* Set up pointers to the individual character tables */
4526
4527 if (tables == NULL) tables = pcre_default_tables;
4528 compile_block.lcc = tables + lcc_offset;
4529 compile_block.fcc = tables + fcc_offset;
4530 compile_block.cbits = tables + cbits_offset;
4531 compile_block.ctypes = tables + ctypes_offset;
4532
4533 /* Maximum back reference and backref bitmap. This is updated for numeric
4534 references during the first pass, but for named references during the actual
4535 compile pass. The bitmap records up to 31 back references to help in deciding
4536 whether (.*) can be treated as anchored or not. */
4537
4538 compile_block.top_backref = 0;
4539 compile_block.backref_map = 0;
4540
4541 /* Reflect pattern for debugging output */
4542
4543 DPRINTF(("------------------------------------------------------------------\n"));
4544 DPRINTF(("%s\n", pattern));
4545
4546 /* The first thing to do is to make a pass over the pattern to compute the
4547 amount of store required to hold the compiled code. This does not have to be
4548 perfect as long as errors are overestimates. At the same time we can detect any
4549 flag settings right at the start, and extract them. Make an attempt to correct
4550 for any counted white space if an "extended" flag setting appears late in the
4551 pattern. We can't be so clever for #-comments. */
4552
4553 ptr = (const uschar *)(pattern - 1);
4554 while ((c = *(++ptr)) != 0)
4555 {
4556 int min, max;
4557 int class_optcount;
4558 int bracket_length;
4559 int duplength;
4560
4561 /* If we are inside a \Q...\E sequence, all chars are literal */
4562
4563 if (inescq)
4564 {
4565 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4566 goto NORMAL_CHAR;
4567 }
4568
4569 /* Otherwise, first check for ignored whitespace and comments */
4570
4571 if ((options & PCRE_EXTENDED) != 0)
4572 {
4573 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4574 if (c == '#')
4575 {
4576 /* The space before the ; is to avoid a warning on a silly compiler
4577 on the Macintosh. */
4578 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4579 if (c == 0) break;
4580 continue;
4581 }
4582 }
4583
4584 item_count++; /* Is zero for the first non-comment item */
4585
4586 /* Allow space for auto callout before every item except quantifiers. */
4587
4588 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4589 c != '*' && c != '+' && c != '?' &&
4590 (c != '{' || !is_counted_repeat(ptr + 1)))
4591 length += 2 + 2*LINK_SIZE;
4592
4593 switch(c)
4594 {
4595 /* A backslashed item may be an escaped data character or it may be a
4596 character type. */
4597
4598 case '\\':
4599 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4600 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4601
4602 lastitemlength = 1; /* Default length of last item for repeats */
4603
4604 if (c >= 0) /* Data character */
4605 {
4606 length += 2; /* For a one-byte character */
4607
4608 #ifdef SUPPORT_UTF8
4609 if (utf8 && c > 127)
4610 {
4611 int i;
4612 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4613 if (c <= utf8_table1[i]) break;
4614 length += i;
4615 lastitemlength += i;
4616 }
4617 #endif
4618
4619 continue;
4620 }
4621
4622 /* If \Q, enter "literal" mode */
4623
4624 if (-c == ESC_Q)
4625 {
4626 inescq = TRUE;
4627 continue;
4628 }
4629
4630 /* \X is supported only if Unicode property support is compiled */
4631
4632 #ifndef SUPPORT_UCP
4633 if (-c == ESC_X)
4634 {
4635 *errorptr = ERR45;
4636 goto PCRE_ERROR_RETURN;
4637 }
4638 #endif
4639
4640 /* \P and \p are for Unicode properties, but only when the support has
4641 been compiled. Each item needs 2 bytes. */
4642
4643 else if (-c == ESC_P || -c == ESC_p)
4644 {
4645 #ifdef SUPPORT_UCP
4646 BOOL negated;
4647 length += 2;
4648 lastitemlength = 2;
4649 if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4650 continue;
4651 #else
4652 *errorptr = ERR45;
4653 goto PCRE_ERROR_RETURN;
4654 #endif
4655 }
4656
4657 /* Other escapes need one byte */
4658
4659 length++;
4660
4661 /* A back reference needs an additional 2 bytes, plus either one or 5
4662 bytes for a repeat. We also need to keep the value of the highest
4663 back reference. */
4664
4665 if (c <= -ESC_REF)
4666 {
4667 int refnum = -c - ESC_REF;
4668 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4669 if (refnum > compile_block.top_backref)
4670 compile_block.top_backref = refnum;
4671 length += 2; /* For single back reference */
4672 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4673 {
4674 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4675 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4676 if ((min == 0 && (max == 1 || max == -1)) ||
4677 (min == 1 && max == -1))
4678 length++;
4679 else length += 5;
4680 if (ptr[1] == '?') ptr++;
4681 }
4682 }
4683 continue;
4684
4685 case '^': /* Single-byte metacharacters */
4686 case '.':
4687 case '$':
4688 length++;
4689 lastitemlength = 1;
4690 continue;
4691
4692 case '*': /* These repeats won't be after brackets; */
4693 case '+': /* those are handled separately */
4694 case '?':
4695 length++;
4696 goto POSESSIVE; /* A few lines below */
4697
4698 /* This covers the cases of braced repeats after a single char, metachar,
4699 class, or back reference. */
4700
4701 case '{':
4702 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4703 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4704 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4705
4706 /* These special cases just insert one extra opcode */
4707
4708 if ((min == 0 && (max == 1 || max == -1)) ||
4709 (min == 1 && max == -1))
4710 length++;
4711
4712 /* These cases might insert additional copies of a preceding character. */
4713
4714 else
4715 {
4716 if (min != 1)
4717 {
4718 length -= lastitemlength; /* Uncount the original char or metachar */
4719 if (min > 0) length += 3 + lastitemlength;
4720 }
4721 length += lastitemlength + ((max > 0)? 3 : 1);
4722 }
4723
4724 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4725
4726 POSESSIVE: /* Test for possessive quantifier */
4727 if (ptr[1] == '+')
4728 {
4729 ptr++;
4730 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4731 }
4732 continue;
4733
4734 /* An alternation contains an offset to the next branch or ket. If any ims
4735 options changed in the previous branch(es), and/or if we are in a
4736 lookbehind assertion, extra space will be needed at the start of the
4737 branch. This is handled by branch_extra. */
4738
4739 case '|':
4740 length += 1 + LINK_SIZE + branch_extra;
4741 continue;
4742
4743 /* A character class uses 33 characters provided that all the character
4744 values are less than 256. Otherwise, it uses a bit map for low valued
4745 characters, and individual items for others. Don't worry about character
4746 types that aren't allowed in classes - they'll get picked up during the
4747 compile. A character class that contains only one single-byte character
4748 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4749 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4750
4751 case '[':
4752 if (*(++ptr) == '^')
4753 {
4754 class_optcount = 10; /* Greater than one */
4755 ptr++;
4756 }
4757 else class_optcount = 0;
4758
4759 #ifdef SUPPORT_UTF8
4760 class_utf8 = FALSE;
4761 #endif
4762
4763 /* Written as a "do" so that an initial ']' is taken as data */
4764
4765 if (*ptr != 0) do
4766 {
4767 /* Inside \Q...\E everything is literal except \E */
4768
4769 if (inescq)
4770 {
4771 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4772 inescq = FALSE;
4773 ptr += 1;
4774 continue;
4775 }
4776
4777 /* Outside \Q...\E, check for escapes */
4778
4779 if (*ptr == '\\')
4780 {
4781 c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4782 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4783
4784 /* \b is backspace inside a class; \X is literal */
4785
4786 if (-c == ESC_b) c = '\b';
4787 else if (-c == ESC_X) c = 'X';
4788
4789 /* \Q enters quoting mode */
4790
4791 else if (-c == ESC_Q)
4792 {
4793 inescq = TRUE;
4794 continue;
4795 }
4796
4797 /* Handle escapes that turn into characters */
4798
4799 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4800
4801 /* Escapes that are meta-things. The normal ones just affect the
4802 bit map, but Unicode properties require an XCLASS extended item. */
4803
4804 else
4805 {
4806 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4807 #ifdef SUPPORT_UTF8
4808 if (-c == ESC_p || -c == ESC_P)
4809 {
4810 if (!class_utf8)
4811 {
4812 class_utf8 = TRUE;
4813 length += LINK_SIZE + 2;
4814 }
4815 length += 2;
4816 }
4817 #endif
4818 }
4819 }
4820
4821 /* Check the syntax for POSIX stuff. The bits we actually handle are
4822 checked during the real compile phase. */
4823
4824 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4825 {
4826 ptr++;
4827 class_optcount = 10; /* Make sure > 1 */
4828 }
4829
4830 /* Anything else increments the possible optimization count. We have to
4831 detect ranges here so that we can compute the number of extra ranges for
4832 caseless wide characters when UCP support is available. If there are wide
4833 characters, we are going to have to use an XCLASS, even for single
4834 characters. */
4835
4836 else
4837 {
4838 int d;
4839
4840 GET_ONE_CHARACTER:
4841
4842 #ifdef SUPPORT_UTF8
4843 if (utf8)
4844 {
4845 int extra = 0;
4846 GETCHARLEN(c, ptr, extra);
4847 ptr += extra;
4848 }
4849 else c = *ptr;
4850 #else
4851 c = *ptr;
4852 #endif
4853
4854 /* Come here from handling \ above when it escapes to a char value */
4855
4856 NON_SPECIAL_CHARACTER:
4857 class_optcount++;
4858
4859 d = -1;
4860 if (ptr[1] == '-')
4861 {
4862 uschar const *hyptr = ptr++;
4863 if (ptr[1] == '\\')
4864 {
4865 ptr++;
4866 d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4867 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4868 if (-d == ESC_b) d = '\b'; /* backspace */
4869 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4870 }
4871 else if (ptr[1] != 0 && ptr[1] != ']')
4872 {
4873 ptr++;
4874 #ifdef SUPPORT_UTF8
4875 if (utf8)
4876 {
4877 int extra = 0;
4878 GETCHARLEN(d, ptr, extra);
4879 ptr += extra;
4880 }
4881 else
4882 #endif
4883 d = *ptr;
4884 }
4885 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4886 }
4887
4888 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4889 127 for caseless matching, we will need to use an XCLASS. */
4890
4891 if (d >= 0)
4892 {
4893 class_optcount = 10; /* Ensure > 1 */
4894 if (d < c)
4895 {
4896 *errorptr = ERR8;
4897 goto PCRE_ERROR_RETURN;
4898 }
4899
4900 #ifdef SUPPORT_UTF8
4901 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4902 {
4903 uschar buffer[6];
4904 if (!class_utf8) /* Allow for XCLASS overhead */
4905 {
4906 class_utf8 = TRUE;
4907 length += LINK_SIZE + 2;
4908 }
4909
4910 #ifdef SUPPORT_UCP
4911 /* If we have UCP support, find out how many extra ranges are
4912 needed to map the other case of characters within this range. We
4913 have to mimic the range optimization here, because extending the
4914 range upwards might push d over a boundary that makes it use
4915 another byte in the UTF-8 representation. */
4916
4917 if ((options & PCRE_CASELESS) != 0)
4918 {
4919 int occ, ocd;
4920 int cc = c;
4921 int origd = d;
4922 while (get_othercase_range(&cc, origd, &occ, &ocd))
4923 {
4924 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4925
4926 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4927 { /* if there is overlap, */
4928 c = occ; /* noting that if occ < c */
4929 continue; /* we can't have ocd > d */
4930 } /* because a subrange is */
4931 if (ocd > d && occ <= d + 1) /* always shorter than */
4932 { /* the basic range. */
4933 d = ocd;
4934 continue;
4935 }
4936
4937 /* An extra item is needed */
4938
4939 length += 1 + ord2utf8(occ, buffer) +
4940 ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4941 }
4942 }
4943 #endif /* SUPPORT_UCP */
4944
4945 /* The length of the (possibly extended) range */
4946
4947 length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4948 }
4949 #endif /* SUPPORT_UTF8 */
4950
4951 }
4952
4953 /* We have a single character. There is nothing to be done unless we
4954 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4955 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4956 support. */
4957
4958 else
4959 {
4960 #ifdef SUPPORT_UTF8
4961 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4962 {
4963 uschar buffer[6];
4964 class_optcount = 10; /* Ensure > 1 */
4965 if (!class_utf8) /* Allow for XCLASS overhead */
4966 {
4967 class_utf8 = TRUE;
4968 length += LINK_SIZE + 2;
4969 }
4970 #ifdef SUPPORT_UCP
4971 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4972 (1 + ord2utf8(c, buffer));
4973 #else /* SUPPORT_UCP */
4974 length += 1 + ord2utf8(c, buffer);
4975 #endif /* SUPPORT_UCP */
4976 }
4977 #endif /* SUPPORT_UTF8 */
4978 }
4979 }
4980 }
4981 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4982
4983 if (*ptr == 0) /* Missing terminating ']' */
4984 {
4985 *errorptr = ERR6;
4986 goto PCRE_ERROR_RETURN;
4987 }
4988
4989 /* We can optimize when there was only one optimizable character. Repeats
4990 for positive and negated single one-byte chars are handled by the general
4991 code. Here, we handle repeats for the class opcodes. */
4992
4993 if (class_optcount == 1) length += 3; else
4994 {
4995 length += 33;
4996
4997 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4998 we also need extra for wrapping the whole thing in a sub-pattern. */
4999
5000 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5001 {
5002 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5003 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5004 if ((min == 0 && (max == 1 || max == -1)) ||
5005 (min == 1 && max == -1))
5006 length++;
5007 else length += 5;
5008 if (ptr[1] == '+')
5009 {
5010 ptr++;
5011 length += 2 + 2*LINK_SIZE;
5012 }
5013 else if (ptr[1] == '?') ptr++;
5014 }
5015 }
5016 continue;
5017
5018 /* Brackets may be genuine groups or special things */
5019
5020 case '(':
5021 branch_newextra = 0;
5022 bracket_length = 1 + LINK_SIZE;
5023
5024 /* Handle special forms of bracket, which all start (? */
5025
5026 if (ptr[1] == '?')
5027 {
5028 int set, unset;
5029 int *optset;
5030
5031 switch (c = ptr[2])
5032 {
5033 /* Skip over comments entirely */
5034 case '#':
5035 ptr += 3;
5036 while (*ptr != 0 && *ptr != ')') ptr++;
5037 if (*ptr == 0)
5038 {
5039 *errorptr = ERR18;
5040 goto PCRE_ERROR_RETURN;
5041 }
5042 continue;
5043
5044 /* Non-referencing groups and lookaheads just move the pointer on, and
5045 then behave like a non-special bracket, except that they don't increment
5046 the count of extracting brackets. Ditto for the "once only" bracket,
5047 which is in Perl from version 5.005. */
5048
5049 case ':':
5050 case '=':
5051 case '!':
5052 case '>':
5053 ptr += 2;
5054 break;
5055
5056 /* (?R) specifies a recursive call to the regex, which is an extension
5057 to provide the facility which can be obtained by (?p{perl-code}) in
5058 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5059
5060 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5061 the appropriate numbered brackets. This includes both recursive and
5062 non-recursive calls. (?R) is now synonymous with (?0). */
5063
5064 case 'R':
5065 ptr++;
5066
5067 case '0': case '1': case '2': case '3': case '4':
5068 case '5': case '6': case '7': case '8': case '9':
5069 ptr += 2;
5070 if (c != 'R')
5071 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5072 if (*ptr != ')')
5073 {
5074 *errorptr = ERR29;
5075 goto PCRE_ERROR_RETURN;
5076 }
5077 length += 1 + LINK_SIZE;
5078
5079 /* If this item is quantified, it will get wrapped inside brackets so
5080 as to use the code for quantified brackets. We jump down and use the
5081 code that handles this for real brackets. */
5082
5083 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5084 {
5085 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
5086 duplength = 5 + 3 * LINK_SIZE;
5087 goto HANDLE_QUANTIFIED_BRACKETS;
5088 }
5089 continue;
5090
5091 /* (?C) is an extension which provides "callout" - to provide a bit of
5092 the functionality of the Perl (?{...}) feature. An optional number may
5093 follow (default is zero). */
5094
5095 case 'C':
5096 ptr += 2;
5097 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5098 if (*ptr != ')')
5099 {
5100 *errorptr = ERR39;
5101 goto PCRE_ERROR_RETURN;
5102 }
5103 length += 2 + 2*LINK_SIZE;
5104 continue;
5105
5106 /* Named subpatterns are an extension copied from Python */
5107
5108 case 'P':
5109 ptr += 3;
5110 if (*ptr == '<')
5111 {
5112 const uschar *p; /* Don't amalgamate; some compilers */
5113 p = ++ptr; /* grumble at autoincrement in declaration */
5114 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5115 if (*ptr != '>')
5116 {
5117 *errorptr = ERR42;
5118 goto PCRE_ERROR_RETURN;
5119 }
5120 name_count++;
5121 if (ptr - p > max_name_size) max_name_size = (ptr - p);
5122 break;
5123 }
5124
5125 if (*ptr == '=' || *ptr == '>')
5126 {
5127 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5128 if (*ptr != ')')
5129 {
5130 *errorptr = ERR42;
5131 goto PCRE_ERROR_RETURN;
5132 }
5133 break;
5134 }
5135
5136 /* Unknown character after (?P */
5137
5138 *errorptr = ERR41;
5139 goto PCRE_ERROR_RETURN;
5140
5141 /* Lookbehinds are in Perl from version 5.005 */
5142
5143 case '<':
5144 ptr += 3;
5145 if (*ptr == '=' || *ptr == '!')
5146 {
5147 branch_newextra = 1 + LINK_SIZE;
5148 length += 1 + LINK_SIZE; /* For the first branch */
5149 break;
5150 }
5151 *errorptr = ERR24;
5152 goto PCRE_ERROR_RETURN;
5153
5154 /* Conditionals are in Perl from version 5.005. The bracket must either
5155 be followed by a number (for bracket reference) or by an assertion
5156 group, or (a PCRE extension) by 'R' for a recursion test. */
5157
5158 case '(':
5159 if (ptr[3] == 'R' && ptr[4] == ')')
5160 {
5161 ptr += 4;
5162 length += 3;
5163 }
5164 else if ((digitab[ptr[3]] & ctype_digit) != 0)
5165 {
5166 ptr += 4;
5167 length += 3;
5168 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5169 if (*ptr != ')')
5170 {
5171 *errorptr = ERR26;
5172 goto PCRE_ERROR_RETURN;
5173 }
5174 }
5175 else /* An assertion must follow */
5176 {
5177 ptr++; /* Can treat like ':' as far as spacing is concerned */
5178 if (ptr[2] != '?' ||
5179 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5180 {
5181 ptr += 2; /* To get right offset in message */
5182 *errorptr = ERR28;
5183 goto PCRE_ERROR_RETURN;
5184 }
5185 }
5186 break;
5187
5188 /* Else loop checking valid options until ) is met. Anything else is an
5189 error. If we are without any brackets, i.e. at top level, the settings
5190 act as if specified in the options, so massage the options immediately.
5191 This is for backward compatibility with Perl 5.004. */
5192
5193 default:
5194 set = unset = 0;
5195 optset = &set;
5196 ptr += 2;
5197
5198 for (;; ptr++)
5199 {
5200 c = *ptr;
5201 switch (c)
5202 {
5203 case 'i':
5204 *optset |= PCRE_CASELESS;
5205 continue;
5206
5207 case 'm':
5208 *optset |= PCRE_MULTILINE;
5209 continue;
5210
5211 case 's':
5212 *optset |= PCRE_DOTALL;
5213 continue;
5214
5215 case 'x':
5216 *optset |= PCRE_EXTENDED;
5217 continue;
5218
5219 case 'X':
5220 *optset |= PCRE_EXTRA;
5221 continue;
5222
5223 case 'U':
5224 *optset |= PCRE_UNGREEDY;
5225 continue;
5226
5227 case '-':
5228 optset = &unset;
5229 continue;
5230
5231 /* A termination by ')' indicates an options-setting-only item; if
5232 this is at the very start of the pattern (indicated by item_count
5233 being zero), we use it to set the global options. This is helpful
5234 when analyzing the pattern for first characters, etc. Otherwise
5235 nothing is done here and it is handled during the compiling
5236 process.
5237
5238 [Historical note: Up to Perl 5.8, options settings at top level
5239 were always global settings, wherever they appeared in the pattern.
5240 That is, they were equivalent to an external setting. From 5.8
5241 onwards, they apply only to what follows (which is what you might
5242 expect).] */
5243
5244 case ')':
5245 if (item_count == 0)
5246 {
5247 options = (options | set) & (~unset);
5248 set = unset = 0; /* To save length */
5249 item_count--; /* To allow for several */
5250 }
5251
5252 /* Fall through */
5253
5254 /* A termination by ':' indicates the start of a nested group with
5255 the given options set. This is again handled at compile time, but
5256 we must allow for compiled space if any of the ims options are
5257 set. We also have to allow for resetting space at the end of
5258 the group, which is why 4 is added to the length and not just 2.
5259 If there are several changes of options within the same group, this
5260 will lead to an over-estimate on the length, but this shouldn't
5261 matter very much. We also have to allow for resetting options at
5262 the start of any alternations, which we do by setting
5263 branch_newextra to 2. Finally, we record whether the case-dependent
5264 flag ever changes within the regex. This is used by the "required
5265 character" code. */
5266
5267 case ':':
5268 if (((set|unset) & PCRE_IMS) != 0)
5269 {
5270 length += 4;
5271 branch_newextra = 2;
5272 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5273 }
5274 goto END_OPTIONS;
5275
5276 /* Unrecognized option character */
5277
5278 default:
5279 *errorptr = ERR12;
5280 goto PCRE_ERROR_RETURN;
5281 }
5282 }
5283
5284 /* If we hit a closing bracket, that's it - this is a freestanding
5285 option-setting. We need to ensure that branch_extra is updated if
5286 necessary. The only values branch_newextra can have here are 0 or 2.
5287 If the value is 2, then branch_extra must either be 2 or 5, depending
5288 on whether this is a lookbehind group or not. */
5289
5290 END_OPTIONS:
5291 if (c == ')')
5292 {
5293 if (branch_newextra == 2 &&
5294 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5295 branch_extra += branch_newextra;
5296 continue;
5297 }
5298
5299 /* If options were terminated by ':' control comes here. Fall through
5300 to handle the group below. */
5301 }
5302 }
5303
5304 /* Extracting brackets must be counted so we can process escapes in a
5305 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5306 need an additional 3 bytes of store per extracting bracket. However, if
5307 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5308 must leave the count alone (it will always be zero). */
5309
5310 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5311 {
5312 bracount++;
5313 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5314 }
5315
5316 /* Save length for computing whole length at end if there's a repeat that
5317 requires duplication of the group. Also save the current value of
5318 branch_extra, and start the new group with the new value. If non-zero, this
5319 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
5320
5321 if (brastackptr >= sizeof(brastack)/sizeof(int))
5322 {
5323 *errorptr = ERR19;
5324 goto PCRE_ERROR_RETURN;
5325 }
5326
5327 bralenstack[brastackptr] = branch_extra;
5328 branch_extra = branch_newextra;
5329
5330 brastack[brastackptr++] = length;
5331 length += bracket_length;
5332 continue;
5333
5334 /* Handle ket. Look for subsequent max/min; for certain sets of values we
5335 have to replicate this bracket up to that many times. If brastackptr is
5336 0 this is an unmatched bracket which will generate an error, but take care
5337 not to try to access brastack[-1] when computing the length and restoring
5338 the branch_extra value. */
5339
5340 case ')':
5341 length += 1 + LINK_SIZE;
5342 if (brastackptr > 0)
5343 {
5344 duplength = length - brastack[--brastackptr];
5345 branch_extra = bralenstack[brastackptr];
5346 }
5347 else duplength = 0;
5348
5349 /* The following code is also used when a recursion such as (?3) is
5350 followed by a quantifier, because in that case, it has to be wrapped inside
5351 brackets so that the quantifier works. The value of duplength must be
5352 set before arrival. */
5353
5354 HANDLE_QUANTIFIED_BRACKETS:
5355
5356 /* Leave ptr at the final char; for read_repeat_counts this happens
5357 automatically; for the others we need an increment. */
5358
5359 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5360 {
5361 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5362 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5363 }
5364 else if (c == '*') { min = 0; max = -1; ptr++; }
5365 else if (c == '+') { min = 1; max = -1; ptr++; }
5366 else if (c == '?') { min = 0; max = 1; ptr++; }
5367 else { min = 1; max = 1; }
5368
5369 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5370 group, and if the maximum is greater than zero, we have to replicate
5371 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5372 bracket set. */
5373
5374 if (min == 0)
5375 {
5376 length++;
5377 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5378 }
5379
5380 /* When the minimum is greater than zero, we have to replicate up to
5381 minval-1 times, with no additions required in the copies. Then, if there
5382 is a limited maximum we have to replicate up to maxval-1 times allowing
5383 for a BRAZERO item before each optional copy and nesting brackets for all
5384 but one of the optional copies. */
5385
5386 else
5387 {
5388 length += (min - 1) * duplength;
5389 if (max > min) /* Need this test as max=-1 means no limit */
5390 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5391 - (2 + 2*LINK_SIZE);
5392 }
5393
5394 /* Allow space for once brackets for "possessive quantifier" */
5395
5396 if (ptr[1] == '+')
5397 {
5398 ptr++;
5399 length += 2 + 2*LINK_SIZE;
5400 }
5401 continue;
5402
5403 /* Non-special character. It won't be space or # in extended mode, so it is
5404 always a genuine character. If we are in a \Q...\E sequence, check for the
5405 end; if not, we have a literal. */
5406
5407 default:
5408 NORMAL_CHAR:
5409
5410 if (inescq && c == '\\' && ptr[1] == 'E')
5411 {
5412 inescq = FALSE;
5413 ptr++;
5414 continue;
5415 }
5416
5417 length += 2; /* For a one-byte character */
5418 lastitemlength = 1; /* Default length of last item for repeats */
5419
5420 /* In UTF-8 mode, check for additional bytes. */
5421
5422 #ifdef SUPPORT_UTF8
5423 if (utf8 && (c & 0xc0) == 0xc0)
5424 {
5425 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5426 { /* because the end is marked */
5427 lastitemlength++; /* by a zero byte. */
5428 length++;
5429 ptr++;
5430 }
5431 }
5432 #endif
5433
5434 continue;
5435 }
5436 }
5437
5438 length += 2 + LINK_SIZE; /* For final KET and END */
5439
5440 if ((options & PCRE_AUTO_CALLOUT) != 0)
5441 length += 2 + 2*LINK_SIZE; /* For final callout */
5442
5443 if (length > MAX_PATTERN_SIZE)
5444 {
5445 *errorptr = ERR20;
5446 return NULL;
5447 }
5448
5449 /* Compute the size of data block needed and get it, either from malloc or
5450 externally provided function. */
5451
5452 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5453 re = (real_pcre *)(pcre_malloc)(size);
5454
5455 if (re == NULL)
5456 {
5457 *errorptr = ERR21;
5458 return NULL;
5459 }
5460
5461 /* Put in the magic number, and save the sizes, options, and character table
5462 pointer. NULL is used for the default character tables. The nullpad field is at
5463 the end; it's there to help in the case when a regex compiled on a system with
5464 4-byte pointers is run on another with 8-byte pointers. */
5465
5466 re->magic_number = MAGIC_NUMBER;
5467 re->size = size;
5468 re->options = options;
5469 re->dummy1 = re->dummy2 = 0;
5470 re->name_table_offset = sizeof(real_pcre);
5471 re->name_entry_size = max_name_size + 3;
5472 re->name_count = name_count;
5473 re->tables = (tables == pcre_default_tables)? NULL : tables;
5474 re->nullpad = NULL;
5475
5476 /* The starting points of the name/number translation table and of the code are
5477 passed around in the compile data block. */
5478
5479 compile_block.names_found = 0;
5480 compile_block.name_entry_size = max_name_size + 3;
5481 compile_block.name_table = (uschar *)re + re->name_table_offset;
5482 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5483 compile_block.start_code = codestart;
5484 compile_block.start_pattern = (const uschar *)pattern;
5485 compile_block.req_varyopt = 0;
5486 compile_block.nopartial = FALSE;
5487
5488 /* Set up a starting, non-extracting bracket, then compile the expression. On
5489 error, *errorptr will be set non-NULL, so we don't need to look at the result
5490 of the function here. */
5491
5492 ptr = (const uschar *)pattern;
5493 code = (uschar *)codestart;
5494 *code = OP_BRA;
5495 bracount = 0;
5496 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5497 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5498 re->top_bracket = bracount;
5499 re->top_backref = compile_block.top_backref;
5500
5501 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5502
5503 /* If not reached end of pattern on success, there's an excess bracket. */
5504
5505 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5506
5507 /* Fill in the terminating state and check for disastrous overflow, but
5508 if debugging, leave the test till after things are printed out. */
5509
5510 *code++ = OP_END;
5511
5512 #ifndef DEBUG
5513 if (code - codestart > length) *errorptr = ERR23;
5514 #endif
5515
5516 /* Give an error if there's back reference to a non-existent capturing
5517 subpattern. */
5518
5519 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5520
5521 /* Failed to compile, or error while post-processing */
5522
5523 if (*errorptr != NULL)
5524 {
5525 (pcre_free)(re);
5526 PCRE_ERROR_RETURN:
5527 *erroroffset = ptr - (const uschar *)pattern;
5528 return NULL;
5529 }
5530
5531 /* If the anchored option was not passed, set the flag if we can determine that
5532 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5533 as starting with .* when DOTALL is set).
5534
5535 Otherwise, if we know what the first character has to be, save it, because that
5536 speeds up unanchored matches no end. If not, see if we can set the
5537 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5538 start with ^. and also when all branches start with .* for non-DOTALL matches.
5539 */
5540
5541 if ((options & PCRE_ANCHORED) == 0)
5542 {
5543 int temp_options = options;
5544 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5545 re->options |= PCRE_ANCHORED;
5546 else
5547 {
5548 if (firstbyte < 0)
5549 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5550 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5551 {
5552 int ch = firstbyte & 255;
5553 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5554 compile_block.fcc[ch] == ch)? ch : firstbyte;
5555 re->options |= PCRE_FIRSTSET;
5556 }
5557 else if (is_startline(codestart, 0, compile_block.backref_map))
5558 re->options |= PCRE_STARTLINE;
5559 }
5560 }
5561
5562 /* For an anchored pattern, we use the "required byte" only if it follows a
5563 variable length item in the regex. Remove the caseless flag for non-caseable
5564 bytes. */
5565
5566 if (reqbyte >= 0 &&
5567 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5568 {
5569 int ch = reqbyte & 255;
5570 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5571 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5572 re->options |= PCRE_REQCHSET;
5573 }
5574
5575 /* Print out the compiled data for debugging */
5576
5577 #ifdef DEBUG
5578
5579 printf("Length = %d top_bracket = %d top_backref = %d\n",
5580 length, re->top_bracket, re->top_backref);
5581
5582 if (re->options != 0)
5583 {
5584 printf("%s%s%s%s%s%s%s%s%s%s\n",
5585 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5586 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5587 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5588 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5589 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5590 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5591 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5592 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5593 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5594 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5595 }
5596
5597 if ((re->options & PCRE_FIRSTSET) != 0)
5598 {
5599 int ch = re->first_byte & 255;
5600 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5601 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5602 else printf("First char = \\x%02x%s\n", ch, caseless);
5603 }
5604
5605 if ((re->options & PCRE_REQCHSET) != 0)
5606 {
5607 int ch = re->req_byte & 255;
5608 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5609 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5610 else printf("Req char = \\x%02x%s\n", ch, caseless);
5611 }
5612
5613 print_internals(re, stdout);
5614
5615 /* This check is done here in the debugging case so that the code that
5616 was compiled can be seen. */
5617
5618 if (code - codestart > length)
5619 {
5620 *errorptr = ERR23;
5621 (pcre_free)(re);
5622 *erroroffset = ptr - (uschar *)pattern;
5623 return NULL;
5624 }
5625 #endif
5626
5627 return (pcre *)re;
5628 }
5629
5630
5631
5632 /*************************************************
5633 * Match a back-reference *
5634 *************************************************/
5635
5636 /* If a back reference hasn't been set, the length that is passed is greater
5637 than the number of characters left in the string, so the match fails.
5638
5639 Arguments:
5640 offset index into the offset vector
5641 eptr points into the subject
5642 length length to be matched
5643 md points to match data block
5644 ims the ims flags
5645
5646 Returns: TRUE if matched
5647 */
5648
5649 static BOOL
5650 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5651 unsigned long int ims)
5652 {
5653 const uschar *p = md->start_subject + md->offset_vector[offset];
5654
5655 #ifdef DEBUG
5656 if (eptr >= md->end_subject)
5657 printf("matching subject <null>");
5658 else
5659 {
5660 printf("matching subject ");
5661 pchars(eptr, length, TRUE, md);
5662 }
5663 printf(" against backref ");
5664 pchars(p, length, FALSE, md);
5665 printf("\n");
5666 #endif
5667
5668 /* Always fail if not enough characters left */
5669
5670 if (length > md->end_subject - eptr) return FALSE;
5671
5672 /* Separate the caselesss case for speed */
5673
5674 if ((ims & PCRE_CASELESS) != 0)
5675 {
5676 while (length-- > 0)
5677 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
5678 }
5679 else
5680 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5681
5682 return TRUE;
5683 }
5684
5685
#ifdef SUPPORT_UTF8
/*************************************************
*       Match character against an XCLASS        *
*************************************************/

/* Test one character against an extended class, that is, a class that can
match values > 255. It is called from the XCLASS handling in match(). The
class data is a flag byte, optionally followed by a 32-byte bit map for the
values below 256, followed by a list of items ending with XCL_END.

Arguments:
  c           the character
  data        points to the flag byte of the XCLASS data

Returns:      TRUE if character matches, else FALSE
*/

static BOOL
match_xclass(int c, const uschar *data)
{
  int item;
  const BOOL in_class = (*data & XCL_NOT) == 0;   /* Result when c is listed */
  const BOOL has_map = (*data & XCL_MAP) != 0;

  /* Values below 256 are looked up in the bit map when there is one. A miss
  here is not final, because the additional data may hold ranges that start
  below 256. */

  if (has_map && c < 256 && (data[1 + c/8] & (1 << (c&7))) != 0)
    return in_class;                              /* char found */

  /* Step over the flag byte, and over the bit map too if it is present. What
  follows is the list of single characters, ranges, and Unicode properties.
  XCL_PROP and XCL_NOTPROP never occur unless UCP support is compiled. */

  data += has_map? 33 : 1;

  for (item = *data++; item != XCL_END; item = *data++)
    {
    if (item == XCL_RANGE)
      {
      int lo, hi;
      GETCHARINC(lo, data);
      GETCHARINC(hi, data);
      if (c >= lo && c <= hi) return in_class;
      }
    else if (item == XCL_SINGLE)
      {
      int single;
      GETCHARINC(single, data);
      if (c == single) return in_class;
      }

#ifdef SUPPORT_UCP
    else  /* XCL_PROP & XCL_NOTPROP */
      {
      int chartype, othercase;
      int wanted = *data++;
      int category = ucp_findchar(c, &chartype, &othercase);

      /* Values of 128 or more are tested against the category (after
      subtracting 128); smaller values are tested against the character
      type. */

      BOOL has_prop = (wanted >= 128)?
        (wanted - 128 == category) : (wanted == chartype);
      if (has_prop == (item == XCL_PROP)) return in_class;
      }
#endif  /* SUPPORT_UCP */
    }

  return !in_class;                               /* char did not match */
}
#endif
5759
5760
5761 /***************************************************************************
5762 ****************************************************************************
5763 RECURSION IN THE match() FUNCTION
5764
5765 The match() function is highly recursive. Some regular expressions can cause
5766 it to recurse thousands of times. I was writing for Unix, so I just let it
5767 call itself recursively. This uses the stack for saving everything that has
5768 to be saved for a recursive call. On Unix, the stack can be large, and this
5769 works fine.
5770
5771 It turns out that on non-Unix systems there are problems with programs that
5772 use a lot of stack. (This despite the fact that every last chip has oodles
5773 of memory these days, and techniques for extending the stack have been known
5774 for decades.) So....
5775
5776 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5777 calls by keeping local variables that need to be preserved in blocks of memory
5778 obtained from malloc instead of on the stack. Macros are used to
5779 achieve this so that the actual code doesn't look very different to what it
5780 always used to.
5781 ****************************************************************************
5782 ***************************************************************************/
5783
5784
/* These versions of the macros use the stack, as normal. REGISTER lets match()
mark its first two arguments "register", RMATCH is an ordinary recursive call
to match() whose result is stored in rx, and RRETURN is an ordinary return. */

#ifndef NO_RECURSE
#define REGISTER register
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
#define RRETURN(ra) return ra
#else


/* These versions of the macros manage a private stack on the heap. Note
that the rd argument of RMATCH isn't actually used. It's the md argument of
match(), which never changes.

RMATCH gets a new frame from pcre_stack_malloc, records the return point in
the current frame with setjmp(), copies the "arguments" into the new frame,
makes it current, and jumps to HEAP_RECURSE. When the inner level finishes,
RRETURN longjmp()s back; frame is then reloaded from md->thisframe (which
RRETURN set just before jumping) and the result is taken from its Xresult.

If the new frame cannot be obtained, the current level is abandoned with
PCRE_ERROR_NOMEMORY by means of RRETURN, in the same way as any other negative
(error) result, instead of storing through a NULL pointer. */

#define REGISTER

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }

/* RRETURN frees the current frame and makes the previous one current. If there
is a previous frame, the result is left in its Xresult field, the frame is
remembered in md->thisframe, and control goes back to the setjmp() in the
RMATCH that started this level. If there is none (Xprevframe is NULL at the
top level), this is a real return from match(). */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame. Each field
Xname holds the variable that match() refers to as plain "name". */

typedef struct heapframe {
  struct heapframe *Xprevframe;      /* Calling frame; NULL at the top level */

  /* Function arguments that may change */

  const uschar *Xeptr;
  const uschar *Xecode;
  int Xoffset_top;
  unsigned long int Xims;            /* Same type as the ims argument */
  eptrblock *Xeptrb;
  int Xflags;

  /* Function local variables */

  const uschar *Xcallpat;
  const uschar *Xcharptr;
  const uschar *Xdata;
  const uschar *Xnext;
  const uschar *Xpp;
  const uschar *Xprev;
  const uschar *Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xminimize;
  BOOL Xprev_is_word;

  unsigned long int Xoriginal_ims;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_fail_result;
  int Xprop_category;
  int Xprop_chartype;
  int Xprop_othercase;
  int Xprop_test_against;
  int *Xprop_test_variable;
#endif

  int Xctype;
  int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Place to pass back result, and where to jump back to */

  int Xresult;
  jmp_buf Xwhere;

} heapframe;

#endif
5905
5906
5907 /***************************************************************************
5908 ***************************************************************************/
5909
5910
5911
5912 /*************************************************
5913 * Match from current position *
5914 *************************************************/
5915
5916 /* On entry ecode points to the first opcode, and eptr to the first character
5917 in the subject string, while eptrb holds the value of eptr at the start of the
5918 last bracketed group - used for breaking infinite loops matching zero-length
5919 strings. This function is called recursively in many circumstances. Whenever it
5920 returns a negative (error) response, the outer incarnation must also return the
5921 same response.
5922
5923 Performance note: It might be tempting to extract commonly used fields from the
5924 md structure (e.g. utf8, end_subject) into individual variables to improve
5925 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5926 made performance worse.
5927
5928 Arguments:
5929 eptr pointer in subject
5930 ecode position in code
5931 offset_top current top pointer
5932 md pointer to "static" info for the match
5933 ims current /i, /m, and /s options
5934 eptrb pointer to chain of blocks containing eptr at start of
5935 brackets - for testing for empty matches
5936 flags can contain
5937 match_condassert - this is an assertion condition
5938 match_isgroup - this is the start of a bracketed group
5939
5940 Returns: MATCH_MATCH if matched ) these values are >= 0
5941 MATCH_NOMATCH if failed to match )
5942 a negative PCRE_ERROR_xxx value if aborted by an error condition
5943 (e.g. stopped by recursion limit)
5944 */
5945
5946 static int
5947 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5948 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5949 int flags)
5950 {
5951 /* These variables do not need to be preserved over recursion in this function,
5952 so they can be ordinary variables in all cases. Mark them with "register"
5953 because they are used a lot in loops. */
5954
5955 register int rrc; /* Returns from recursive calls */
5956 register int i; /* Used for loops not involving calls to RMATCH() */
5957 register int c; /* Character values not kept over RMATCH() calls */
5958
5959 /* When recursion is not being used, all "local" variables that have to be
5960 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5961 heap storage. Set up the top-level frame here; others are obtained from the
5962 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5963
5964 #ifdef NO_RECURSE
5965 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5966 frame->Xprevframe = NULL; /* Marks the top level */
5967
5968 /* Copy in the original argument variables */
5969
5970 frame->Xeptr = eptr;
5971 frame->Xecode = ecode;
5972 frame->Xoffset_top = offset_top;
5973 frame->Xims = ims;
5974 frame->Xeptrb = eptrb;
5975 frame->Xflags = flags;
5976
5977 /* This is where control jumps back to to effect "recursion" */
5978
5979 HEAP_RECURSE:
5980
5981 /* Macros make the argument variables come from the current frame */
5982
5983 #define eptr frame->Xeptr
5984 #define ecode frame->Xecode
5985 #define offset_top frame->Xoffset_top
5986 #define ims frame->Xims
5987 #define eptrb frame->Xeptrb
5988 #define flags frame->Xflags
5989
5990 /* Ditto for the local variables */
5991
5992 #ifdef SUPPORT_UTF8
5993 #define charptr frame->Xcharptr
5994 #endif
5995 #define callpat frame->Xcallpat
5996 #define data frame->Xdata
5997 #define next frame->Xnext
5998 #define pp frame->Xpp
5999 #define prev frame->Xprev
6000 #define saved_eptr frame->Xsaved_eptr
6001
6002 #define new_recursive frame->Xnew_recursive
6003
6004 #define cur_is_word frame->Xcur_is_word
6005 #define condition frame->Xcondition
6006 #define minimize frame->Xminimize
6007 #define prev_is_word frame->Xprev_is_word
6008
6009 #define original_ims frame->Xoriginal_ims
6010
6011 #ifdef SUPPORT_UCP
6012 #define prop_type frame->Xprop_type
6013 #define prop_fail_result frame->Xprop_fail_result
6014 #define prop_category frame->Xprop_category
6015 #define prop_chartype frame->Xprop_chartype
6016 #define prop_othercase frame->Xprop_othercase
6017 #define prop_test_against frame->Xprop_test_against
6018 #define prop_test_variable frame->Xprop_test_variable
6019 #endif
6020
6021 #define ctype frame->Xctype
6022 #define fc frame->Xfc
6023 #define fi frame->Xfi
6024 #define length frame->Xlength
6025 #define max frame->Xmax
6026 #define min frame->Xmin
6027 #define number frame->Xnumber
6028 #define offset frame->Xoffset
6029 #define op frame->Xop
6030 #define save_capture_last frame->Xsave_capture_last
6031 #define save_offset1 frame->Xsave_offset1
6032 #define save_offset2 frame->Xsave_offset2
6033 #define save_offset3 frame->Xsave_offset3
6034 #define stacksave frame->Xstacksave
6035
6036 #define newptrb frame->Xnewptrb
6037
6038 /* When recursion is being used, local variables are allocated on the stack and
6039 get preserved during recursion in the normal way. In this environment, fi and
6040 i, and fc and c, can be the same variables. */
6041
6042 #else
6043 #define fi i
6044 #define fc c
6045
6046
6047 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
6048 const uschar *charptr; /* small blocks of the code. My normal */
6049 #endif /* style of coding would have declared */
6050 const uschar *callpat; /* them within each of those blocks. */
6051 const uschar *data; /* However, in order to accommodate the */
6052 const uschar *next; /* version of this code that uses an */
6053 const uschar *pp; /* external "stack" implemented on the */
6054 const uschar *prev; /* heap, it is easier to declare them */
6055 const uschar *saved_eptr; /* all here, so the declarations can */
6056 /* be cut out in a block. The only */
6057 recursion_info new_recursive; /* declarations within blocks below are */
6058 /* for variables that do not have to */
6059 BOOL cur_is_word; /* be preserved over a recursive call */
6060 BOOL condition; /* to RMATCH(). */
6061 BOOL minimize;
6062 BOOL prev_is_word;
6063
6064 unsigned long int original_ims;
6065
6066 #ifdef SUPPORT_UCP
6067 int prop_type;
6068 int prop_fail_result;
6069 int prop_category;
6070 int prop_chartype;
6071 int prop_othercase;
6072 int prop_test_against;
6073 int *prop_test_variable;
6074 #endif
6075
6076 int ctype;
6077 int length;
6078 int max;
6079 int min;
6080 int number;
6081 int offset;
6082 int op;
6083 int save_capture_last;
6084 int save_offset1, save_offset2, save_offset3;
6085 int stacksave[REC_STACK_SAVE_MAX];
6086
6087 eptrblock newptrb;
6088 #endif
6089
6090 /* These statements are here to stop the compiler complaining about uninitialized
6091 variables. */
6092
6093 #ifdef SUPPORT_UCP
6094 prop_fail_result = 0;
6095 prop_test_against = 0;
6096 prop_test_variable = NULL;
6097 #endif
6098
6099 /* OK, now we can get on with the real code of the function. Recursion is
6100 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6101 these just turn into a recursive call to match() and a "return", respectively.
6102 However, RMATCH isn't like a function call because it's quite a complicated
6103 macro. It has to be used in one particular way. This shouldn't, however, impact
6104 performance when true recursion is being used. */
6105
6106 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6107
6108 original_ims = ims; /* Save for resetting on ')' */
6109
6110 /* At the start of a bracketed group, add the current subject pointer to the
6111 stack of such pointers, to be re-instated at the end of the group when we hit
6112 the closing ket. When match() is called in other circumstances, we don't add to
6113 this stack. */
6114
6115 if ((flags & match_isgroup) != 0)
6116 {
6117 newptrb.epb_prev = eptrb;
6118 newptrb.epb_saved_eptr = eptr;
6119 eptrb = &newptrb;
6120 }
6121
6122 /* Now start processing the operations. */
6123
6124 for (;;)
6125 {
6126 op = *ecode;
6127 minimize = FALSE;
6128
6129 /* For partial matching, remember if we ever hit the end of the subject after
6130 matching at least one subject character. */
6131
6132 if (md->partial &&
6133 eptr >= md->end_subject &&
6134 eptr > md->start_match)
6135 md->hitend = TRUE;
6136
6137 /* Opening capturing bracket. If there is space in the offset vector, save
6138 the current subject position in the working slot at the top of the vector. We
6139 mustn't change the current values of the data slot, because they may be set
6140 from a previous iteration of this group, and be referred to by a reference
6141 inside the group.
6142
6143 If the bracket fails to match, we need to restore this value and also the
6144 values of the final offsets, in case they were set by a previous iteration of
6145 the same bracket.
6146
6147 If there isn't enough space in the offset vector, treat this as if it were a
6148 non-capturing bracket. Don't worry about setting the flag for the error case
6149 here; that is handled in the code for KET. */
6150
6151 if (op > OP_BRA)
6152 {
6153 number = op - OP_BRA;
6154
6155 /* For extended extraction brackets (large number), we have to fish out the
6156 number from a dummy opcode at the start. */
6157
6158 if (number > EXTRACT_BASIC_MAX)
6159 number = GET2(ecode, 2+LINK_SIZE);
6160 offset = number << 1;
6161
6162 #ifdef DEBUG
6163 printf("start bracket %d subject=", number);
6164 pchars(eptr, 16, TRUE, md);
6165 printf("\n");
6166 #endif
6167
6168 if (offset < md->offset_max)
6169 {
6170 save_offset1 = md->offset_vector[offset];
6171 save_offset2 = md->offset_vector[offset+1];
6172 save_offset3 = md->offset_vector[md->offset_end - number];
6173 save_capture_last = md->capture_last;
6174
6175 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6176 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6177
6178 do
6179 {
6180 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6181 match_isgroup);
6182 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6183 md->capture_last = save_capture_last;
6184 ecode += GET(ecode, 1);
6185 }
6186 while (*ecode == OP_ALT);
6187
6188 DPRINTF(("bracket %d failed\n", number));
6189
6190 md->offset_vector[offset] = save_offset1;
6191 md->offset_vector[offset+1] = save_offset2;
6192 md->offset_vector[md->offset_end - number] = save_offset3;
6193
6194 RRETURN(MATCH_NOMATCH);
6195 }
6196
6197 /* Insufficient room for saving captured contents */
6198
6199 else op = OP_BRA;
6200 }
6201
6202 /* Other types of node can be handled by a switch */
6203
6204 switch(op)
6205 {
6206 case OP_BRA: /* Non-capturing bracket: optimized */
6207 DPRINTF(("start bracket 0\n"));
6208 do
6209 {
6210 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6211 match_isgroup);
6212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6213 ecode += GET(ecode, 1);
6214 }
6215 while (*ecode == OP_ALT);
6216 DPRINTF(("bracket 0 failed\n"));
6217 RRETURN(MATCH_NOMATCH);
6218
6219 /* Conditional group: compilation checked that there are no more than
6220 two branches. If the condition is false, skipping the first branch takes us
6221 past the end if there is only one branch, but that's OK because that is
6222 exactly what going to the ket would do. */
6223
6224 case OP_COND:
6225 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6226 {
6227 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6228 condition = (offset == CREF_RECURSE * 2)?
6229 (md->recursive != NULL) :
6230 (offset < offset_top && md->offset_vector[offset] >= 0);
6231 RMATCH(rrc, eptr, ecode + (condition?
6232 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6233 offset_top, md, ims, eptrb, match_isgroup);
6234 RRETURN(rrc);
6235 }
6236
6237 /* The condition is an assertion. Call match() to evaluate it - setting
6238 the match_condassert flag causes it to stop at the end of an assertion. */
6239
6240 else
6241 {
6242 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6243 match_condassert | match_isgroup);
6244 if (rrc == MATCH_MATCH)
6245 {
6246 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6247 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6248 }
6249 else if (rrc != MATCH_NOMATCH)
6250 {
6251 RRETURN(rrc); /* Need braces because of following else */
6252 }
6253 else ecode += GET(ecode, 1);
6254 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6255 match_isgroup);
6256 RRETURN(rrc);
6257 }
6258 /* Control never reaches here */
6259
6260 /* Skip over conditional reference or large extraction number data if
6261 encountered. */
6262
6263 case OP_CREF:
6264 case OP_BRANUMBER:
6265 ecode += 3;
6266 break;
6267
6268 /* End of the pattern. If we are in a recursion, we should restore the
6269 offsets appropriately and continue from after the call. */
6270
6271 case OP_END:
6272 if (md->recursive != NULL && md->recursive->group_num == 0)
6273 {
6274 recursion_info *rec = md->recursive;
6275 DPRINTF(("Hit the end in a (?0) recursion\n"));
6276 md->recursive = rec->prevrec;
6277 memmove(md->offset_vector, rec->offset_save,
6278 rec->saved_max * sizeof(int));
6279 md->start_match = rec->save_start;
6280 ims = original_ims;
6281 ecode = rec->after_call;
6282 break;
6283 }
6284
6285 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6286 string - backtracking will then try other alternatives, if any. */
6287
6288 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6289 md->end_match_ptr = eptr; /* Record where we ended */
6290 md->end_offset_top = offset_top; /* and how many extracts were taken */
6291 RRETURN(MATCH_MATCH);
6292
6293 /* Change option settings */
6294
6295 case OP_OPT:
6296 ims = ecode[1];
6297 ecode += 2;
6298 DPRINTF(("ims set to %02lx\n", ims));
6299 break;
6300
6301 /* Assertion brackets. Check the alternative branches in turn - the
6302 matching won't pass the KET for an assertion. If any one branch matches,
6303 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6304 start of each branch to move the current point backwards, so the code at
6305 this level is identical to the lookahead case. */
6306
6307 case OP_ASSERT:
6308 case OP_ASSERTBACK:
6309 do
6310 {
6311 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6312 match_isgroup);
6313 if (rrc == MATCH_MATCH) break;
6314 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6315 ecode += GET(ecode, 1);
6316 }
6317 while (*ecode == OP_ALT);
6318 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6319
6320 /* If checking an assertion for a condition, return MATCH_MATCH. */
6321
6322 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6323
6324 /* Continue from after the assertion, updating the offsets high water
6325 mark, since extracts may have been taken during the assertion. */
6326
6327 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6328 ecode += 1 + LINK_SIZE;
6329 offset_top = md->end_offset_top;
6330 continue;
6331
6332 /* Negative assertion: all branches must fail to match */
6333
6334 case OP_ASSERT_NOT:
6335 case OP_ASSERTBACK_NOT:
6336 do
6337 {
6338 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6339 match_isgroup);
6340 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6341 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6342 ecode += GET(ecode,1);
6343 }
6344 while (*ecode == OP_ALT);
6345
6346 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6347
6348 ecode += 1 + LINK_SIZE;
6349 continue;
6350
6351 /* Move the subject pointer back. This occurs only at the start of
6352 each branch of a lookbehind assertion. If we are too close to the start to
6353 move back, this match function fails. When working with UTF-8 we move
6354 back a number of characters, not bytes. */
6355
6356 case OP_REVERSE:
6357 #ifdef SUPPORT_UTF8
6358 if (md->utf8)
6359 {
6360 c = GET(ecode,1);
6361 for (i = 0; i < c; i++)
6362 {
6363 eptr--;
6364 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6365 BACKCHAR(eptr)
6366 }
6367 }
6368 else
6369 #endif
6370
6371 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6372
6373 {
6374 eptr -= GET(ecode,1);
6375 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6376 }
6377
6378 /* Skip to next op code */
6379
6380 ecode += 1 + LINK_SIZE;
6381 break;
6382
6383 /* The callout item calls an external function, if one is provided, passing
6384 details of the match so far. This is mainly for debugging, though the
6385 function is able to force a failure. */
6386
6387 case OP_CALLOUT:
6388 if (pcre_callout != NULL)
6389 {
6390 pcre_callout_block cb;
6391 cb.version = 1; /* Version 1 of the callout block */
6392 cb.callout_number = ecode[1];
6393 cb.offset_vector = md->offset_vector;
6394 cb.subject = (const char *)md->start_subject;
6395 cb.subject_length = md->end_subject - md->start_subject;
6396 cb.start_match = md->start_match - md->start_subject;
6397 cb.current_position = eptr - md->start_subject;
6398 cb.pattern_position = GET(ecode, 2);
6399 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6400 cb.capture_top = offset_top/2;
6401 cb.capture_last = md->capture_last;
6402 cb.callout_data = md->callout_data;
6403 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6404 if (rrc < 0) RRETURN(rrc);
6405 }
6406 ecode += 2 + 2*LINK_SIZE;
6407 break;
6408
6409 /* Recursion either matches the current regex, or some subexpression. The
6410 offset data is the offset to the starting bracket from the start of the
6411 whole pattern. (This is so that it works from duplicated subpatterns.)
6412
6413 If there are any capturing brackets started but not finished, we have to
6414 save their starting points and reinstate them after the recursion. However,
6415 we don't know how many such there are (offset_top records the completed
6416 total) so we just have to save all the potential data. There may be up to
6417 65535 such values, which is too large to put on the stack, but using malloc
6418 for small numbers seems expensive. As a compromise, the stack is used when
6419 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6420 is used. If the malloc fails, the recursion is not attempted: this call of
6421 match() gives up at once and returns PCRE_ERROR_NOMEMORY instead of
6422 carrying on with only part of the offset data saved.
6423
6424 There are also other values that have to be saved. We use a chained
6425 sequence of blocks that actually live on the stack. Thanks to Robin Houston
6426 for the original version of this logic. */
6427
6428 case OP_RECURSE:
6429 {
6430 callpat = md->start_code + GET(ecode, 1);
6431 new_recursive.group_num = *callpat - OP_BRA;
6432
6433 /* For extended extraction brackets (large number), we have to fish out
6434 the number from a dummy opcode at the start. */
6435
6436 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6437 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6438
6439 /* Add to "recursing stack" */
6440
6441 new_recursive.prevrec = md->recursive;
6442 md->recursive = &new_recursive;
6443
6444 /* Find where to continue from afterwards */
6445
6446 ecode += 1 + LINK_SIZE;
6447 new_recursive.after_call = ecode;
6448
6449 /* Now save the offset data. */
6450
6451 new_recursive.saved_max = md->offset_end;
6452 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6453 new_recursive.offset_save = stacksave;
6454 else
6455 {
6456 new_recursive.offset_save =
6457 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6458 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6459 }
6460
6461 memcpy(new_recursive.offset_save, md->offset_vector,
6462 new_recursive.saved_max * sizeof(int));
6463 new_recursive.save_start = md->start_match;
6464 md->start_match = eptr;
6465
6466 /* OK, now we can do the recursion. For each top-level alternative we
6467 restore the offset and recursion data. */
6468
6469 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6470 do
6471 {
6472 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6473 eptrb, match_isgroup);
6474 if (rrc == MATCH_MATCH)
6475 {
6476 md->recursive = new_recursive.prevrec;
6477 if (new_recursive.offset_save != stacksave)
6478 (pcre_free)(new_recursive.offset_save);
6479 RRETURN(MATCH_MATCH);
6480 }
6481 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6482
6483 md->recursive = &new_recursive;
6484 memcpy(md->offset_vector, new_recursive.offset_save,
6485 new_recursive.saved_max * sizeof(int));
6486 callpat += GET(callpat, 1);
6487 }
6488 while (*callpat == OP_ALT);
6489
6490 DPRINTF(("Recursion didn't match\n"));
6491 md->recursive = new_recursive.prevrec;
6492 if (new_recursive.offset_save != stacksave)
6493 (pcre_free)(new_recursive.offset_save);
6494 RRETURN(MATCH_NOMATCH);
6495 }
6496 /* Control never reaches here */
6497
6498 /* "Once" brackets are like assertion brackets except that after a match,
6499 the point in the subject string is not moved back. Thus there can never be
6500 a move back into the brackets. Friedl calls these "atomic" subpatterns.
6501 Check the alternative branches in turn - the matching won't pass the KET
6502 for this kind of subpattern. If any one branch matches, we carry on as at
6503 the end of a normal bracket, leaving the subject pointer. */
6504
6505 case OP_ONCE:
6506 {
6507 prev = ecode;
6508 saved_eptr = eptr;
6509
6510 do
6511 {
6512 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6513 eptrb, match_isgroup);
6514 if (rrc == MATCH_MATCH) break;
6515 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6516 ecode += GET(ecode,1);
6517 }
6518 while (*ecode == OP_ALT);
6519
6520 /* If we hit the end of the group (which could be repeated), fail */
6521
6522 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6523
6524 /* Continue as from after the assertion, updating the offsets high water
6525 mark, since extracts may have been taken. */
6526
6527 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6528
6529 offset_top = md->end_offset_top;
6530 eptr = md->end_match_ptr;
6531
6532 /* For a non-repeating ket, just continue at this level. This also
6533 happens for a repeating ket if no characters were matched in the group.
6534 This is the forcible breaking of infinite loops as implemented in Perl
6535 5.005. If there is an options reset, it will get obeyed in the normal
6536 course of events. */
6537
6538 if (*ecode == OP_KET || eptr == saved_eptr)
6539 {
6540 ecode += 1+LINK_SIZE;
6541 break;
6542 }
6543
6544 /* The repeating kets try the rest of the pattern or restart from the
6545 preceding bracket, in the appropriate order. We need to reset any options
6546 that changed within the bracket before re-running it, so check the next
6547 opcode. */
6548
6549 if (ecode[1+LINK_SIZE] == OP_OPT)
6550 {
6551 ims = (ims & ~PCRE_IMS) | ecode[4];
6552 DPRINTF(("ims set to %02lx at group repeat\n", ims));
6553 }
6554
6555 if (*ecode == OP_KETRMIN)
6556 {
6557 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6559 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6561 }
6562 else /* OP_KETRMAX */
6563 {
6564 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6565 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6566 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6567 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6568 }
6569 }
6570 RRETURN(MATCH_NOMATCH);
6571
6572 /* An alternation is the end of a branch; scan along to find the end of the
6573 bracketed group and go to there. */
6574
6575 case OP_ALT:
6576 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6577 break;
6578
6579 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6580 that it may occur zero times. It may repeat infinitely, or not at all -
6581 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6582 repeat limits are compiled as a number of copies, with the optional ones
6583 preceded by BRAZERO or BRAMINZERO. */
6584
6585 case OP_BRAZERO:
6586 {
6587 next = ecode+1;
6588 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6590 do next += GET(next,1); while (*next == OP_ALT);
6591 ecode = next + 1+LINK_SIZE;
6592 }
6593 break;
6594
6595 case OP_BRAMINZERO:
6596 {
6597 next = ecode+1;
6598 do next += GET(next,1); while (*next == OP_ALT);
6599 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6600 match_isgroup);
6601 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6602 ecode++;
6603 }
6604 break;
6605
6606 /* End of a group, repeated or non-repeating. If we are at the end of
6607 an assertion "group", stop matching and return MATCH_MATCH, but record the
6608 current high water mark for use by positive assertions. Do this also
6609 for the "once" (non-backing-up) groups. */
6610
6611 case OP_KET:
6612 case OP_KETRMIN:
6613 case OP_KETRMAX:
6614 {
6615 prev = ecode - GET(ecode, 1);
6616 saved_eptr = eptrb->epb_saved_eptr;
6617
6618 /* Back up the stack of bracket start pointers. */
6619
6620 eptrb = eptrb->epb_prev;
6621
6622 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6623 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6624 *prev == OP_ONCE)
6625 {
6626 md->end_match_ptr = eptr; /* For ONCE */
6627 md->end_offset_top = offset_top;
6628 RRETURN(MATCH_MATCH);
6629 }
6630
6631 /* In all other cases except a conditional group we have to check the
6632 group number back at the start and if necessary complete handling an
6633 extraction by setting the offsets and bumping the high water mark. */
6634
6635 if (*prev != OP_COND)
6636 {
6637 number = *prev - OP_BRA;
6638
6639 /* For extended extraction brackets (large number), we have to fish out
6640 the number from a dummy opcode at the start. */
6641
6642 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6643 offset = number << 1;
6644
6645 #ifdef DEBUG
6646 printf("end bracket %d", number);
6647 printf("\n");
6648 #endif
6649
6650 /* Test for a numbered group. This includes groups called as a result
6651 of recursion. Note that whole-pattern recursion is coded as a recurse
6652 into group 0, so it won't be picked up here. Instead, we catch it when
6653 the OP_END is reached. */
6654
6655 if (number > 0)
6656 {
6657 md->capture_last = number;
6658 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6659 {
6660 md->offset_vector[offset] =
6661 md->offset_vector[md->offset_end - number];
6662 md->offset_vector[offset+1] = eptr - md->start_subject;
6663 if (offset_top <= offset) offset_top = offset + 2;
6664 }
6665
6666 /* Handle a recursively called group. Restore the offsets
6667 appropriately and continue from after the call. */
6668
6669 if (md->recursive != NULL && md->recursive->group_num == number)
6670 {
6671 recursion_info *rec = md->recursive;
6672 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6673 md->recursive = rec->prevrec;
6674 md->start_match = rec->save_start;
6675 memcpy(md->offset_vector, rec->offset_save,
6676 rec->saved_max * sizeof(int));
6677 ecode = rec->after_call;
6678 ims = original_ims;
6679 break;
6680 }
6681 }
6682 }
6683
6684 /* Reset the value of the ims flags, in case they got changed during
6685 the group. */
6686
6687 ims = original_ims;
6688 DPRINTF(("ims reset to %02lx\n", ims));
6689
6690 /* For a non-repeating ket, just continue at this level. This also
6691 happens for a repeating ket if no characters were matched in the group.
6692 This is the forcible breaking of infinite loops as implemented in Perl
6693 5.005. If there is an options reset, it will get obeyed in the normal
6694 course of events. */
6695
6696 if (*ecode == OP_KET || eptr == saved_eptr)
6697 {
6698 ecode += 1 + LINK_SIZE;
6699 break;
6700 }
6701
6702 /* The repeating kets try the rest of the pattern or restart from the
6703 preceding bracket, in the appropriate order. */
6704
6705 if (*ecode == OP_KETRMIN)
6706 {
6707 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6708 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6709 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6711 }
6712 else /* OP_KETRMAX */
6713 {
6714 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6715 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6716 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6717 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6718 }
6719 }
6720
6721 RRETURN(MATCH_NOMATCH);
6722
6723 /* Start of subject unless notbol, or after internal newline if multiline */
6724
6725 case OP_CIRC:
6726 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6727 if ((ims & PCRE_MULTILINE) != 0)
6728 {
6729 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6730 RRETURN(MATCH_NOMATCH);
6731 ecode++;
6732 break;
6733 }
6734 /* ... else fall through */
6735
6736 /* Start of subject assertion */
6737
6738 case OP_SOD:
6739 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6740 ecode++;
6741 break;
6742
6743 /* Start of match assertion */
6744
6745 case OP_SOM:
6746 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6747 ecode++;
6748 break;
6749
6750 /* Assert before internal newline if multiline, or before a terminating
6751 newline unless endonly is set, else end of subject unless noteol is set. */
6752
6753 case OP_DOLL:
6754 if ((ims & PCRE_MULTILINE) != 0)
6755 {
6756 if (eptr < md->end_subject)
6757 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6758 else
6759 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6760 ecode++;
6761 break;
6762 }
6763 else
6764 {
6765 if (md->noteol) RRETURN(MATCH_NOMATCH);
6766 if (!md->endonly)
6767 {
6768 if (eptr < md->end_subject - 1 ||
6769 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6770 RRETURN(MATCH_NOMATCH);
6771 ecode++;
6772 break;
6773 }
6774 }
6775 /* ... else fall through */
6776
6777 /* End of subject assertion (\z) */
6778
6779 case OP_EOD:
6780 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6781 ecode++;
6782 break;
6783
6784 /* End of subject or ending \n assertion (\Z) */
6785
6786 case OP_EODN:
6787 if (eptr < md->end_subject - 1 ||
6788 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6789 ecode++;
6790 break;
6791
6792 /* Word boundary assertions */
6793
6794 case OP_NOT_WORD_BOUNDARY:
6795 case OP_WORD_BOUNDARY:
6796 {
6797
6798 /* Find out if the previous and current characters are "word" characters.
6799 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6800 be "non-word" characters. */
6801
6802 #ifdef SUPPORT_UTF8
6803 if (md->utf8)
6804 {
6805 if (eptr == md->start_subject) prev_is_word = FALSE; else
6806 {
6807 const uschar *lastptr = eptr - 1;
6808 while((*lastptr & 0xc0) == 0x80) lastptr--;
6809 GETCHAR(c, lastptr);
6810 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6811 }
6812 if (eptr >= md->end_subject) cur_is_word = FALSE; else
6813 {
6814 GETCHAR(c, eptr);
6815 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6816 }
6817 }
6818 else
6819 #endif
6820
6821 /* More streamlined when not in UTF-8 mode */
6822
6823 {
6824 prev_is_word = (eptr != md->start_subject) &&
6825 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6826 cur_is_word = (eptr < md->end_subject) &&
6827 ((md->ctypes[*eptr] & ctype_word) != 0);
6828 }
6829
6830 /* Now see if the situation is what we want */
6831
6832 if ((*ecode++ == OP_WORD_BOUNDARY)?
6833 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6834 RRETURN(MATCH_NOMATCH);
6835 }
6836 break;
6837
6838 /* Match a single character type; inline for speed */
6839
6840 case OP_ANY:
6841 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6842 RRETURN(MATCH_NOMATCH);
6843 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6844 #ifdef SUPPORT_UTF8
6845 if (md->utf8)
6846 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6847 #endif
6848 ecode++;
6849 break;
6850
6851 /* Match a single byte, even in UTF-8 mode. This opcode really does match
6852 any byte, even newline, independent of the setting of PCRE_DOTALL. */
6853
6854 case OP_ANYBYTE:
6855 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6856 ecode++;
6857 break;
6858
6859 case OP_NOT_DIGIT:
6860 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6861 GETCHARINCTEST(c, eptr);
6862 if (
6863 #ifdef SUPPORT_UTF8
6864 c < 256 &&
6865 #endif
6866 (md->ctypes[c] & ctype_digit) != 0
6867 )
6868 RRETURN(MATCH_NOMATCH);
6869 ecode++;
6870 break;
6871
6872 case OP_DIGIT:
6873 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6874 GETCHARINCTEST(c, eptr);
6875 if (
6876 #ifdef SUPPORT_UTF8
6877 c >= 256 ||
6878 #endif
6879 (md->ctypes[c] & ctype_digit) == 0
6880 )
6881 RRETURN(MATCH_NOMATCH);
6882 ecode++;
6883 break;
6884
6885 case OP_NOT_WHITESPACE:
6886 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6887 GETCHARINCTEST(c, eptr);
6888 if (
6889 #ifdef SUPPORT_UTF8
6890 c < 256 &&
6891 #endif
6892 (md->ctypes[c] & ctype_space) != 0
6893 )
6894 RRETURN(MATCH_NOMATCH);
6895 ecode++;
6896 break;
6897
6898 case OP_WHITESPACE:
6899 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6900 GETCHARINCTEST(c, eptr);
6901 if (
6902 #ifdef SUPPORT_UTF8
6903 c >= 256 ||
6904 #endif
6905 (md->ctypes[c] & ctype_space) == 0
6906 )
6907 RRETURN(MATCH_NOMATCH);
6908 ecode++;
6909 break;
6910
6911 case OP_NOT_WORDCHAR:
6912 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6913 GETCHARINCTEST(c, eptr);
6914 if (
6915 #ifdef SUPPORT_UTF8
6916 c < 256 &&
6917 #endif
6918 (md->ctypes[c] & ctype_word) != 0
6919 )
6920 RRETURN(MATCH_NOMATCH);
6921 ecode++;
6922 break;
6923
6924 case OP_WORDCHAR:
6925 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6926 GETCHARINCTEST(c, eptr);
6927 if (
6928 #ifdef SUPPORT_UTF8
6929 c >= 256 ||
6930 #endif
6931 (md->ctypes[c] & ctype_word) == 0
6932 )
6933 RRETURN(MATCH_NOMATCH);
6934 ecode++;
6935 break;
6936
6937 #ifdef SUPPORT_UCP
6938 /* Check the next character by Unicode property. We will get here only
6939 if the support is in the binary; otherwise a compile-time error occurs. */
6940
6941 case OP_PROP:
6942 case OP_NOTPROP:
6943 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6944 GETCHARINCTEST(c, eptr);
6945 {
6946 int chartype, rqdtype;
6947 int othercase;
6948 int category = ucp_findchar(c, &chartype, &othercase);
6949
6950 rqdtype = *(++ecode);
6951 ecode++;
6952
6953 if (rqdtype >= 128)
6954 {
6955 if ((rqdtype - 128 != category) == (op == OP_PROP))
6956 RRETURN(MATCH_NOMATCH);
6957 }
6958 else
6959 {
6960 if ((rqdtype != chartype) == (op == OP_PROP))
6961 RRETURN(MATCH_NOMATCH);
6962 }
6963 }
6964 break;
6965
6966 /* Match an extended Unicode sequence. We will get here only if the support
6967 is in the binary; otherwise a compile-time error occurs. */
6968
6969 case OP_EXTUNI:
6970 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6971 GETCHARINCTEST(c, eptr);
6972 {
6973 int chartype;
6974 int othercase;
6975 int category = ucp_findchar(c, &chartype, &othercase);
6976 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6977 while (eptr < md->end_subject)
6978 {
6979 int len = 1;
6980 if (!md->utf8) c = *eptr; else
6981 {
6982 GETCHARLEN(c, eptr, len);
6983 }
6984 category = ucp_findchar(c, &chartype, &othercase);
6985 if (category != ucp_M) break;
6986 eptr += len;
6987 }
6988 }
6989 ecode++;
6990 break;
6991 #endif
6992
6993
6994 /* Match a back reference, possibly repeatedly. Look past the end of the
6995 item to see if there is repeat information following. The code is similar
6996 to that for character classes, but repeated for efficiency. Then obey
6997 similar code to character type repeats - written out again for speed.
6998 However, if the referenced string is the empty string, always treat
6999 it as matched, any number of times (otherwise there could be infinite
7000 loops). */
7001
7002 case OP_REF:
7003 {
7004 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
7005 ecode += 3; /* Advance past item */
7006
7007 /* If the reference is unset, set the length to be longer than the amount
7008 of subject left; this ensures that every attempt at a match fails. We
7009 can't just fail here, because of the possibility of quantifiers with zero
7010 minima. */
7011
7012 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7013 md->end_subject - eptr + 1 :
7014 md->offset_vector[offset+1] - md->offset_vector[offset];
7015
7016 /* Set up for repetition, or handle the non-repeated case */
7017
7018 switch (*ecode)
7019 {
7020 case OP_CRSTAR:
7021 case OP_CRMINSTAR:
7022 case OP_CRPLUS:
7023 case OP_CRMINPLUS:
7024 case OP_CRQUERY:
7025 case OP_CRMINQUERY:
7026 c = *ecode++ - OP_CRSTAR;
7027 minimize = (c & 1) != 0;
7028 min = rep_min[c]; /* Pick up values from tables; */
7029 max = rep_max[c]; /* zero for max => infinity */
7030 if (max == 0) max = INT_MAX;
7031 break;
7032
7033 case OP_CRRANGE:
7034 case OP_CRMINRANGE:
7035 minimize = (*ecode == OP_CRMINRANGE);
7036 min = GET2(ecode, 1);
7037 max = GET2(ecode, 3);
7038 if (max == 0) max = INT_MAX;
7039 ecode += 5;
7040 break;
7041
7042 default: /* No repeat follows */
7043 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7044 eptr += length;
7045 continue; /* With the main loop */
7046 }
7047
7048 /* If the length of the reference is zero, just continue with the
7049 main loop. */
7050
7051 if (length == 0) continue;
7052
7053 /* First, ensure the minimum number of matches are present. We get back
7054 the length of the reference string explicitly rather than passing the
7055 address of eptr, so that eptr can be a register variable. */
7056
7057 for (i = 1; i <= min; i++)
7058 {
7059 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7060 eptr += length;
7061 }
7062
7063 /* If min = max, continue at the same level without recursion.
7064 They are not both allowed to be zero. */
7065
7066 if (min == max) continue;
7067
7068 /* If minimizing, keep trying and advancing the pointer */
7069
7070 if (minimize)
7071 {
7072 for (fi = min;; fi++)
7073 {
7074 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7075 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7076 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7077 RRETURN(MATCH_NOMATCH);
7078 eptr += length;
7079 }
7080 /* Control never gets here */
7081 }
7082
7083 /* If maximizing, find the longest string and work backwards */
7084
7085 else
7086 {
7087 pp = eptr;
7088 for (i = min; i < max; i++)
7089 {
7090 if (!match_ref(offset, eptr, length, md, ims)) break;
7091 eptr += length;
7092 }
7093 while (eptr >= pp)
7094 {
7095 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7097 eptr -= length;
7098 }
7099 RRETURN(MATCH_NOMATCH);
7100 }
7101 }
7102 /* Control never gets here */
7103
7104
7105
7106 /* Match a bit-mapped character class, possibly repeatedly. This op code is
7107 used when all the characters in the class have values in the range 0-255,
7108 and either the matching is caseful, or the characters are in the range
7109 0-127 when UTF-8 processing is enabled. The only difference between
7110 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7111 encountered.
7112
7113 First, look past the end of the item to see if there is repeat information
7114 following. Then obey similar code to character type repeats - written out
7115 again for speed. */
7116
7117 case OP_NCLASS:
7118 case OP_CLASS:
7119 {
7120 data = ecode + 1; /* Save for matching */
7121 ecode += 33; /* Advance past the item */
7122
7123 switch (*ecode)
7124 {
7125 case OP_CRSTAR:
7126 case OP_CRMINSTAR:
7127 case OP_CRPLUS:
7128 case OP_CRMINPLUS:
7129 case OP_CRQUERY:
7130 case OP_CRMINQUERY:
7131 c = *ecode++ - OP_CRSTAR;
7132 minimize = (c & 1) != 0;
7133 min = rep_min[c]; /* Pick up values from tables; */
7134 max = rep_max[c]; /* zero for max => infinity */
7135 if (max == 0) max = INT_MAX;
7136 break;
7137
7138 case OP_CRRANGE:
7139 case OP_CRMINRANGE:
7140 minimize = (*ecode == OP_CRMINRANGE);
7141 min = GET2(ecode, 1);
7142 max = GET2(ecode, 3);
7143 if (max == 0) max = INT_MAX;
7144 ecode += 5;
7145 break;
7146
7147 default: /* No repeat follows */
7148 min = max = 1;
7149 break;
7150 }
7151
7152 /* First, ensure the minimum number of matches are present. */
7153
7154 #ifdef SUPPORT_UTF8
7155 /* UTF-8 mode */
7156 if (md->utf8)
7157 {
7158 for (i = 1; i <= min; i++)
7159 {
7160 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7161 GETCHARINC(c, eptr);
7162 if (c > 255)
7163 {
7164 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7165 }
7166 else
7167 {
7168 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7169 }
7170 }
7171 }
7172 else
7173 #endif
7174 /* Not UTF-8 mode */
7175 {
7176 for (i = 1; i <= min; i++)
7177 {
7178 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7179 c = *eptr++;
7180 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7181 }
7182 }
7183
7184 /* If max == min we can continue with the main loop without the
7185 need to recurse. */
7186
7187 if (min == max) continue;
7188
7189 /* If minimizing, keep testing the rest of the expression and advancing
7190 the pointer while it matches the class. */
7191
7192 if (minimize)
7193 {
7194 #ifdef SUPPORT_UTF8
7195 /* UTF-8 mode */
7196 if (md->utf8)
7197 {
7198 for (fi = min;; fi++)
7199 {
7200 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7201 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7202 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7203 GETCHARINC(c, eptr);
7204 if (c > 255)
7205 {
7206 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7207 }
7208 else
7209 {
7210 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7211 }
7212 }
7213 }
7214 else
7215 #endif
7216 /* Not UTF-8 mode */
7217 {
7218 for (fi = min;; fi++)
7219 {
7220 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7221 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7222 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7223 c = *eptr++;
7224 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7225 }
7226 }
7227 /* Control never gets here */
7228 }
7229
7230 /* If maximizing, find the longest possible run, then work backwards. */
7231
7232 else
7233 {
7234 pp = eptr;
7235
7236 #ifdef SUPPORT_UTF8
7237 /* UTF-8 mode */
7238 if (md->utf8)
7239 {
7240 for (i = min; i < max; i++)
7241 {
7242 int len = 1;
7243 if (eptr >= md->end_subject) break;
7244 GETCHARLEN(c, eptr, len);
7245 if (c > 255)
7246 {
7247 if (op == OP_CLASS) break;
7248 }
7249 else
7250 {
7251 if ((data[c/8] & (1 << (c&7))) == 0) break;
7252 }
7253 eptr += len;
7254 }
7255 for (;;)
7256 {
7257 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7258 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7259 if (eptr-- == pp) break; /* Stop if tried at original pos */
7260 BACKCHAR(eptr);
7261 }
7262 }
7263 else
7264 #endif
7265 /* Not UTF-8 mode */
7266 {
7267 for (i = min; i < max; i++)
7268 {
7269 if (eptr >= md->end_subject) break;
7270 c = *eptr;
7271 if ((data[c/8] & (1 << (c&7))) == 0) break;
7272 eptr++;
7273 }
7274 while (eptr >= pp)
7275 {
7276 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7277 eptr--;
7278 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7279 }
7280 }
7281
7282 RRETURN(MATCH_NOMATCH);
7283 }
7284 }
7285 /* Control never gets here */
7286
7287
7288 /* Match an extended character class. This opcode is encountered only
7289 in UTF-8 mode, because that's the only time it is compiled. */
7290
7291 #ifdef SUPPORT_UTF8
7292 case OP_XCLASS:
7293 {
7294 data = ecode + 1 + LINK_SIZE; /* Save for matching */
7295 ecode += GET(ecode, 1); /* Advance past the item */
7296
7297 switch (*ecode)
7298 {
7299 case OP_CRSTAR:
7300 case OP_CRMINSTAR:
7301 case OP_CRPLUS:
7302 case OP_CRMINPLUS:
7303 case OP_CRQUERY:
7304 case OP_CRMINQUERY:
7305 c = *ecode++ - OP_CRSTAR;
7306 minimize = (c & 1) != 0;
7307 min = rep_min[c]; /* Pick up values from tables; */
7308 max = rep_max[c]; /* zero for max => infinity */
7309 if (max == 0) max = INT_MAX;
7310 break;
7311
7312 case OP_CRRANGE:
7313 case OP_CRMINRANGE:
7314 minimize = (*ecode == OP_CRMINRANGE);
7315 min = GET2(ecode, 1);
7316 max = GET2(ecode, 3);
7317 if (max == 0) max = INT_MAX;
7318 ecode += 5;
7319 break;
7320
7321 default: /* No repeat follows */
7322 min = max = 1;
7323 break;
7324 }
7325
7326 /* First, ensure the minimum number of matches are present. */
7327
7328 for (i = 1; i <= min; i++)
7329 {
7330 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7331 GETCHARINC(c, eptr);
7332 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7333 }
7334
7335 /* If max == min we can continue with the main loop without the
7336 need to recurse. */
7337
7338 if (min == max) continue;
7339
7340 /* If minimizing, keep testing the rest of the expression and advancing
7341 the pointer while it matches the class. */
7342
7343 if (minimize)
7344 {
7345 for (fi = min;; fi++)
7346 {
7347 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7348 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7349 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7350 GETCHARINC(c, eptr);
7351 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7352 }
7353 /* Control never gets here */
7354 }
7355
7356 /* If maximizing, find the longest possible run, then work backwards. */
7357
7358 else
7359 {
7360 pp = eptr;
7361 for (i = min; i < max; i++)
7362 {
7363 int len = 1;
7364 if (eptr >= md->end_subject) break;
7365 GETCHARLEN(c, eptr, len);
7366 if (!match_xclass(c, data)) break;
7367 eptr += len;
7368 }
7369 for(;;)
7370 {
7371 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7372 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7373 if (eptr-- == pp) break; /* Stop if tried at original pos */
7374 BACKCHAR(eptr)
7375 }
7376 RRETURN(MATCH_NOMATCH);
7377 }
7378
7379 /* Control never gets here */
7380 }
7381 #endif /* End of XCLASS */
7382
7383 /* Match a single character, casefully */
7384
7385 case OP_CHAR:
7386 #ifdef SUPPORT_UTF8
7387 if (md->utf8)
7388 {
7389 length = 1;
7390 ecode++;
7391 GETCHARLEN(fc, ecode, length);
7392 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7393 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7394 }
7395 else
7396 #endif
7397
7398 /* Non-UTF-8 mode */
7399 {
7400 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7401 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7402 ecode += 2;
7403 }
7404 break;
7405
7406 /* Match a single character, caselessly */
7407
7408 case OP_CHARNC:
7409 #ifdef SUPPORT_UTF8
7410 if (md->utf8)
7411 {
7412 length = 1;
7413 ecode++;
7414 GETCHARLEN(fc, ecode, length);
7415
7416 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7417
7418 /* If the pattern character's value is < 128, we have only one byte, and
7419 can use the fast lookup table. */
7420
7421 if (fc < 128)
7422 {
7423 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7424 }
7425
7426 /* Otherwise we must pick up the subject character */
7427
7428 else
7429 {
7430 int dc;
7431 GETCHARINC(dc, eptr);
7432 ecode += length;
7433
7434 /* If we have Unicode property support, we can use it to test the other
7435 case of the character, if there is one. The result of ucp_findchar() is
7436 < 0 if the char isn't found, and othercase is returned as zero if there
7437 isn't one. */
7438
7439 if (fc != dc)
7440 {
7441 #ifdef SUPPORT_UCP
7442 int chartype;
7443 int othercase;
7444 if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7445 #endif
7446 RRETURN(MATCH_NOMATCH);
7447 }
7448 }
7449 }
7450 else
7451 #endif /* SUPPORT_UTF8 */
7452
7453 /* Non-UTF-8 mode */
7454 {
7455 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7456 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7457 ecode += 2;
7458 }
7459 break;
7460
7461 /* Match a single character repeatedly; different opcodes share code. */
7462
7463 case OP_EXACT:
7464 min = max = GET2(ecode, 1);
7465 ecode += 3;
7466 goto REPEATCHAR;
7467
7468 case OP_UPTO:
7469 case OP_MINUPTO:
7470 min = 0;
7471 max = GET2(ecode, 1);
7472 minimize = *ecode == OP_MINUPTO;
7473 ecode += 3;
7474 goto REPEATCHAR;
7475
7476 case OP_STAR:
7477 case OP_MINSTAR:
7478 case OP_PLUS:
7479 case OP_MINPLUS:
7480 case OP_QUERY:
7481 case OP_MINQUERY:
7482 c = *ecode++ - OP_STAR;
7483 minimize = (c & 1) != 0;
7484 min = rep_min[c]; /* Pick up values from tables; */
7485 max = rep_max[c]; /* zero for max => infinity */
7486 if (max == 0) max = INT_MAX;
7487
7488 /* Common code for all repeated single-character matches. We can give
7489 up quickly if there are fewer than the minimum number of characters left in
7490 the subject. */
7491
7492 REPEATCHAR:
7493 #ifdef SUPPORT_UTF8
7494 if (md->utf8)
7495 {
7496 length = 1;
7497 charptr = ecode;
7498 GETCHARLEN(fc, ecode, length);
7499 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7500 ecode += length;
7501
7502 /* Handle multibyte character matching specially here. There is
7503 support for caseless matching if UCP support is present. */
7504
7505 if (length > 1)
7506 {
7507 int oclength = 0;
7508 uschar occhars[8];
7509
7510 #ifdef SUPPORT_UCP
7511 int othercase;
7512 int chartype;
7513 if ((ims & PCRE_CASELESS) != 0 &&
7514 ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7515 othercase > 0)
7516 oclength = ord2utf8(othercase, occhars);
7517 #endif /* SUPPORT_UCP */
7518
7519 for (i = 1; i <= min; i++)
7520 {
7521 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7522 /* Need braces because of following else */
7523 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7524 else
7525 {
7526 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7527 eptr += oclength;
7528 }
7529 }
7530
7531 if (min == max) continue;
7532
7533 if (minimize)
7534 {
7535 for (fi = min;; fi++)
7536 {
7537 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7538 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7539 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7540 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7541 /* Need braces because of following else */
7542 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7543 else
7544 {
7545 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7546 eptr += oclength;
7547 }
7548 }
7549 /* Control never gets here */
7550 }
7551 else
7552 {
7553 pp = eptr;
7554 for (i = min; i < max; i++)
7555 {
7556 if (eptr > md->end_subject - length) break;
7557 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7558 else if (oclength == 0) break;
7559 else
7560 {
7561 if (memcmp(eptr, occhars, oclength) != 0) break;
7562 eptr += oclength;
7563 }
7564 }
7565 while (eptr >= pp)
7566 {
7567 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7568 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7569 eptr -= length;
7570 }
7571 RRETURN(MATCH_NOMATCH);
7572 }
7573 /* Control never gets here */
7574 }
7575
7576 /* If the length of a UTF-8 character is 1, we fall through here, and
7577 obey the code as for non-UTF-8 characters below, though in this case the
7578 value of fc will always be < 128. */
7579 }
7580 else
7581 #endif /* SUPPORT_UTF8 */
7582
7583 /* When not in UTF-8 mode, load a single-byte character. */
7584 {
7585 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7586 fc = *ecode++;
7587 }
7588
7589 /* The value of fc at this point is always less than 256, though we may or
7590 may not be in UTF-8 mode. The code is duplicated for the caseless and
7591 caseful cases, for speed, since matching characters is likely to be quite
7592 common. First, ensure the minimum number of matches are present. If min =
7593 max, continue at the same level without recursing. Otherwise, if
7594 minimizing, keep trying the rest of the expression and advancing one
7595 matching character if failing, up to the maximum. Alternatively, if
7596 maximizing, find the maximum number of characters and work backwards. */
7597
7598 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7599 max, eptr));
7600
7601 if ((ims & PCRE_CASELESS) != 0)
7602 {
7603 fc = md->lcc[fc];
7604 for (i = 1; i <= min; i++)
7605 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7606 if (min == max) continue;
7607 if (minimize)
7608 {
7609 for (fi = min;; fi++)
7610 {
7611 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7613 if (fi >= max || eptr >= md->end_subject ||
7614 fc != md->lcc[*eptr++])
7615 RRETURN(MATCH_NOMATCH);
7616 }
7617 /* Control never gets here */
7618 }
7619 else
7620 {
7621 pp = eptr;
7622 for (i = min; i < max; i++)
7623 {
7624 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7625 eptr++;
7626 }
7627 while (eptr >= pp)
7628 {
7629 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7630 eptr--;
7631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7632 }
7633 RRETURN(MATCH_NOMATCH);
7634 }
7635 /* Control never gets here */
7636 }
7637
7638 /* Caseful comparisons (single-byte only; multi-byte chars are handled above) */
7639
7640 else
7641 {
7642 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7643 if (min == max) continue;
7644 if (minimize)
7645 {
7646 for (fi = min;; fi++)
7647 {
7648 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7650 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7651 RRETURN(MATCH_NOMATCH);
7652 }
7653 /* Control never gets here */
7654 }
7655 else
7656 {
7657 pp = eptr;
7658 for (i = min; i < max; i++)
7659 {
7660 if (eptr >= md->end_subject || fc != *eptr) break;
7661 eptr++;
7662 }
7663 while (eptr >= pp)
7664 {
7665 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7666 eptr--;
7667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7668 }
7669 RRETURN(MATCH_NOMATCH);
7670 }
7671 }
7672 /* Control never gets here */
7673
7674 /* Match a negated single one-byte character. The pattern character is a
7675 single byte, but the subject character we are checking can be multibyte. */
7676
7677 case OP_NOT:
7678 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7679 ecode++;
7680 GETCHARINCTEST(c, eptr);
7681 if ((ims & PCRE_CASELESS) != 0)
7682 {
7683 #ifdef SUPPORT_UTF8
7684 if (c < 256)
7685 #endif
7686 c = md->lcc[c];
7687 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7688 }
7689 else
7690 {
7691 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7692 }
7693 break;
7694
7695 /* Match a negated single one-byte character repeatedly. This is almost a
7696 repeat of the code for a repeated single character, but I haven't found a
7697 nice way of commoning these up that doesn't require a test of the
7698 positive/negative option for each character match. Maybe that wouldn't add
7699 very much to the time taken, but character matching *is* what this is all
7700 about... */
7701
7702 case OP_NOTEXACT:
7703 min = max = GET2(ecode, 1);
7704 ecode += 3;
7705 goto REPEATNOTCHAR;
7706
7707 case OP_NOTUPTO:
7708 case OP_NOTMINUPTO:
7709 min = 0;
7710 max = GET2(ecode, 1);
7711 minimize = *ecode == OP_NOTMINUPTO;
7712 ecode += 3;
7713 goto REPEATNOTCHAR;
7714
7715 case OP_NOTSTAR:
7716 case OP_NOTMINSTAR:
7717 case OP_NOTPLUS:
7718 case OP_NOTMINPLUS:
7719 case OP_NOTQUERY:
7720 case OP_NOTMINQUERY:
7721 c = *ecode++ - OP_NOTSTAR;
7722 minimize = (c & 1) != 0;
7723 min = rep_min[c]; /* Pick up values from tables; */
7724 max = rep_max[c]; /* zero for max => infinity */
7725 if (max == 0) max = INT_MAX;
7726
7727 /* Common code for all repeated single-byte matches. We can give up quickly
7728 if there are fewer than the minimum number of bytes left in the
7729 subject. */
7730
7731 REPEATNOTCHAR:
7732 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7733 fc = *ecode++;
7734
7735 /* The code is duplicated for the caseless and caseful cases, for speed,
7736 since matching characters is likely to be quite common. First, ensure the
7737 minimum number of matches are present. If min = max, continue at the same
7738 level without recursing. Otherwise, if minimizing, keep trying the rest of
7739 the expression and advancing one matching character if failing, up to the
7740 maximum. Alternatively, if maximizing, find the maximum number of
7741 characters and work backwards. */
7742
7743 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7744 max, eptr));
7745
7746 if ((ims & PCRE_CASELESS) != 0)
7747 {
7748 fc = md->lcc[fc];
7749
7750 #ifdef SUPPORT_UTF8
7751 /* UTF-8 mode */
7752 if (md->utf8)
7753 {
7754 register int d;
7755 for (i = 1; i <= min; i++)
7756 {
7757 GETCHARINC(d, eptr);
7758 if (d < 256) d = md->lcc[d];
7759 if (fc == d) RRETURN(MATCH_NOMATCH);
7760 }
7761 }
7762 else
7763 #endif
7764
7765 /* Not UTF-8 mode */
7766 {
7767 for (i = 1; i <= min; i++)
7768 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7769 }
7770
7771 if (min == max) continue;
7772
7773 if (minimize)
7774 {
7775 #ifdef SUPPORT_UTF8
7776 /* UTF-8 mode */
7777 if (md->utf8)
7778 {
7779 register int d;
7780 for (fi = min;; fi++)
7781 {
7782 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7784 GETCHARINC(d, eptr);
7785 if (d < 256) d = md->lcc[d];
7786 if (fi >= max || eptr >= md->end_subject || fc == d)
7787 RRETURN(MATCH_NOMATCH);
7788 }
7789 }
7790 else
7791 #endif
7792 /* Not UTF-8 mode */
7793 {
7794 for (fi = min;; fi++)
7795 {
7796 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7798 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7799 RRETURN(MATCH_NOMATCH);
7800 }
7801 }
7802 /* Control never gets here */
7803 }
7804
7805 /* Maximize case */
7806
7807 else
7808 {
7809 pp = eptr;
7810
7811 #ifdef SUPPORT_UTF8
7812 /* UTF-8 mode */
7813 if (md->utf8)
7814 {
7815 register int d;
7816 for (i = min; i < max; i++)
7817 {
7818 int len = 1;
7819 if (eptr >= md->end_subject) break;
7820 GETCHARLEN(d, eptr, len);
7821 if (d < 256) d = md->lcc[d];
7822 if (fc == d) break;
7823 eptr += len;
7824 }
7825 for(;;)
7826 {
7827 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7829 if (eptr-- == pp) break; /* Stop if tried at original pos */
7830 BACKCHAR(eptr);
7831 }
7832 }
7833 else
7834 #endif
7835 /* Not UTF-8 mode */
7836 {
7837 for (i = min; i < max; i++)
7838 {
7839 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7840 eptr++;
7841 }
7842 while (eptr >= pp)
7843 {
7844 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7846 eptr--;
7847 }
7848 }
7849
7850 RRETURN(MATCH_NOMATCH);
7851 }
7852 /* Control never gets here */
7853 }
7854
7855 /* Caseful comparisons */
7856
7857 else
7858 {
7859 #ifdef SUPPORT_UTF8
7860 /* UTF-8 mode */
7861 if (md->utf8)
7862 {
7863 register int d;
7864 for (i = 1; i <= min; i++)
7865 {
7866 GETCHARINC(d, eptr);
7867 if (fc == d) RRETURN(MATCH_NOMATCH);
7868 }
7869 }
7870 else
7871 #endif
7872 /* Not UTF-8 mode */
7873 {
7874 for (i = 1; i <= min; i++)
7875 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7876 }
7877
7878 if (min == max) continue;
7879
7880 if (minimize)
7881 {
7882 #ifdef SUPPORT_UTF8
7883 /* UTF-8 mode */
7884 if (md->utf8)
7885 {
7886 register int d;
7887 for (fi = min;; fi++)
7888 {
7889 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7890 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7891 GETCHARINC(d, eptr);
7892 if (fi >= max || eptr >= md->end_subject || fc == d)
7893 RRETURN(MATCH_NOMATCH);
7894 }
7895 }
7896 else
7897 #endif
7898 /* Not UTF-8 mode */
7899 {
7900 for (fi = min;; fi++)
7901 {
7902 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7904 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7905 RRETURN(MATCH_NOMATCH);
7906 }
7907 }
7908 /* Control never gets here */
7909 }
7910
7911 /* Maximize case */
7912
7913 else
7914 {
7915 pp = eptr;
7916
7917 #ifdef SUPPORT_UTF8
7918 /* UTF-8 mode */
7919 if (md->utf8)
7920 {
7921 register int d;
7922 for (i = min; i < max; i++)
7923 {
7924 int len = 1;
7925 if (eptr >= md->end_subject) break;
7926 GETCHARLEN(d, eptr, len);
7927 if (fc == d) break;
7928 eptr += len;
7929 }
7930 for(;;)
7931 {
7932 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7934 if (eptr-- == pp) break; /* Stop if tried at original pos */
7935 BACKCHAR(eptr);
7936 }
7937 }
7938 else
7939 #endif
7940 /* Not UTF-8 mode */
7941 {
7942 for (i = min; i < max; i++)
7943 {
7944 if (eptr >= md->end_subject || fc == *eptr) break;
7945 eptr++;
7946 }
7947 while (eptr >= pp)
7948 {
7949 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7951 eptr--;
7952 }
7953 }
7954
7955 RRETURN(MATCH_NOMATCH);
7956 }
7957 }
7958 /* Control never gets here */
7959
7960 /* Match a single character type repeatedly; several different opcodes
7961 share code. This is very similar to the code for single characters, but we
7962 repeat it in the interests of efficiency. */
7963
7964 case OP_TYPEEXACT:
7965 min = max = GET2(ecode, 1);
7966 minimize = TRUE;
7967 ecode += 3;
7968 goto REPEATTYPE;
7969
7970 case OP_TYPEUPTO:
7971 case OP_TYPEMINUPTO:
7972 min = 0;
7973 max = GET2(ecode, 1);
7974 minimize = *ecode == OP_TYPEMINUPTO;
7975 ecode += 3;
7976 goto REPEATTYPE;
7977
7978 case OP_TYPESTAR:
7979 case OP_TYPEMINSTAR:
7980 case OP_TYPEPLUS:
7981 case OP_TYPEMINPLUS:
7982 case OP_TYPEQUERY:
7983 case OP_TYPEMINQUERY:
7984 c = *ecode++ - OP_TYPESTAR;
7985 minimize = (c & 1) != 0;
7986 min = rep_min[c]; /* Pick up values from tables; */
7987 max = rep_max[c]; /* zero for max => infinity */
7988 if (max == 0) max = INT_MAX;
7989
7990 /* Common code for all repeated single character type matches. Note that
7991 in UTF-8 mode, '.' matches a character of any length, but for the other
7992 character types, the valid characters are all one-byte long. */
7993
7994 REPEATTYPE:
7995 ctype = *ecode++; /* Code for the character type */
7996
7997 #ifdef SUPPORT_UCP
7998 if (ctype == OP_PROP || ctype == OP_NOTPROP)
7999 {
8000 prop_fail_result = ctype == OP_NOTPROP;
8001 prop_type = *ecode++;
8002 if (prop_type >= 128)
8003 {
8004 prop_test_against = prop_type - 128;
8005 prop_test_variable = &prop_category;
8006 }
8007 else
8008 {
8009 prop_test_against = prop_type;
8010 prop_test_variable = &prop_chartype;
8011 }
8012 }
8013 else prop_type = -1;
8014 #endif
8015
8016 /* First, ensure the minimum number of matches are present. Use inline
8017 code for maximizing the speed, and do the type test once at the start
8018 (i.e. keep it out of the loop). Also we can test that there are at least
8019 the minimum number of bytes before we start. This isn't as effective in
8020 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8021 is tidier. Also separate the UCP code, which can be the same for both UTF-8
8022 and single-bytes. */
8023
8024 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8025 if (min > 0)
8026 {
8027 #ifdef SUPPORT_UCP
8028 if (prop_type > 0)
8029 {
8030 for (i = 1; i <= min; i++)
8031 {
8032 GETCHARINC(c, eptr);
8033 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8034 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8035 RRETURN(MATCH_NOMATCH);
8036 }
8037 }
8038
8039 /* Match extended Unicode sequences. We will get here only if the
8040 support is in the binary; otherwise a compile-time error occurs. */
8041
8042 else if (ctype == OP_EXTUNI)
8043 {
8044 for (i = 1; i <= min; i++)
8045 {
8046 GETCHARINCTEST(c, eptr);
8047 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8048 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8049 while (eptr < md->end_subject)
8050 {
8051 int len = 1;
8052 if (!md->utf8) c = *eptr; else
8053 {
8054 GETCHARLEN(c, eptr, len);
8055 }
8056 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8057 if (prop_category != ucp_M) break;
8058 eptr += len;
8059 }
8060 }
8061 }
8062
8063 else
8064 #endif /* SUPPORT_UCP */
8065
8066 /* Handle all other cases when the coding is UTF-8 */
8067
8068 #ifdef SUPPORT_UTF8
8069 if (md->utf8) switch(ctype)
8070 {
8071 case OP_ANY:
8072 for (i = 1; i <= min; i++)
8073 {
8074 if (eptr >= md->end_subject ||
8075 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8076 RRETURN(MATCH_NOMATCH);
8077 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8078 }
8079 break;
8080
8081 case OP_ANYBYTE:
8082 eptr += min;
8083 break;
8084
8085 case OP_NOT_DIGIT:
8086 for (i = 1; i <= min; i++)
8087 {
8088 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8089 GETCHARINC(c, eptr);
8090 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8091 RRETURN(MATCH_NOMATCH);
8092 }
8093 break;
8094
8095 case OP_DIGIT:
8096 for (i = 1; i <= min; i++)
8097 {
8098 if (eptr >= md->end_subject ||
8099 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8100 RRETURN(MATCH_NOMATCH);
8101 /* No need to skip more bytes - we know it's a 1-byte character */
8102 }
8103 break;
8104
8105 case OP_NOT_WHITESPACE:
8106 for (i = 1; i <= min; i++)
8107 {
8108 if (eptr >= md->end_subject ||
8109 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
8110 RRETURN(MATCH_NOMATCH);
8111 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8112 }
8113 break;
8114
8115 case OP_WHITESPACE:
8116 for (i = 1; i <= min; i++)
8117 {
8118 if (eptr >= md->end_subject ||
8119 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8120 RRETURN(MATCH_NOMATCH);
8121 /* No need to skip more bytes - we know it's a 1-byte character */
8122 }
8123 break;
8124
8125 case OP_NOT_WORDCHAR:
8126 for (i = 1; i <= min; i++)
8127 {
8128 if (eptr >= md->end_subject ||
8129 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
8130 RRETURN(MATCH_NOMATCH);
8131 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8132 }
8133 break;
8134
8135 case OP_WORDCHAR:
8136 for (i = 1; i <= min; i++)
8137 {
8138 if (eptr >= md->end_subject ||
8139 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8140 RRETURN(MATCH_NOMATCH);
8141 /* No need to skip more bytes - we know it's a 1-byte character */
8142 }
8143 break;
8144
8145 default:
8146 RRETURN(PCRE_ERROR_INTERNAL);
8147 } /* End switch(ctype) */
8148
8149 else
8150 #endif /* SUPPORT_UTF8 */
8151
8152 /* Code for the non-UTF-8 case for minimum matching of operators other
8153 than OP_PROP and OP_NOTPROP. */
8154
8155 switch(ctype)
8156 {
8157 case OP_ANY:
8158 if ((ims & PCRE_DOTALL) == 0)
8159 {
8160 for (i = 1; i <= min; i++)
8161 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8162 }
8163 else eptr += min;
8164 break;
8165
8166 case OP_ANYBYTE:
8167 eptr += min;
8168 break;
8169
8170 case OP_NOT_DIGIT:
8171 for (i = 1; i <= min; i++)
8172 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8173 break;
8174
8175 case OP_DIGIT:
8176 for (i = 1; i <= min; i++)
8177 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8178 break;
8179
8180 case OP_NOT_WHITESPACE:
8181 for (i = 1; i <= min; i++)
8182 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8183 break;
8184
8185 case OP_WHITESPACE:
8186 for (i = 1; i <= min; i++)
8187 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8188 break;
8189
8190 case OP_NOT_WORDCHAR:
8191 for (i = 1; i <= min; i++)
8192 if ((md->ctypes[*eptr++] & ctype_word) != 0)
8193 RRETURN(MATCH_NOMATCH);
8194 break;
8195
8196 case OP_WORDCHAR:
8197 for (i = 1; i <= min; i++)
8198 if ((md->ctypes[*eptr++] & ctype_word) == 0)
8199 RRETURN(MATCH_NOMATCH);
8200 break;
8201
8202 default:
8203 RRETURN(PCRE_ERROR_INTERNAL);
8204 }
8205 }
8206
8207 /* If min = max, continue at the same level without recursing */
8208
8209 if (min == max) continue;
8210
8211 /* If minimizing, we have to test the rest of the pattern before each
8212 subsequent match. Again, separate the UTF-8 case for speed, and also
8213 separate the UCP cases. */
8214
8215 if (minimize)
8216 {
8217 #ifdef SUPPORT_UCP
8218 if (prop_type > 0)
8219 {
8220 for (fi = min;; fi++)
8221 {
8222 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8223 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8224 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8225 GETCHARINC(c, eptr);
8226 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8227 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8228 RRETURN(MATCH_NOMATCH);
8229 }
8230 }
8231
8232 /* Match extended Unicode sequences. We will get here only if the
8233 support is in the binary; otherwise a compile-time error occurs. */
8234
8235 else if (ctype == OP_EXTUNI)
8236 {
8237 for (fi = min;; fi++)
8238 {
8239 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8240 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8241 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8242 GETCHARINCTEST(c, eptr);
8243 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8244 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8245 while (eptr < md->end_subject)
8246 {
8247 int len = 1;
8248 if (!md->utf8) c = *eptr; else
8249 {
8250 GETCHARLEN(c, eptr, len);
8251 }
8252 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8253 if (prop_category != ucp_M) break;
8254 eptr += len;
8255 }
8256 }
8257 }
8258
8259 else
8260 #endif /* SUPPORT_UCP */
8261
8262 #ifdef SUPPORT_UTF8
8263 /* UTF-8 mode */
8264 if (md->utf8)
8265 {
8266 for (fi = min;; fi++)
8267 {
8268 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8269 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8270 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8271
8272 GETCHARINC(c, eptr);
8273 switch(ctype)
8274 {
8275 case OP_ANY:
8276 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8277 break;
8278
8279 case OP_ANYBYTE:
8280 break;
8281
8282 case OP_NOT_DIGIT:
8283 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8284 RRETURN(MATCH_NOMATCH);
8285 break;
8286
8287 case OP_DIGIT:
8288 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8289 RRETURN(MATCH_NOMATCH);
8290 break;
8291
8292 case OP_NOT_WHITESPACE:
8293 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8294 RRETURN(MATCH_NOMATCH);
8295 break;
8296
8297 case OP_WHITESPACE:
8298 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8299 RRETURN(MATCH_NOMATCH);
8300 break;
8301
8302 case OP_NOT_WORDCHAR:
8303 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8304 RRETURN(MATCH_NOMATCH);
8305 break;
8306
8307 case OP_WORDCHAR:
8308 if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
8309 RRETURN(MATCH_NOMATCH);
8310 break;
8311
8312 default:
8313 RRETURN(PCRE_ERROR_INTERNAL);
8314 }
8315 }
8316 }
8317 else
8318 #endif
8319 /* Not UTF-8 mode */
8320 {
8321 for (fi = min;; fi++)
8322 {
8323 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8324 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8325 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8326 c = *eptr++;
8327 switch(ctype)
8328 {
8329 case OP_ANY:
8330 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8331 break;
8332
8333 case OP_ANYBYTE:
8334 break;
8335
8336 case OP_NOT_DIGIT:
8337 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8338 break;
8339
8340 case OP_DIGIT:
8341 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8342 break;
8343
8344 case OP_NOT_WHITESPACE:
8345 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8346 break;
8347
8348 case OP_WHITESPACE:
8349 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8350 break;
8351
8352 case OP_NOT_WORDCHAR:
8353 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8354 break;
8355
8356 case OP_WORDCHAR:
8357 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8358 break;
8359
8360 default:
8361 RRETURN(PCRE_ERROR_INTERNAL);
8362 }
8363 }
8364 }
8365 /* Control never gets here */
8366 }
8367
8368 /* If maximizing it is worth using inline code for speed, doing the type
8369 test once at the start (i.e. keep it out of the loop). Again, keep the
8370 UTF-8 and UCP stuff separate. */
8371
8372 else
8373 {
8374 pp = eptr; /* Remember where we started */
8375
8376 #ifdef SUPPORT_UCP
8377 if (prop_type > 0)
8378 {
8379 for (i = min; i < max; i++)
8380 {
8381 int len = 1;
8382 if (eptr >= md->end_subject) break;
8383 GETCHARLEN(c, eptr, len);
8384 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8385 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8386 break;
8387 eptr+= len;
8388 }
8389
8390 /* eptr is now past the end of the maximum run */
8391
8392 for(;;)
8393 {
8394 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8395 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8396 if (eptr-- == pp) break; /* Stop if tried at original pos */
8397 BACKCHAR(eptr);
8398 }
8399 }
8400
8401 /* Match extended Unicode sequences. We will get here only if the
8402 support is in the binary; otherwise a compile-time error occurs. */
8403
8404 else if (ctype == OP_EXTUNI)
8405 {
8406 for (i = min; i < max; i++)
8407 {
8408 if (eptr >= md->end_subject) break;
8409 GETCHARINCTEST(c, eptr);
8410 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8411 if (prop_category == ucp_M) break;
8412 while (eptr < md->end_subject)
8413 {
8414 int len = 1;
8415 if (!md->utf8) c = *eptr; else
8416 {
8417 GETCHARLEN(c, eptr, len);
8418 }
8419 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8420 if (prop_category != ucp_M) break;
8421 eptr += len;
8422 }
8423 }
8424
8425 /* eptr is now past the end of the maximum run */
8426
8427 for(;;)
8428 {
8429 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8431 if (eptr-- == pp) break; /* Stop if tried at original pos */
8432 for (;;) /* Move back over one extended */
8433 {
8434 int len = 1;
8435 BACKCHAR(eptr);
8436 if (!md->utf8) c = *eptr; else
8437 {
8438 GETCHARLEN(c, eptr, len);
8439 }
8440 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8441 if (prop_category != ucp_M) break;
8442 eptr--;
8443 }
8444 }
8445 }
8446
8447 else
8448 #endif /* SUPPORT_UCP */
8449
8450 #ifdef SUPPORT_UTF8
8451 /* UTF-8 mode */
8452
8453 if (md->utf8)
8454 {
8455 switch(ctype)
8456 {
8457 case OP_ANY:
8458
8459 /* Special code is required for UTF8, but when the maximum is unlimited
8460 we don't need it, so we repeat the non-UTF8 code. This is probably
8461 worth it, because .* is quite a common idiom. */
8462
8463 if (max < INT_MAX)
8464 {
8465 if ((ims & PCRE_DOTALL) == 0)
8466 {
8467 for (i = min; i < max; i++)
8468 {
8469 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8470 eptr++;
8471 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8472 }
8473 }
8474 else
8475 {
8476 for (i = min; i < max; i++)
8477 {
8478 eptr++;
8479 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8480 }
8481 }
8482 }
8483
8484 /* Handle unlimited UTF-8 repeat */
8485
8486 else
8487 {
8488 if ((ims & PCRE_DOTALL) == 0)
8489 {
8490 for (i = min; i < max; i++)
8491 {
8492 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8493 eptr++;
8494 }
8495 break;
8496 }
8497 else
8498 {
8499 c = max - min;
8500 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8501 eptr += c;
8502 }
8503 }
8504 break;
8505
8506 /* The byte case is the same as non-UTF8 */
8507
8508 case OP_ANYBYTE:
8509 c = max - min;
8510 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8511 eptr += c;
8512 break;
8513
8514 case OP_NOT_DIGIT:
8515 for (i = min; i < max; i++)
8516 {
8517 int len = 1;
8518 if (eptr >= md->end_subject) break;
8519 GETCHARLEN(c, eptr, len);
8520 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8521 eptr+= len;
8522 }
8523 break;
8524
8525 case OP_DIGIT:
8526 for (i = min; i < max; i++)
8527 {
8528 int len = 1;
8529 if (eptr >= md->end_subject) break;
8530 GETCHARLEN(c, eptr, len);
8531 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8532 eptr+= len;
8533 }
8534 break;
8535
8536 case OP_NOT_WHITESPACE:
8537 for (i = min; i < max; i++)
8538 {
8539 int len = 1;
8540 if (eptr >= md->end_subject) break;
8541 GETCHARLEN(c, eptr, len);
8542 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8543 eptr+= len;
8544 }
8545 break;
8546
8547 case OP_WHITESPACE:
8548 for (i = min; i < max; i++)
8549 {
8550 int len = 1;
8551 if (eptr >= md->end_subject) break;
8552 GETCHARLEN(c, eptr, len);
8553 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8554 eptr+= len;
8555 }
8556 break;
8557
8558 case OP_NOT_WORDCHAR:
8559 for (i = min; i < max; i++)
8560 {
8561 int len = 1;
8562 if (eptr >= md->end_subject) break;
8563 GETCHARLEN(c, eptr, len);
8564 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8565 eptr+= len;
8566 }
8567 break;
8568
8569 case OP_WORDCHAR:
8570 for (i = min; i < max; i++)
8571 {
8572 int len = 1;
8573 if (eptr >= md->end_subject) break;
8574 GETCHARLEN(c, eptr, len);
8575 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8576 eptr+= len;
8577 }
8578 break;
8579
8580 default:
8581 RRETURN(PCRE_ERROR_INTERNAL);
8582 }
8583
8584 /* eptr is now past the end of the maximum run */
8585
8586 for(;;)
8587 {
8588 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8590 if (eptr-- == pp) break; /* Stop if tried at original pos */
8591 BACKCHAR(eptr);
8592 }
8593 }
8594 else
8595 #endif
8596
8597 /* Not UTF-8 mode */
8598 {
8599 switch(ctype)
8600 {
8601 case OP_ANY:
8602 if ((ims & PCRE_DOTALL) == 0)
8603 {
8604 for (i = min; i < max; i++)
8605 {
8606 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8607 eptr++;
8608 }
8609 break;
8610 }
8611 /* For DOTALL case, fall through and treat as \C */
8612
8613 case OP_ANYBYTE:
8614 c = max - min;
8615 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8616 eptr += c;
8617 break;
8618
8619 case OP_NOT_DIGIT:
8620 for (i = min; i < max; i++)
8621 {
8622 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8623 break;
8624 eptr++;
8625 }
8626 break;
8627
8628 case OP_DIGIT:
8629 for (i = min; i < max; i++)
8630 {
8631 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8632 break;
8633 eptr++;
8634 }
8635 break;
8636
8637 case OP_NOT_WHITESPACE:
8638 for (i = min; i < max; i++)
8639 {
8640 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8641 break;
8642 eptr++;
8643 }
8644 break;
8645
8646 case OP_WHITESPACE:
8647 for (i = min; i < max; i++)
8648 {
8649 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8650 break;
8651 eptr++;
8652 }
8653 break;
8654
8655 case OP_NOT_WORDCHAR:
8656 for (i = min; i < max; i++)
8657 {
8658 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8659 break;
8660 eptr++;
8661 }
8662 break;
8663
8664 case OP_WORDCHAR:
8665 for (i = min; i < max; i++)
8666 {
8667 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8668 break;
8669 eptr++;
8670 }
8671 break;
8672
8673 default:
8674 RRETURN(PCRE_ERROR_INTERNAL);
8675 }
8676
8677 /* eptr is now past the end of the maximum run */
8678
8679 while (eptr >= pp)
8680 {
8681 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8682 eptr--;
8683 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8684 }
8685 }
8686
8687 /* Get here if we can't make it match with any permitted repetitions */
8688
8689 RRETURN(MATCH_NOMATCH);
8690 }
8691 /* Control never gets here */
8692
8693 /* There's been some horrible disaster. Since all codes > OP_BRA are
8694 for capturing brackets, and there shouldn't be any gaps between 0 and
8695 OP_BRA, arrival here can only mean there is something seriously wrong
8696 in the code above or the OP_xxx definitions. */
8697
8698 default:
8699 DPRINTF(("Unknown opcode %d\n", *ecode));
8700 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8701 }
8702
8703 /* Do not stick any code in here without much thought; it is assumed
8704 that "continue" in the code above comes out to here to repeat the main
8705 loop. */
8706
8707 } /* End of main loop */
8708 /* Control never reaches here */
8709 }
8710
8711
8712 /***************************************************************************
8713 ****************************************************************************
8714 RECURSION IN THE match() FUNCTION
8715
8716 Undefine all the macros that were defined above to handle this. */
8717
8718 #ifdef NO_RECURSE
8719 #undef eptr
8720 #undef ecode
8721 #undef offset_top
8722 #undef ims
8723 #undef eptrb
8724 #undef flags
8725
8726 #undef callpat
8727 #undef charptr
8728 #undef data
8729 #undef next
8730 #undef pp
8731 #undef prev
8732 #undef saved_eptr
8733
8734 #undef new_recursive
8735
8736 #undef cur_is_word
8737 #undef condition
8738 #undef minimize
8739 #undef prev_is_word
8740
8741 #undef original_ims
8742
8743 #undef ctype
8744 #undef length
8745 #undef max
8746 #undef min
8747 #undef number
8748 #undef offset
8749 #undef op
8750 #undef save_capture_last
8751 #undef save_offset1
8752 #undef save_offset2
8753 #undef save_offset3
8754 #undef stacksave
8755
8756 #undef newptrb
8757
8758 #endif
8759
8760 /* These two are defined as macros in both cases */
8761
8762 #undef fc
8763 #undef fi
8764
8765 /***************************************************************************
8766 ***************************************************************************/
8767
8768
8769
8770 /*************************************************
8771 * Execute a Regular Expression *
8772 *************************************************/
8773
8774 /* This function applies a compiled re to a subject string and picks out
8775 portions of the string if it matches. Two elements in the vector are set for
8776 each substring: the offsets to the start and end of the substring.
8777
8778 Arguments:
8779 argument_re points to the compiled expression
8780 extra_data points to extra data or is NULL
8781 subject points to the subject string
8782 length length of subject string (may contain binary zeros)
8783 start_offset where to start in the subject string
8784 options option bits
8785 offsets points to a vector of ints to be filled in with offsets
8786 offsetcount the number of elements in the vector
8787
8788 Returns: > 0 => success; value is the number of elements filled in
8789 = 0 => success, but offsets is not big enough
8790 -1 => failed to match
8791 < -1 => some kind of unexpected problem
8792 */
8793
8794 EXPORT int
8795 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8796 const char *subject, int length, int start_offset, int options, int *offsets,
8797 int offsetcount)
8798 {
8799 int rc, resetcount, ocount;
8800 int first_byte = -1;
8801 int req_byte = -1;
8802 int req_byte2 = -1;
8803 unsigned long int ims = 0;
8804 BOOL using_temporary_offsets = FALSE;
8805 BOOL anchored;
8806 BOOL startline;
8807 BOOL first_byte_caseless = FALSE;
8808 BOOL req_byte_caseless = FALSE;
8809 match_data match_block;
8810 const uschar *tables;
8811 const uschar *start_bits = NULL;
8812 const uschar *start_match = (const uschar *)subject + start_offset;
8813 const uschar *end_subject;
8814 const uschar *req_byte_ptr = start_match - 1;
8815
8816 pcre_study_data internal_study;
8817 const pcre_study_data *study;
8818
8819 real_pcre internal_re;
8820 const real_pcre *external_re = (const real_pcre *)argument_re;
8821 const real_pcre *re = external_re;
8822
8823 /* Plausibility checks */
8824
8825 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8826 if (re == NULL || subject == NULL ||
8827 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8828 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8829
8830 /* Fish out the optional data from the extra_data structure, first setting
8831 the default values. */
8832
8833 study = NULL;
8834 match_block.match_limit = MATCH_LIMIT;
8835 match_block.callout_data = NULL;
8836
8837 /* The table pointer is always in native byte order. */
8838
8839 tables = external_re->tables;
8840
8841 if (extra_data != NULL)
8842 {
8843 register unsigned int flags = extra_data->flags;
8844 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8845 study = (const pcre_study_data *)extra_data->study_data;
8846 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8847 match_block.match_limit = extra_data->match_limit;
8848 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8849 match_block.callout_data = extra_data->callout_data;
8850 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8851 }
8852
8853 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
8854 is a feature that makes it possible to save compiled regex and re-use them
8855 in other programs later. */
8856
8857 if (tables == NULL) tables = pcre_default_tables;
8858
8859 /* Check that the first field in the block is the magic number. If it is not,
8860 test for a regex that was compiled on a host of opposite endianness. If this is
8861 the case, flipped values are put in internal_re and internal_study if there was
8862 study data too. */
8863
8864 if (re->magic_number != MAGIC_NUMBER)
8865 {
8866 re = try_flipped(re, &internal_re, study, &internal_study);
8867 if (re == NULL) return PCRE_ERROR_BADMAGIC;
8868 if (study != NULL) study = &internal_study;
8869 }
8870
8871 /* Set up other data */
8872
8873 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8874 startline = (re->options & PCRE_STARTLINE) != 0;
8875
8876 /* The code starts after the real_pcre block and the capture name table. */
8877
8878 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8879 re->name_count * re->name_entry_size;
8880
8881 match_block.start_subject = (const uschar *)subject;
8882 match_block.start_offset = start_offset;
8883 match_block.end_subject = match_block.start_subject + length;
8884 end_subject = match_block.end_subject;
8885
8886 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8887 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8888
8889 match_block.notbol = (options & PCRE_NOTBOL) != 0;
8890 match_block.noteol = (options & PCRE_NOTEOL) != 0;
8891 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8892 match_block.partial = (options & PCRE_PARTIAL) != 0;
8893 match_block.hitend = FALSE;
8894
8895 match_block.recursive = NULL; /* No recursion at top level */
8896
8897 match_block.lcc = tables + lcc_offset;
8898 match_block.ctypes = tables + ctypes_offset;
8899
8900 /* Partial matching is supported only for a restricted set of regexes at the
8901 moment. */
8902
8903 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8904 return PCRE_ERROR_BADPARTIAL;
8905
8906 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
8907 back the character offset. */
8908
8909 #ifdef SUPPORT_UTF8
8910 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8911 {
8912 if (valid_utf8((uschar *)subject, length) >= 0)
8913 return PCRE_ERROR_BADUTF8;
8914 if (start_offset > 0 && start_offset < length)
8915 {
8916 int tb = ((uschar *)subject)[start_offset];
8917 if (tb > 127)
8918 {
8919 tb &= 0xc0;
8920 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8921 }
8922 }
8923 }
8924 #endif
8925
8926 /* The ims options can vary during the matching as a result of the presence
8927 of (?ims) items in the pattern. They are kept in a local variable so that
8928 restoring at the exit of a group is easy. */
8929
8930 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8931
8932 /* If the expression has got more back references than the offsets supplied can
8933 hold, we get a temporary chunk of working store to use during the matching.
8934 Otherwise, we can use the vector supplied, rounding down its size to a multiple
8935 of 3. */
8936
8937 ocount = offsetcount - (offsetcount % 3);
8938
8939 if (re->top_backref > 0 && re->top_backref >= ocount/3)
8940 {
8941 ocount = re->top_backref * 3 + 3;
8942 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8943 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8944 using_temporary_offsets = TRUE;
8945 DPRINTF(("Got memory to hold back references\n"));
8946 }
8947 else match_block.offset_vector = offsets;
8948
8949 match_block.offset_end = ocount;
8950 match_block.offset_max = (2*ocount)/3;
8951 match_block.offset_overflow = FALSE;
8952 match_block.capture_last = -1;
8953
8954 /* Compute the minimum number of offsets that we need to reset each time. Doing
8955 this makes a huge difference to execution time when there aren't many brackets
8956 in the pattern. */
8957
8958 resetcount = 2 + re->top_bracket * 2;
8959 if (resetcount > offsetcount) resetcount = ocount;
8960
8961 /* Reset the working variable associated with each extraction. These should
8962 never be used unless previously set, but they get saved and restored, and so we
8963 initialize them to avoid reading uninitialized locations. */
8964
8965 if (match_block.offset_vector != NULL)
8966 {
8967 register int *iptr = match_block.offset_vector + ocount;
8968 register int *iend = iptr - resetcount/2 + 1;
8969 while (--iptr >= iend) *iptr = -1;
8970 }
8971
8972 /* Set up the first character to match, if available. The first_byte value is
8973 never set for an anchored regular expression, but the anchoring may be forced
8974 at run time, so we have to test for anchoring. The first char may be unset for
8975 an unanchored pattern, of course. If there's no first char and the pattern was
8976 studied, there may be a bitmap of possible first characters. */
8977
8978 if (!anchored)
8979 {
8980 if ((re->options & PCRE_FIRSTSET) != 0)
8981 {
8982 first_byte = re->first_byte & 255;
8983 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8984 first_byte = match_block.lcc[first_byte];
8985 }
8986 else
8987 if (!startline && study != NULL &&
8988 (study->options & PCRE_STUDY_MAPPED) != 0)
8989 start_bits = study->start_bits;
8990 }
8991
8992 /* For anchored or unanchored matches, there may be a "last known required
8993 character" set. */
8994
8995 if ((re->options & PCRE_REQCHSET) != 0)
8996 {
8997 req_byte = re->req_byte & 255;
8998 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
8999 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
9000 }
9001
9002 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
9003 the loop runs just once. */
9004
9005 do
9006 {
9007 /* Reset the maximum number of extractions we might see. */
9008
9009 if (match_block.offset_vector != NULL)
9010 {
9011 register int *iptr = match_block.offset_vector;
9012 register int *iend = iptr + resetcount;
9013 while (iptr < iend) *iptr++ = -1;
9014 }
9015
9016 /* Advance to a unique first char if possible */
9017
9018 if (first_byte >= 0)
9019 {
9020 if (first_byte_caseless)
9021 while (start_match < end_subject &&
9022 match_block.lcc[*start_match] != first_byte)
9023 start_match++;
9024 else
9025 while (start_match < end_subject && *start_match != first_byte)
9026 start_match++;
9027 }
9028
9029 /* Or to just after \n for a multiline match if possible */
9030
9031 else if (startline)
9032 {
9033 if (start_match > match_block.start_subject + start_offset)
9034 {
9035 while (start_match < end_subject && start_match[-1] != NEWLINE)
9036 start_match++;
9037 }
9038 }
9039
9040 /* Or to a non-unique first char after study */
9041
9042 else if (start_bits != NULL)
9043 {
9044 while (start_match < end_subject)
9045 {
9046 register unsigned int c = *start_match;
9047 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9048 }
9049 }
9050
9051 #ifdef DEBUG /* Sigh. Some compilers never learn. */
9052 printf(">>>> Match against: ");
9053 pchars(start_match, end_subject - start_match, TRUE, &match_block);
9054 printf("\n");
9055 #endif
9056
9057 /* If req_byte is set, we know that that character must appear in the subject
9058 for the match to succeed. If the first character is set, req_byte must be
9059 later in the subject; otherwise the test starts at the match point. This
9060 optimization can save a huge amount of backtracking in patterns with nested
9061 unlimited repeats that aren't going to match. Writing separate code for
9062 cased/caseless versions makes it go faster, as does using an autoincrement
9063 and backing off on a match.
9064
9065 HOWEVER: when the subject string is very, very long, searching to its end can
9066 take a long time, and give bad performance on quite ordinary patterns. This
9067 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9068 don't do this when the string is sufficiently long.
9069
9070 ALSO: this processing is disabled when partial matching is requested.
9071 */
9072
9073 if (req_byte >= 0 &&
9074 end_subject - start_match < REQ_BYTE_MAX &&
9075 !match_block.partial)
9076 {
9077 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9078
9079 /* We don't need to repeat the search if we haven't yet reached the
9080 place we found it at last time. */
9081
9082 if (p > req_byte_ptr)
9083 {
9084 if (req_byte_caseless)
9085 {
9086 while (p < end_subject)
9087 {
9088 register int pp = *p++;
9089 if (pp == req_byte || pp == req_byte2) { p--; break; }
9090 }
9091 }
9092 else
9093 {
9094 while (p < end_subject)
9095 {
9096 if (*p++ == req_byte) { p--; break; }
9097 }
9098 }
9099
9100 /* If we can't find the required character, break the matching loop */
9101
9102 if (p >= end_subject) break;
9103
9104 /* If we have found the required character, save the point where we
9105 found it, so that we don't search again next time round the loop if
9106 the start hasn't passed this character yet. */
9107
9108 req_byte_ptr = p;
9109 }
9110 }
9111
9112 /* When a match occurs, substrings will be set for all internal extractions;
9113 we just need to set up the whole thing as substring 0 before returning. If
9114 there were too many extractions, set the return code to zero. In the case
9115 where we had to get some local store to hold offsets for backreferences, copy
9116 those back references that we can. In this case there need not be overflow
9117 if certain parts of the pattern were not used. */
9118
9119 match_block.start_match = start_match;
9120 match_block.match_call_count = 0;
9121
9122 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9123 match_isgroup);
9124
9125 if (rc == MATCH_NOMATCH)
9126 {
9127 start_match++;
9128 #ifdef SUPPORT_UTF8
9129 if (match_block.utf8)
9130 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9131 start_match++;
9132 #endif
9133 continue;
9134 }
9135
9136 if (rc != MATCH_MATCH)
9137 {
9138 DPRINTF((">>>> error: returning %d\n", rc));
9139 return rc;
9140 }
9141
9142 /* We have a match! Copy the offset information from temporary store if
9143 necessary */
9144
9145 if (using_temporary_offsets)
9146 {
9147 if (offsetcount >= 4)
9148 {
9149 memcpy(offsets + 2, match_block.offset_vector + 2,
9150 (offsetcount - 2) * sizeof(int));
9151 DPRINTF(("Copied offsets from temporary memory\n"));
9152 }
9153 if (match_block.end_offset_top > offsetcount)
9154 match_block.offset_overflow = TRUE;
9155
9156 DPRINTF(("Freeing temporary memory\n"));
9157 (pcre_free)(match_block.offset_vector);
9158 }
9159
9160 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9161
9162 if (offsetcount < 2) rc = 0; else
9163 {
9164 offsets[0] = start_match - match_block.start_subject;
9165 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9166 }
9167
9168 DPRINTF((">>>> returning %d\n", rc));
9169 return rc;
9170 }
9171
9172 /* This "while" is the end of the "do" above */
9173
9174 while (!anchored && start_match <= end_subject);
9175
9176 if (using_temporary_offsets)
9177 {
9178 DPRINTF(("Freeing temporary memory\n"));
9179 (pcre_free)(match_block.offset_vector);
9180 }
9181
9182 if (match_block.partial && match_block.hitend)
9183 {
9184 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9185 return PCRE_ERROR_PARTIAL;
9186 }
9187 else
9188 {
9189 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9190 return PCRE_ERROR_NOMATCH;
9191 }
9192 }
9193
9194 /* End of pcre.c */