Commit | Line | Data |
---|---|---|
6bf342e1 | 1 | /* $Cambridge: exim/src/src/pcre/pcre_internal.h,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */ |
8ac170f3 PH |
2 | |
3 | /************************************************* | |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | ||
8 | /* PCRE is a library of functions to support regular expressions whose syntax | |
9 | and semantics are as close as possible to those of the Perl 5 language. | |
10 | ||
11 | Written by Philip Hazel | |
aa41d2de | 12 | Copyright (c) 1997-2006 University of Cambridge |
8ac170f3 PH |
13 | |
14 | ----------------------------------------------------------------------------- | |
15 | Redistribution and use in source and binary forms, with or without | |
16 | modification, are permitted provided that the following conditions are met: | |
17 | ||
18 | * Redistributions of source code must retain the above copyright notice, | |
19 | this list of conditions and the following disclaimer. | |
20 | ||
21 | * Redistributions in binary form must reproduce the above copyright | |
22 | notice, this list of conditions and the following disclaimer in the | |
23 | documentation and/or other materials provided with the distribution. | |
24 | ||
25 | * Neither the name of the University of Cambridge nor the names of its | |
26 | contributors may be used to endorse or promote products derived from | |
27 | this software without specific prior written permission. | |
28 | ||
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
39 | POSSIBILITY OF SUCH DAMAGE. | |
40 | ----------------------------------------------------------------------------- | |
41 | */ | |
42 | ||
43 | /* This header contains definitions that are shared between the different | |
44 | modules, but which are not relevant to the exported API. This includes some | |
45 | functions whose names all begin with "_pcre_". */ | |
46 | ||
aa41d2de PH |
47 | #ifndef PCRE_INTERNAL_H |
48 | #define PCRE_INTERNAL_H | |
8ac170f3 PH |
49 | |
50 | /* Define DEBUG to get debugging output on stdout. */ | |
51 | ||
aa41d2de | 52 | #if 0 |
8ac170f3 | 53 | #define DEBUG |
aa41d2de | 54 | #endif |
8ac170f3 PH |
55 | |
56 | /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef | |
57 | inline, and there are *still* stupid compilers about that don't like indented | |
58 | pre-processor statements, or at least there were when I first wrote this. After | |
6bf342e1 | 59 | all, it had only been about 10 years then... |
8ac170f3 | 60 | |
6bf342e1 PH |
61 | It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so |
62 | be absolutely sure we get our version. */ | |
63 | ||
64 | #undef DPRINTF | |
8ac170f3 PH |
65 | #ifdef DEBUG |
66 | #define DPRINTF(p) printf p | |
67 | #else | |
6bf342e1 | 68 | #define DPRINTF(p) /* Nothing */ |
8ac170f3 PH |
69 | #endif |
70 | ||
71 | ||
72 | /* Get the definitions provided by running "configure" */ | |
73 | ||
74 | #include "config.h" | |
75 | ||
76 | /* Standard C headers plus the external interface definition. The only time | |
77 | setjmp and stdarg are used is when NO_RECURSE is set. */ | |
78 | ||
79 | #include <ctype.h> | |
80 | #include <limits.h> | |
81 | #include <setjmp.h> | |
82 | #include <stdarg.h> | |
83 | #include <stddef.h> | |
84 | #include <stdio.h> | |
85 | #include <stdlib.h> | |
86 | #include <string.h> | |
87 | ||
88 | #ifndef PCRE_SPY | |
89 | #define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */ | |
90 | #endif | |
91 | ||
92 | /* We need to have types that specify unsigned 16-bit and 32-bit integers. We | |
93 | cannot determine these outside the compilation (e.g. by running a program as | |
94 | part of "configure") because PCRE is often cross-compiled for use on other | |
95 | systems. Instead we make use of the maximum sizes that are available at | |
96 | preprocessor time in standard C environments. */ | |
97 | ||
98 | #if USHRT_MAX == 65535 | |
99 | typedef unsigned short pcre_uint16; | |
100 | #elif UINT_MAX == 65535 | |
101 | typedef unsigned int pcre_uint16; | |
102 | #else | |
103 | #error Cannot determine a type for 16-bit unsigned integers | |
104 | #endif | |
105 | ||
106 | #if UINT_MAX == 4294967295 | |
107 | typedef unsigned int pcre_uint32; | |
108 | #elif ULONG_MAX == 4294967295 | |
109 | typedef unsigned long int pcre_uint32; | |
110 | #else | |
111 | #error Cannot determine a type for 32-bit unsigned integers | |
112 | #endif | |
113 | ||
114 | /* All character handling must be done as unsigned characters. Otherwise there | |
115 | are problems with top-bit-set characters and functions such as isspace(). | |
116 | However, we leave the interface to the outside world as char *, because that | |
117 | should make things easier for callers. We define a short type for unsigned char | |
118 | to save lots of typing. I tried "uchar", but it causes problems on Digital | |
119 | Unix, where it is defined in sys/types, so use "uschar" instead. */ | |
120 | ||
121 | typedef unsigned char uschar; | |
122 | ||
6bf342e1 PH |
123 | /* This is an unsigned int value that no character can ever have. UTF-8 |
124 | characters only go up to 0x7fffffff (though Unicode doesn't go beyond | |
125 | 0x0010ffff). */ | |
126 | ||
127 | #define NOTACHAR 0xffffffff | |
128 | ||
129 | /* PCRE is able to support several different kinds of newline (CR, LF, CRLF, | |
130 | and "all" at present). The following macros are used to package up testing for | |
131 | newlines. NLBLOCK, PSSTART, and PSEND are defined in the various modules to | |
132 | indicate in which datablock the parameters exist, and what the start/end of | |
133 | string field names are. */ | |
134 | ||
135 | #define NLTYPE_FIXED 0 /* Newline is a fixed length string */ | |
136 | #define NLTYPE_ANY 1 /* Newline is any Unicode line ending */ | |
137 | ||
138 | /* This macro checks for a newline at the given position */ | |
aa41d2de PH |
139 | |
140 | #define IS_NEWLINE(p) \ | |
6bf342e1 PH |
141 | ((NLBLOCK->nltype != NLTYPE_FIXED)? \ |
142 | ((p) < NLBLOCK->PSEND && \ | |
143 | _pcre_is_newline((p), NLBLOCK->PSEND, &(NLBLOCK->nllen), utf8) \ | |
144 | ) \ | |
145 | : \ | |
146 | ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ | |
147 | (p)[0] == NLBLOCK->nl[0] && \ | |
148 | (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \ | |
149 | ) \ | |
150 | ) | |
151 | ||
152 | /* This macro checks for a newline immediately preceding the given position */ | |
153 | ||
154 | #define WAS_NEWLINE(p) \ | |
155 | ((NLBLOCK->nltype != NLTYPE_FIXED)? \ | |
156 | ((p) > NLBLOCK->PSSTART && \ | |
157 | _pcre_was_newline((p), NLBLOCK->PSSTART, &(NLBLOCK->nllen), utf8) \ | |
158 | ) \ | |
159 | : \ | |
160 | ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ | |
161 | (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ | |
162 | (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \ | |
163 | ) \ | |
164 | ) | |
aa41d2de PH |
165 | |
166 | /* When PCRE is compiled as a C++ library, the subject pointer can be replaced | |
167 | with a custom type. This makes it possible, for example, to allow pcre_exec() | |
168 | to process subject strings that are discontinuous by using a smart pointer | |
169 | class. It must always be possible to inspect all of the subject string in | |
170 | pcre_exec() because of the way it backtracks. Two macros are required in the | |
171 | normal case, for sign-unspecified and unsigned char pointers. The former is | |
172 | used for the external interface and appears in pcre.h, which is why its name | |
173 | must begin with PCRE_. */ | |
174 | ||
175 | #ifdef CUSTOM_SUBJECT_PTR | |
176 | #define PCRE_SPTR CUSTOM_SUBJECT_PTR | |
177 | #define USPTR CUSTOM_SUBJECT_PTR | |
178 | #else | |
179 | #define PCRE_SPTR const char * | |
180 | #define USPTR const unsigned char * | |
181 | #endif | |
8ac170f3 | 182 | |
aa41d2de PH |
183 | /* Include the public PCRE header and the definitions of UCP character property |
184 | values. */ | |
8ac170f3 | 185 | |
aa41d2de | 186 | #include "pcre.h" |
8ac170f3 PH |
187 | #include "ucp.h" |
188 | ||
189 | /* When compiling for use with the Virtual Pascal compiler, these functions | |
190 | need to have their names changed. PCRE must be compiled with the -DVPCOMPAT | |
191 | option on the command line. */ | |
192 | ||
193 | #ifdef VPCOMPAT | |
194 | #define strncmp(s1,s2,m) _strncmp(s1,s2,m) | |
195 | #define memcpy(d,s,n) _memcpy(d,s,n) | |
196 | #define memmove(d,s,n) _memmove(d,s,n) | |
197 | #define memset(s,c,n) _memset(s,c,n) | |
198 | #else /* VPCOMPAT */ | |
199 | ||
200 | /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), | |
201 | define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY | |
202 | is set. Otherwise, include an emulating function for those systems that have | |
203 | neither (there are some non-Unix environments where this is the case). This assumes | |
204 | that all calls to memmove are moving strings upwards in store, which is the | |
205 | case in PCRE. */ | |
206 | ||
207 | #if ! HAVE_MEMMOVE | |
208 | #undef memmove /* some systems may have a macro */ | |
209 | #if HAVE_BCOPY | |
210 | #define memmove(a, b, c) bcopy(b, a, c) | |
211 | #else /* HAVE_BCOPY */ | |
/* Fallback memmove() for systems that have neither memmove() nor bcopy().
Copies n bytes from src to dest and returns dest. The direction of the copy is
chosen from the relative position of the two pointers, so overlapping regions
are handled correctly whether the data is moving up or down in store. */

static void *
pcre_memmove(void *dest, const void *src, size_t n)
{
unsigned char *d = (unsigned char *)dest;
const unsigned char *s = (const unsigned char *)src;
size_t i;
if (d > s)
  {
  /* Moving upwards: copy from the top down so that every source byte is read
  before it can be overwritten. */
  d += n;
  s += n;
  for (i = 0; i < n; ++i) *(--d) = *(--s);
  }
else
  {
  /* Moving downwards (or not moving at all): copy from the bottom up. */
  for (i = 0; i < n; ++i) d[i] = s[i];
  }
return dest;
}
221 | #define memmove(a, b, c) pcre_memmove(a, b, c) | |
222 | #endif /* not HAVE_BCOPY */ | |
223 | #endif /* not HAVE_MEMMOVE */ | |
224 | #endif /* not VPCOMPAT */ | |
225 | ||
226 | ||
227 | /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored | |
228 | in big-endian order) by default. These are used, for example, to link from the | |
229 | start of a subpattern to its alternatives and its end. The use of 2 bytes per | |
230 | offset limits the size of the compiled regex to around 64K, which is big enough | |
231 | for almost everybody. However, I received a request for an even bigger limit. | |
232 | For this reason, and also to make the code easier to maintain, the storing and | |
233 | loading of offsets from the byte string is now handled by the macros that are | |
234 | defined here. | |
235 | ||
236 | The macros are controlled by the value of LINK_SIZE. This defaults to 2 in | |
237 | the config.h file, but can be overridden by using -D on the command line. This | |
238 | is automated on Unix systems via the "configure" command. */ | |
239 | ||
240 | #if LINK_SIZE == 2 | |
241 | ||
/* Store d as a 2-byte big-endian link at offset n in the byte vector a. The
vector argument and the whole expansion are parenthesized so that a may safely
be an expression such as "code + 1". */

#define PUT(a,n,d) \
  (((a)[n] = (d) >> 8), \
   ((a)[(n)+1] = (d) & 255))
245 | ||
246 | #define GET(a,n) \ | |
247 | (((a)[n] << 8) | (a)[(n)+1]) | |
248 | ||
249 | #define MAX_PATTERN_SIZE (1 << 16) | |
250 | ||
251 | ||
252 | #elif LINK_SIZE == 3 | |
253 | ||
/* Store d as a 3-byte big-endian link at offset n in the byte vector a. The
vector argument and the whole expansion are parenthesized so that a may safely
be an expression such as "code + 1". */

#define PUT(a,n,d) \
  (((a)[n] = (d) >> 16), \
   ((a)[(n)+1] = (d) >> 8), \
   ((a)[(n)+2] = (d) & 255))
258 | ||
259 | #define GET(a,n) \ | |
260 | (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) | |
261 | ||
262 | #define MAX_PATTERN_SIZE (1 << 24) | |
263 | ||
264 | ||
265 | #elif LINK_SIZE == 4 | |
266 | ||
/* Store d as a 4-byte big-endian link at offset n in the byte vector a. The
vector argument and the whole expansion are parenthesized so that a may safely
be an expression such as "code + 1". */

#define PUT(a,n,d) \
  (((a)[n] = (d) >> 24), \
   ((a)[(n)+1] = (d) >> 16), \
   ((a)[(n)+2] = (d) >> 8), \
   ((a)[(n)+3] = (d) & 255))
272 | ||
273 | #define GET(a,n) \ | |
274 | (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) | |
275 | ||
276 | #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ | |
277 | ||
278 | ||
279 | #else | |
280 | #error LINK_SIZE must be either 2, 3, or 4 | |
281 | #endif | |
282 | ||
283 | ||
284 | /* Convenience macro defined in terms of the others */ | |
285 | ||
286 | #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE | |
287 | ||
288 | ||
289 | /* PCRE uses some other 2-byte quantities that do not change when the size of | |
290 | offsets changes. These are used for repeat counts and for other things such as | |
291 | capturing parenthesis numbers in back references. */ | |
292 | ||
/* Store a 2-byte big-endian value d at offset n in the byte vector a. The
expansion is a single parenthesized comma expression, so it acts as one
statement even when it is the unbraced body of an "if" or a loop. */

#define PUT2(a,n,d) \
  ((a)[n] = (d) >> 8, \
   (a)[(n)+1] = (d) & 255)

/* Load a 2-byte big-endian value from offset n in the byte vector a. */

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

/* As PUT2, then advance the pointer a past the two bytes just written. Here a
must be a modifiable lvalue. */

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
301 | ||
302 | ||
303 | /* When UTF-8 encoding is being used, a character is no longer just a single | |
304 | byte. The macros for character handling generate simple sequences when used in | |
305 | byte-mode, and more complicated ones for UTF-8 characters. */ | |
306 | ||
307 | #ifndef SUPPORT_UTF8 | |
308 | #define GETCHAR(c, eptr) c = *eptr; | |
309 | #define GETCHARTEST(c, eptr) c = *eptr; | |
310 | #define GETCHARINC(c, eptr) c = *eptr++; | |
311 | #define GETCHARINCTEST(c, eptr) c = *eptr++; | |
312 | #define GETCHARLEN(c, eptr, len) c = *eptr; | |
313 | #define BACKCHAR(eptr) | |
314 | ||
315 | #else /* SUPPORT_UTF8 */ | |
316 | ||
317 | /* Get the next UTF-8 character, not advancing the pointer. This is called when | |
318 | we know we are in UTF-8 mode. */ | |
319 | ||
320 | #define GETCHAR(c, eptr) \ | |
321 | c = *eptr; \ | |
6bf342e1 | 322 | if (c >= 0xc0) \ |
8ac170f3 PH |
323 | { \ |
324 | int gcii; \ | |
325 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
326 | int gcss = 6*gcaa; \ | |
327 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
328 | for (gcii = 1; gcii <= gcaa; gcii++) \ | |
329 | { \ | |
330 | gcss -= 6; \ | |
331 | c |= (eptr[gcii] & 0x3f) << gcss; \ | |
332 | } \ | |
333 | } | |
334 | ||
335 | /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the | |
336 | pointer. */ | |
337 | ||
338 | #define GETCHARTEST(c, eptr) \ | |
339 | c = *eptr; \ | |
6bf342e1 | 340 | if (utf8 && c >= 0xc0) \ |
8ac170f3 PH |
341 | { \ |
342 | int gcii; \ | |
343 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
344 | int gcss = 6*gcaa; \ | |
345 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
346 | for (gcii = 1; gcii <= gcaa; gcii++) \ | |
347 | { \ | |
348 | gcss -= 6; \ | |
349 | c |= (eptr[gcii] & 0x3f) << gcss; \ | |
350 | } \ | |
351 | } | |
352 | ||
353 | /* Get the next UTF-8 character, advancing the pointer. This is called when we | |
354 | know we are in UTF-8 mode. */ | |
355 | ||
356 | #define GETCHARINC(c, eptr) \ | |
357 | c = *eptr++; \ | |
6bf342e1 | 358 | if (c >= 0xc0) \ |
8ac170f3 PH |
359 | { \ |
360 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
361 | int gcss = 6*gcaa; \ | |
362 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
363 | while (gcaa-- > 0) \ | |
364 | { \ | |
365 | gcss -= 6; \ | |
366 | c |= (*eptr++ & 0x3f) << gcss; \ | |
367 | } \ | |
368 | } | |
369 | ||
370 | /* Get the next character, testing for UTF-8 mode, and advancing the pointer */ | |
371 | ||
372 | #define GETCHARINCTEST(c, eptr) \ | |
373 | c = *eptr++; \ | |
6bf342e1 | 374 | if (utf8 && c >= 0xc0) \ |
8ac170f3 PH |
375 | { \ |
376 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
377 | int gcss = 6*gcaa; \ | |
378 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
379 | while (gcaa-- > 0) \ | |
380 | { \ | |
381 | gcss -= 6; \ | |
382 | c |= (*eptr++ & 0x3f) << gcss; \ | |
383 | } \ | |
384 | } | |
385 | ||
386 | /* Get the next UTF-8 character, not advancing the pointer, incrementing length | |
387 | if there are extra bytes. This is called when we know we are in UTF-8 mode. */ | |
388 | ||
389 | #define GETCHARLEN(c, eptr, len) \ | |
390 | c = *eptr; \ | |
6bf342e1 | 391 | if (c >= 0xc0) \ |
8ac170f3 PH |
392 | { \ |
393 | int gcii; \ | |
394 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
395 | int gcss = 6*gcaa; \ | |
396 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
397 | for (gcii = 1; gcii <= gcaa; gcii++) \ | |
398 | { \ | |
399 | gcss -= 6; \ | |
400 | c |= (eptr[gcii] & 0x3f) << gcss; \ | |
401 | } \ | |
402 | len += gcaa; \ | |
403 | } | |
404 | ||
405 | /* If the pointer is not at the start of a character, move it back until | |
406 | it is. Called only in UTF-8 mode. */ | |
407 | ||
408 | #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--; | |
409 | ||
410 | #endif | |
411 | ||
412 | ||
413 | /* In case there is no definition of offsetof() provided - though any proper | |
414 | Standard C system should have one. */ | |
415 | ||
416 | #ifndef offsetof | |
417 | #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field)) | |
418 | #endif | |
419 | ||
420 | ||
421 | /* These are the public options that can change during matching. */ | |
422 | ||
423 | #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) | |
424 | ||
aa41d2de PH |
425 | /* Private options flags start at the most significant end of the four bytes. |
426 | The public options defined in pcre.h start at the least significant end. Make | |
427 | sure they don't overlap! The bits are getting a bit scarce now -- when we run | |
428 | out, there is a dummy word in the structure that could be used for the private | |
429 | bits. */ | |
8ac170f3 | 430 | |
aa41d2de | 431 | #define PCRE_NOPARTIAL 0x80000000 /* can't use partial with this regex */ |
8ac170f3 PH |
432 | #define PCRE_FIRSTSET 0x40000000 /* first_byte is set */ |
433 | #define PCRE_REQCHSET 0x20000000 /* req_byte is set */ | |
434 | #define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */ | |
aa41d2de | 435 | #define PCRE_JCHANGED 0x08000000 /* j option changes within regex */ |
8ac170f3 PH |
436 | |
437 | /* Options for the "extra" block produced by pcre_study(). */ | |
438 | ||
439 | #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ | |
440 | ||
441 | /* Masks for identifying the public options that are permitted at compile | |
442 | time, run time, or study time, respectively. */ | |
443 | ||
6bf342e1 PH |
444 | #define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY) |
445 | ||
8ac170f3 PH |
446 | #define PUBLIC_OPTIONS \ |
447 | (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ | |
448 | PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ | |
aa41d2de | 449 | PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ |
6bf342e1 | 450 | PCRE_DUPNAMES|PCRE_NEWLINE_BITS) |
8ac170f3 PH |
451 | |
452 | #define PUBLIC_EXEC_OPTIONS \ | |
453 | (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ | |
6bf342e1 | 454 | PCRE_PARTIAL|PCRE_NEWLINE_BITS) |
8ac170f3 PH |
455 | |
456 | #define PUBLIC_DFA_EXEC_OPTIONS \ | |
457 | (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ | |
6bf342e1 | 458 | PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS) |
8ac170f3 PH |
459 | |
460 | #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ | |
461 | ||
462 | /* Magic number to provide a small check against being handed junk. Also used | |
463 | to detect whether a pattern was compiled on a host of different endianness. */ | |
464 | ||
465 | #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ | |
466 | ||
467 | /* Negative values for the firstchar and reqchar variables */ | |
468 | ||
469 | #define REQ_UNSET (-2) | |
470 | #define REQ_NONE (-1) | |
471 | ||
472 | /* The maximum remaining length of subject we are prepared to search for a | |
473 | req_byte match. */ | |
474 | ||
475 | #define REQ_BYTE_MAX 1000 | |
476 | ||
477 | /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a | |
478 | variable-length repeat, or anything other than literal characters. */ | |
479 | ||
480 | #define REQ_CASELESS 0x0100 /* indicates caselessness */ | |
481 | #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ | |
482 | ||
483 | /* Miscellaneous definitions */ | |
484 | ||
485 | typedef int BOOL; | |
486 | ||
487 | #define FALSE 0 | |
488 | #define TRUE 1 | |
489 | ||
6bf342e1 | 490 | /* Escape items that are just an encoding of a particular data value. */ |
8ac170f3 PH |
491 | |
492 | #ifndef ESC_e | |
493 | #define ESC_e 27 | |
494 | #endif | |
495 | ||
496 | #ifndef ESC_f | |
497 | #define ESC_f '\f' | |
498 | #endif | |
499 | ||
500 | #ifndef ESC_n | |
6bf342e1 | 501 | #define ESC_n '\n' |
8ac170f3 PH |
502 | #endif |
503 | ||
504 | #ifndef ESC_r | |
505 | #define ESC_r '\r' | |
506 | #endif | |
507 | ||
508 | /* We can't officially use ESC_t because it is a POSIX reserved identifier | |
509 | (presumably because of all the others like size_t). */ | |
510 | ||
511 | #ifndef ESC_tee | |
512 | #define ESC_tee '\t' | |
513 | #endif | |
514 | ||
aa41d2de PH |
515 | /* Codes for different types of Unicode property */ |
516 | ||
517 | #define PT_ANY 0 /* Any property - matches all chars */ | |
518 | #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ | |
519 | #define PT_GC 2 /* General characteristic (e.g. L) */ | |
520 | #define PT_PC 3 /* Particular characteristic (e.g. Lu) */ | |
521 | #define PT_SC 4 /* Script (e.g. Han) */ | |
522 | ||
523 | /* Flag bits and data types for the extended class (OP_XCLASS) for classes that | |
524 | contain UTF-8 characters with values greater than 255. */ | |
525 | ||
526 | #define XCL_NOT 0x01 /* Flag: this is a negative class */ | |
527 | #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ | |
528 | ||
529 | #define XCL_END 0 /* Marks end of individual items */ | |
530 | #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ | |
531 | #define XCL_RANGE 2 /* A range (two multibyte chars) follows */ | |
532 | #define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ | |
533 | #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ | |
534 | ||
8ac170f3 PH |
535 | /* These are escaped items that aren't just an encoding of a particular data |
536 | value such as \n. They must have non-zero values, as check_escape() returns | |
537 | their negation. Also, they must appear in the same order as in the opcode | |
538 | definitions below, up to ESC_z. There's a dummy for OP_ANY because it | |
539 | corresponds to "." rather than an escape sequence. The final one must be | |
6bf342e1 PH |
540 | ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc). |
541 | There are two tests in the code for an escape greater than ESC_b and less than | |
542 | ESC_Z to detect the types that may be repeated. These are the types that | |
543 | consume characters. If any new escapes are put in between that don't consume a | |
8ac170f3 PH |
544 | character, that code will have to change. */ |
545 | ||
546 | enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, | |
6bf342e1 PH |
547 | ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_X, ESC_Z, ESC_z, |
548 | ESC_E, ESC_Q, ESC_k, ESC_REF }; | |
549 | ||
8ac170f3 | 550 | |
8ac170f3 PH |
551 | /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets |
552 | that extract substrings. Starting from 1 (i.e. after OP_END), the values up to | |
553 | OP_EOD must correspond in order to the list of escapes immediately above. | |
6bf342e1 PH |
554 | |
555 | To keep stored, compiled patterns compatible, new opcodes should be added | |
556 | immediately before OP_BRA, where (since release 7.0) a gap is left for this | |
557 | purpose. | |
558 | ||
559 | *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions | |
560 | that follow must also be updated to match. There is also a table called | |
561 | "coptable" in pcre_dfa_exec.c that must be updated. */ | |
8ac170f3 PH |
562 | |
563 | enum { | |
564 | OP_END, /* 0 End of pattern */ | |
565 | ||
566 | /* Values corresponding to backslashed metacharacters */ | |
567 | ||
568 | OP_SOD, /* 1 Start of data: \A */ | |
569 | OP_SOM, /* 2 Start of match (subject + offset): \G */ | |
570 | OP_NOT_WORD_BOUNDARY, /* 3 \B */ | |
571 | OP_WORD_BOUNDARY, /* 4 \b */ | |
572 | OP_NOT_DIGIT, /* 5 \D */ | |
573 | OP_DIGIT, /* 6 \d */ | |
574 | OP_NOT_WHITESPACE, /* 7 \S */ | |
575 | OP_WHITESPACE, /* 8 \s */ | |
576 | OP_NOT_WORDCHAR, /* 9 \W */ | |
577 | OP_WORDCHAR, /* 10 \w */ | |
578 | OP_ANY, /* 11 Match any character */ | |
579 | OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */ | |
580 | OP_NOTPROP, /* 13 \P (not Unicode property) */ | |
581 | OP_PROP, /* 14 \p (Unicode property) */ | |
6bf342e1 PH |
582 | OP_ANYNL, /* 15 \R (any newline sequence) */ |
583 | OP_EXTUNI, /* 16 \X (extended Unicode sequence) */ | |
584 | OP_EODN, /* 17 End of data or \n at end of data: \Z. */ | |
585 | OP_EOD, /* 18 End of data: \z */ | |
586 | ||
587 | OP_OPT, /* 19 Set runtime options */ | |
588 | OP_CIRC, /* 20 Start of line - varies with multiline switch */ | |
589 | OP_DOLL, /* 21 End of line - varies with multiline switch */ | |
590 | OP_CHAR, /* 22 Match one character, casefully */ | |
591 | OP_CHARNC, /* 23 Match one character, caselessly */ | |
592 | OP_NOT, /* 24 Match one character, not the following one */ | |
593 | ||
594 | OP_STAR, /* 25 The maximizing and minimizing versions of */ | |
595 | OP_MINSTAR, /* 26 these six opcodes must come in pairs, with */ | |
596 | OP_PLUS, /* 27 the minimizing one second. */ | |
597 | OP_MINPLUS, /* 28 This first set applies to single characters.*/ | |
598 | OP_QUERY, /* 29 */ | |
599 | OP_MINQUERY, /* 30 */ | |
600 | ||
601 | OP_UPTO, /* 31 From 0 to n matches */ | |
602 | OP_MINUPTO, /* 32 */ | |
603 | OP_EXACT, /* 33 Exactly n matches */ | |
604 | ||
605 | OP_POSSTAR, /* 34 Possessified star */ | |
606 | OP_POSPLUS, /* 35 Possessified plus */ | |
607 | OP_POSQUERY, /* 36 Possessified query */ | |
608 | OP_POSUPTO, /* 37 Possessified upto */ | |
609 | ||
610 | OP_NOTSTAR, /* 38 The maximizing and minimizing versions of */ | |
611 | OP_NOTMINSTAR, /* 39 these six opcodes must come in pairs, with */ | |
612 | OP_NOTPLUS, /* 40 the minimizing one second. They must be in */ | |
613 | OP_NOTMINPLUS, /* 41 exactly the same order as those above. */ | |
614 | OP_NOTQUERY, /* 42 This set applies to "not" single characters. */ | |
615 | OP_NOTMINQUERY, /* 43 */ | |
616 | ||
617 | OP_NOTUPTO, /* 44 From 0 to n matches */ | |
618 | OP_NOTMINUPTO, /* 45 */ | |
619 | OP_NOTEXACT, /* 46 Exactly n matches */ | |
620 | ||
621 | OP_NOTPOSSTAR, /* 47 Possessified versions */ | |
622 | OP_NOTPOSPLUS, /* 48 */ | |
623 | OP_NOTPOSQUERY, /* 49 */ | |
624 | OP_NOTPOSUPTO, /* 50 */ | |
625 | ||
626 | OP_TYPESTAR, /* 51 The maximizing and minimizing versions of */ | |
627 | OP_TYPEMINSTAR, /* 52 these six opcodes must come in pairs, with */ | |
628 | OP_TYPEPLUS, /* 53 the minimizing one second. These codes must */ | |
629 | OP_TYPEMINPLUS, /* 54 be in exactly the same order as those above. */ | |
630 | OP_TYPEQUERY, /* 55 This set applies to character types such as \d */ | |
631 | OP_TYPEMINQUERY, /* 56 */ | |
632 | ||
633 | OP_TYPEUPTO, /* 57 From 0 to n matches */ | |
634 | OP_TYPEMINUPTO, /* 58 */ | |
635 | OP_TYPEEXACT, /* 59 Exactly n matches */ | |
636 | ||
637 | OP_TYPEPOSSTAR, /* 60 Possessified versions */ | |
638 | OP_TYPEPOSPLUS, /* 61 */ | |
639 | OP_TYPEPOSQUERY, /* 62 */ | |
640 | OP_TYPEPOSUPTO, /* 63 */ | |
641 | ||
642 | OP_CRSTAR, /* 64 The maximizing and minimizing versions of */ | |
643 | OP_CRMINSTAR, /* 65 all these opcodes must come in pairs, with */ | |
644 | OP_CRPLUS, /* 66 the minimizing one second. These codes must */ | |
645 | OP_CRMINPLUS, /* 67 be in exactly the same order as those above. */ | |
646 | OP_CRQUERY, /* 68 These are for character classes and back refs */ | |
647 | OP_CRMINQUERY, /* 69 */ | |
648 | OP_CRRANGE, /* 70 These are different to the three sets above. */ | |
649 | OP_CRMINRANGE, /* 71 */ | |
650 | ||
651 | OP_CLASS, /* 72 Match a character class, chars < 256 only */ | |
652 | OP_NCLASS, /* 73 Same, but the bitmap was created from a negative | |
8ac170f3 PH |
653 | class - the difference is relevant only when a UTF-8 |
654 | character > 255 is encountered. */ | |
655 | ||
6bf342e1 | 656 | OP_XCLASS, /* 74 Extended class for handling UTF-8 chars within the |
8ac170f3 PH |
657 | class. This does both positive and negative. */ |
658 | ||
6bf342e1 PH |
659 | OP_REF, /* 75 Match a back reference */ |
660 | OP_RECURSE, /* 76 Match a numbered subpattern (possibly recursive) */ | |
661 | OP_CALLOUT, /* 77 Call out to external function if provided */ | |
8ac170f3 | 662 | |
6bf342e1 PH |
663 | OP_ALT, /* 78 Start of alternation */ |
664 | OP_KET, /* 79 End of group that doesn't have an unbounded repeat */ | |
665 | OP_KETRMAX, /* 80 These two must remain together and in this */ | |
666 | OP_KETRMIN, /* 81 order. They are for groups that repeat for ever. */ | |
8ac170f3 | 667 | |
6bf342e1 | 668 | /* The assertions must come before BRA, CBRA, ONCE, and COND.*/ |
8ac170f3 | 669 | |
6bf342e1 PH |
670 | OP_ASSERT, /* 82 Positive lookahead */ |
671 | OP_ASSERT_NOT, /* 83 Negative lookahead */ | |
672 | OP_ASSERTBACK, /* 84 Positive lookbehind */ | |
673 | OP_ASSERTBACK_NOT, /* 85 Negative lookbehind */ | |
674 | OP_REVERSE, /* 86 Move pointer back - used in lookbehind assertions */ | |
8ac170f3 | 675 | |
6bf342e1 PH |
676 | /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, |
677 | as there's a test for >= ONCE for a subpattern that isn't an assertion. */ | |
8ac170f3 | 678 | |
6bf342e1 PH |
679 | OP_ONCE, /* 87 Atomic group */ |
680 | OP_BRA, /* 88 Start of non-capturing bracket */ | |
681 | OP_CBRA, /* 89 Start of capturing bracket */ | |
682 | OP_COND, /* 90 Conditional group */ | |
8ac170f3 | 683 | |
6bf342e1 PH |
684 | /* These three must follow the previous three, in the same order. There's a |
685 | check for >= SBRA to distinguish the two sets. */ | |
8ac170f3 | 686 | |
6bf342e1 PH |
687 | OP_SBRA, /* 91 Start of non-capturing bracket, check empty */ |
688 | OP_SCBRA, /* 92 Start of capturing bracket, check empty */ | |
689 | OP_SCOND, /* 93 Conditional group, check empty */ | |
8ac170f3 | 690 | |
6bf342e1 PH |
691 | OP_CREF, /* 94 Used to hold a capture number as condition */ |
692 | OP_RREF, /* 95 Used to hold a recursion number as condition */ | |
693 | OP_DEF, /* 96 The DEFINE condition */ | |
8ac170f3 | 694 | |
6bf342e1 PH |
695 | OP_BRAZERO, /* 97 These two must remain together and in this */ |
696 | OP_BRAMINZERO /* 98 order. */ | |
697 | }; | |
8ac170f3 PH |
698 | |
699 | ||
700 | /* This macro defines textual names for all the opcodes. These are used only |
701 | for debugging. The macro is referenced only in pcre_printint.c. The list is positional: the last twelve names line up with OP_ONCE..OP_BRAMINZERO at the end of the opcode enum, and the earlier names presumably follow the enum order in the same way, so the two must be kept in step. Several opcodes share one display text (for example "*" and "{"). */ |
702 | |
703 | #define OP_NAME_LIST \ |
704 | "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \ |
705 | "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ |
6bf342e1 | 706 | "notprop", "prop", "anynl", "extuni", \ |
8ac170f3 PH |
707 | "\\Z", "\\z", \ |
708 | "Opt", "^", "$", "char", "charnc", "not", \ |
709 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* positive single-char repeats, cf. OP_LENGTHS */ \ |
6bf342e1 | 710 | "*+","++", "?+", "{", /* their possessive forms */ \ |
8ac170f3 | 711 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* negative single-char repeats */ \ |
6bf342e1 | 712 | "*+","++", "?+", "{", /* their possessive forms */ \ |
8ac170f3 | 713 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* type repeats */ \ |
6bf342e1 | 714 | "*+","++", "?+", "{", /* their possessive forms */ \ |
8ac170f3 PH |
715 | "*", "*?", "+", "+?", "?", "??", "{", "{", /* character class and ref repeats */ \ |
716 | "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \ |
717 | "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ |
6bf342e1 PH |
718 | "AssertB", "AssertB not", "Reverse", \ |
719 | "Once", "Bra 0", "Bra", "Cond", "SBra 0", "SBra", "SCond", /* OP_ONCE, OP_BRA, OP_CBRA, OP_COND, OP_SBRA, OP_SCBRA, OP_SCOND */ \ |
720 | "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero" /* OP_CREF, OP_RREF, OP_DEF, OP_BRAZERO, OP_BRAMINZERO */ |
8ac170f3 PH |
721 | |
722 | ||
723 | /* This macro defines the length of fixed length operations in the compiled | |
724 | regex. The lengths are used when searching for specific things, and also in the | |
725 | debugging printing of a compiled regex. We use a macro so that it can be | |
726 | defined close to the definitions of the opcodes themselves. | |
727 | ||
728 | As things have been extended, some of these are no longer fixed lengths, but are |
729 | minima instead. For example, the length of a single-character repeat may vary | |
730 | in UTF-8 mode. The code that uses this table must know about such things. */ | |
731 | ||
732 | #define OP_LENGTHS /* one entry per opcode, in the same order as OP_NAME_LIST */ \ |
733 | 1, /* End */ \ |
734 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \ |
735 | 1, 1, /* Any, Anybyte */ \ |
6bf342e1 | 736 | 3, 3, 1, 1, /* NOTPROP, PROP, ANYNL, EXTUNI */ \ |
8ac170f3 PH |
737 | 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ |
738 | 2, /* Char - the minimum length */ \ |
739 | 2, /* Charnc - the minimum length */ \ |
740 | 2, /* not */ \ |
741 | /* Positive single-char repeats ** These are */ \ |
742 | 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ |
743 | 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \ |
6bf342e1 | 744 | 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \ |
8ac170f3 PH |
745 | /* Negative single-char repeats - only for chars < 256 */ \ |
746 | 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ |
747 | 4, 4, 4, /* NOT upto, minupto, exact */ \ |
6bf342e1 | 748 | 2, 2, 2, 4, /* Possessive NOT *+, ++, ?+, upto+ */ \ |
8ac170f3 PH |
749 | /* Positive type repeats */ \ |
750 | 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ |
751 | 4, 4, 4, /* Type upto, minupto, exact */ \ |
6bf342e1 | 752 | 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \ |
8ac170f3 PH |
753 | /* Character class & ref repeats */ \ |
754 | 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ |
755 | 5, 5, /* CRRANGE, CRMINRANGE */ \ |
756 | 33, /* CLASS */ \ |
757 | 33, /* NCLASS */ \ |
758 | 0, /* XCLASS - variable length */ \ |
759 | 3, /* REF */ \ |
760 | 1+LINK_SIZE, /* RECURSE */ \ |
761 | 2+2*LINK_SIZE, /* CALLOUT */ \ |
762 | 1+LINK_SIZE, /* Alt */ \ |
763 | 1+LINK_SIZE, /* Ket */ \ |
764 | 1+LINK_SIZE, /* KetRmax */ \ |
765 | 1+LINK_SIZE, /* KetRmin */ \ |
766 | 1+LINK_SIZE, /* Assert */ \ |
767 | 1+LINK_SIZE, /* Assert not */ \ |
768 | 1+LINK_SIZE, /* Assert behind */ \ |
769 | 1+LINK_SIZE, /* Assert behind not */ \ |
770 | 1+LINK_SIZE, /* Reverse */ \ |
6bf342e1 PH |
771 | 1+LINK_SIZE, /* ONCE */ \ |
772 | 1+LINK_SIZE, /* BRA */ \ |
773 | 3+LINK_SIZE, /* CBRA - two more than BRA; presumably the capture number, confirm */ \ |
8ac170f3 | 774 | 1+LINK_SIZE, /* COND */ \ |
6bf342e1 PH |
775 | 1+LINK_SIZE, /* SBRA */ \ |
776 | 3+LINK_SIZE, /* SCBRA - same length as CBRA */ \ |
777 | 1+LINK_SIZE, /* SCOND */ \ |
8ac170f3 | 778 | 3, /* CREF */ \ |
6bf342e1 PH |
779 | 3, /* RREF */ \ |
780 | 1, /* DEF */ \ |
8ac170f3 | 781 | 1, 1, /* BRAZERO, BRAMINZERO */ \ |
8ac170f3 PH |
782 | |
783 | ||
6bf342e1 | 784 | /* A magic value for OP_RREF to indicate the "any recursion" condition. */ |
8ac170f3 | 785 | |
6bf342e1 | 786 | #define RREF_ANY 0xffff /* all 16 bits set; presumably fills the 2-byte operand implied by OP_RREF's length of 3 - confirm */ |
8ac170f3 PH |
787 | |
788 | /* Error code numbers. They are given names so that they can more easily be |
789 | tracked. No explicit values are assigned, so the enumerators are consecutive from zero: ERRn has the value n, for n = 0..57. */ |
790 | |
791 | enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, |
792 | ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, |
793 | ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, |
794 | ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, |
aa41d2de | 795 | ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, |
6bf342e1 | 796 | ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57 }; |
8ac170f3 PH |
797 | |
798 | /* The real format of the start of the pcre block; the index of names and the | |
799 | code vector run on as long as necessary after the end. We store an explicit | |
800 | offset to the name table so that if a regex is compiled on one host, saved, and | |
801 | then run on another where the size of pointers is different, all might still | |
802 | be well. For the case of compiled-on-4 and run-on-8, we include an extra | |
803 | pointer that is always NULL. For future-proofing, a few dummy fields were | |
804 | originally included - even though you can never get this planning right - but | |
805 | there is only one left now. | |
806 | ||
807 | NOTE NOTE NOTE: | |
808 | Because people can now save and re-use compiled patterns, any additions to this | |
809 | structure should be made at the end, and something earlier (e.g. a new | |
810 | flag in the options or one of the dummy fields) should indicate that the new | |
811 | fields are present. Currently PCRE always sets the dummy fields to zero. | |
812 | NOTE NOTE NOTE: | |
813 | */ | |
814 | ||
815 | typedef struct real_pcre { /* Start of a compiled pattern block; saved patterns rely on this layout, so new fields go at the end only */ |
816 | pcre_uint32 magic_number; /* NOTE(review): presumably marks a valid compiled block - confirm */ |
817 | pcre_uint32 size; /* Total that was malloced */ |
818 | pcre_uint32 options; /* Option flag bits */ |
819 | pcre_uint32 dummy1; /* For future use, maybe */ |
820 | |
821 | pcre_uint16 top_bracket; /* presumably the highest capturing bracket number - confirm */ |
822 | pcre_uint16 top_backref; /* presumably the maximum back reference, cf. compile_data.top_backref - confirm */ |
823 | pcre_uint16 first_byte; /* NOTE(review): undocumented here - confirm meaning with the compile and match code */ |
824 | pcre_uint16 req_byte; /* NOTE(review): undocumented here - confirm meaning with the compile and match code */ |
825 | pcre_uint16 name_table_offset; /* Offset to name table that follows */ |
826 | pcre_uint16 name_entry_size; /* Size of any name items */ |
827 | pcre_uint16 name_count; /* Number of name items */ |
828 | pcre_uint16 ref_count; /* Reference count */ |
829 | |
830 | const unsigned char *tables; /* Pointer to tables or NULL for std */ |
831 | const unsigned char *nullpad; /* NULL padding: the extra always-NULL pointer for the compiled-on-4, run-on-8 case */ |
832 | } real_pcre; |
833 | ||
834 | /* The format of the block used to store data from pcre_study(). The same | |
835 | remark (see NOTE above) about extending this structure applies. */ | |
836 | ||
837 | typedef struct pcre_study_data { /* Data from pcre_study(); extend only at the end, as for real_pcre */ |
838 | pcre_uint32 size; /* Total that was malloced */ |
839 | pcre_uint32 options; /* NOTE(review): presumably flag bits describing the study data - confirm */ |
840 | uschar start_bits[32]; /* 32 entries; presumably a 256-bit map of possible starting bytes - confirm */ |
841 | } pcre_study_data; |
842 | ||
843 | /* Structure for passing "static" information around between the functions | |
844 | doing the compiling, so that they are thread-safe. */ | |
845 | ||
846 | typedef struct compile_data { /* Passed between the compiling functions in place of static data */ |
847 | const uschar *lcc; /* Points to lower casing table */ |
848 | const uschar *fcc; /* Points to case-flipping table */ |
849 | const uschar *cbits; /* Points to the class bitmap tables (see the cbit_ offsets) */ |
850 | const uschar *ctypes; /* Points to the table of type bits (see the ctype_ bits) */ |
6bf342e1 | 851 | const uschar *start_workspace;/* The start of working space */ |
8ac170f3 PH |
852 | const uschar *start_code; /* The start of the compiled code */ |
853 | const uschar *start_pattern; /* The start of the pattern */ |
6bf342e1 PH |
854 | const uschar *end_pattern; /* The end of the pattern */ |
855 | uschar *hwm; /* High watermark of workspace */ |
8ac170f3 PH |
856 | uschar *name_table; /* The name/number table */ |
857 | int names_found; /* Number of name_table entries so far */ |
858 | int name_entry_size; /* Size of each name_table entry */ |
6bf342e1 | 859 | int bracount; /* Count of capturing parens */ |
8ac170f3 PH |
860 | int top_backref; /* Maximum back reference */ |
861 | unsigned int backref_map; /* Bitmap of low back refs */ |
6bf342e1 | 862 | int external_options; /* External (initial) options */ |
8ac170f3 PH |
863 | int req_varyopt; /* "After variable item" flag for reqbyte */ |
864 | BOOL nopartial; /* Set TRUE if partial won't work */ |
6bf342e1 PH |
865 | int nltype; /* Newline type */ |
866 | int nllen; /* Newline string length; presumably the number of bytes of nl[] in use - confirm */ |
867 | uschar nl[4]; /* Newline string when fixed length */ |
8ac170f3 PH |
868 | } compile_data; |
869 | ||
870 | /* Structure for maintaining a chain of pointers to the currently incomplete | |
871 | branches, for testing for left recursion. */ | |
872 | ||
873 | typedef struct branch_chain { /* One link in the chain of currently incomplete branches */ |
874 | struct branch_chain *outer; /* Next link in the chain; presumably the enclosing branch - confirm */ |
875 | uschar *current; /* presumably the start of this incomplete branch in the compiled code - confirm */ |
876 | } branch_chain; |
877 | ||
878 | /* Structure for items in a linked list that represents an explicit recursive | |
879 | call within the pattern. */ | |
880 | ||
881 | typedef struct recursion_info { /* One record per explicit recursive call; records are linked through prevrec */ |
882 | struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ |
883 | int group_num; /* Number of group that was called */ |
884 | const uschar *after_call; /* "Return value": points after the call in the expr */ |
aa41d2de | 885 | USPTR save_start; /* Old value of md->start_match (md presumably being the match_data block - confirm) */ |
8ac170f3 PH |
886 | int *offset_save; /* Pointer to start of saved offsets */ |
887 | int saved_max; /* Number of saved offsets */ |
888 | } recursion_info; |
889 | ||
890 | /* When compiling in a mode that doesn't use recursive calls to match(), | |
891 | a structure is used to remember local variables on the heap. It is defined in | |
aa41d2de PH |
892 | pcre_exec.c, close to the match() function, so that it is easy to keep it in |
893 | step with any changes of local variable. However, the pointer to the current | |
894 | frame must be saved in some "static" place over a longjmp(). We declare the | |
895 | structure here so that we can put a pointer in the match_data structure. NOTE: | |
896 | This isn't used for a "normal" compilation of pcre. */ | |
8ac170f3 PH |
897 | |
898 | struct heapframe; /* Declaration only: the members are defined in pcre_exec.c; match_data just holds a pointer to one */ |
899 | ||
6bf342e1 PH |
900 | /* Structure for building a chain of data for holding the values of the subject |
901 | pointer at the start of each subpattern, so as to detect when an empty string | |
902 | has been matched by a subpattern - to break infinite loops. */ | |
903 | ||
904 | typedef struct eptrblock { /* One link in the chain of saved subject pointers */ |
905 | struct eptrblock *epb_prev; /* Previous block in the chain */ |
906 | USPTR epb_saved_eptr; /* Subject pointer saved at the start of a subpattern */ |
907 | } eptrblock; |
908 | ||
909 | ||
8ac170f3 PH |
910 | /* Structure for passing "static" information around between the functions |
911 | doing traditional NFA matching, so that they are thread-safe. */ | |
912 | ||
913 | typedef struct match_data { /* Passed between the traditional NFA matching functions in place of static data */ |
aa41d2de PH |
914 | unsigned long int match_call_count; /* Count of match() calls - presumably checked against match_limit; confirm */ |
915 | unsigned long int match_limit; /* Limit on match() calls - TODO confirm */ |
916 | unsigned long int match_limit_recursion; /* Limit on match() recursion depth - TODO confirm */ |
8ac170f3 PH |
917 | int *offset_vector; /* Offset vector */ |
918 | int offset_end; /* One past the end */ |
919 | int offset_max; /* The maximum usable for return data */ |
6bf342e1 PH |
920 | int nltype; /* Newline type */ |
921 | int nllen; /* Newline string length */ |
922 | uschar nl[4]; /* Newline string when fixed */ |
8ac170f3 PH |
923 | const uschar *lcc; /* Points to lower casing table */ |
924 | const uschar *ctypes; /* Points to the table of type bits (see the ctype_ bits) */ |
925 | BOOL offset_overflow; /* Set if too many extractions */ |
926 | BOOL notbol; /* NOTBOL flag */ |
927 | BOOL noteol; /* NOTEOL flag */ |
928 | BOOL utf8; /* UTF8 flag */ |
929 | BOOL endonly; /* Dollar not before final \n */ |
930 | BOOL notempty; /* Empty string match not wanted */ |
931 | BOOL partial; /* PARTIAL flag */ |
932 | BOOL hitend; /* Hit the end of the subject at some point */ |
933 | const uschar *start_code; /* For use when recursing */ |
aa41d2de PH |
934 | USPTR start_subject; /* Start of the subject string */ |
935 | USPTR end_subject; /* End of the subject string */ |
936 | USPTR start_match; /* Start of this match attempt */ |
937 | USPTR end_match_ptr; /* Subject position at end match */ |
8ac170f3 PH |
938 | int end_offset_top; /* Highwater mark at end of match */ |
939 | int capture_last; /* Most recent capture number */ |
940 | int start_offset; /* The start offset value */ |
6bf342e1 PH |
941 | eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ |
942 | int eptrn; /* Next free eptrblock; presumably an index into eptrchain - confirm */ |
8ac170f3 PH |
943 | recursion_info *recursive; /* Linked list of recursion data */ |
944 | void *callout_data; /* To pass back to callouts */ |
945 | struct heapframe *thisframe; /* Pointer to the current heap frame; used only when compiling for no recursion */ |
946 | } match_data; |
947 | ||
948 | /* A similar structure is used for the same purpose by the DFA matching | |
949 | functions. */ | |
950 | ||
951 | typedef struct dfa_match_data { /* DFA counterpart of match_data */ |
952 | const uschar *start_code; /* Start of the compiled pattern */ |
953 | const uschar *start_subject; /* Start of the subject string */ |
954 | const uschar *end_subject; /* End of subject string */ |
955 | const uschar *tables; /* Character tables */ |
956 | int moptions; /* Match options */ |
957 | int poptions; /* Pattern options */ |
6bf342e1 PH |
958 | int nltype; /* Newline type */ |
959 | int nllen; /* Newline string length */ |
960 | uschar nl[4]; /* Newline string when fixed */ |
8ac170f3 PH |
961 | void *callout_data; /* To pass back to callouts */ |
962 | } dfa_match_data; |
963 | ||
964 | /* Bit definitions for entries in the pcre_ctypes table. Each value is a distinct single bit, so one entry can carry several of them; 0x20 and 0x40 are not assigned here. */ |
965 | |
966 | #define ctype_space 0x01 |
967 | #define ctype_letter 0x02 |
968 | #define ctype_digit 0x04 |
969 | #define ctype_xdigit 0x08 |
970 | #define ctype_word 0x10 /* alphameric or '_' */ |
971 | #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ |
972 | ||
973 | /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set |
974 | of bits for a class map. Some classes are built by combining these tables. The offsets advance in steps of 32 (presumably 32 bytes, i.e. 256 bits, per class map - confirm), and cbit_length is ten such steps, one for each class listed here. */ |
975 | |
976 | #define cbit_space 0 /* [:space:] or \s */ |
977 | #define cbit_xdigit 32 /* [:xdigit:] */ |
978 | #define cbit_digit 64 /* [:digit:] or \d */ |
979 | #define cbit_upper 96 /* [:upper:] */ |
980 | #define cbit_lower 128 /* [:lower:] */ |
981 | #define cbit_word 160 /* [:word:] or \w */ |
982 | #define cbit_graph 192 /* [:graph:] */ |
983 | #define cbit_print 224 /* [:print:] */ |
984 | #define cbit_punct 256 /* [:punct:] */ |
985 | #define cbit_cntrl 288 /* [:cntrl:] */ |
986 | #define cbit_length 320 /* Length of the cbits table */ |
987 | ||
988 | /* Offsets of the various tables from the base tables pointer, and |
989 | total length. The order is lcc, fcc, cbits, ctypes; lcc and fcc are 256 apart and the ctypes table is given 256. */ |
990 | |
991 | #define lcc_offset 0 |
992 | #define fcc_offset 256 |
993 | #define cbits_offset 512 |
994 | #define ctypes_offset (cbits_offset + cbit_length) /* 512 + 320 = 832 */ |
995 | #define tables_length (ctypes_offset + 256) /* 832 + 256 = 1088 */ |
996 | ||
aa41d2de PH |
997 | /* Layout of the UCP type table that translates property names into types and |
998 | codes. */ | |
8ac170f3 PH |
999 | |
1000 | typedef struct { /* One entry of the table mapping a property name to a type and a code */ |
1001 | const char *name; /* Property name */ |
aa41d2de PH |
1002 | pcre_uint16 type; /* Type that the name translates to */ |
1003 | pcre_uint16 value; /* presumably the "code" within that type - confirm */ |
8ac170f3 PH |
1004 | } ucp_type_table; |
1005 | ||
1006 | ||
1007 | /* Internal shared data tables. These are tables that are used by more than one | |
1008 | of the exported public functions. They have to be "external" in the C sense, | |
1009 | but are not part of the PCRE public API. The data for these tables is in the | |
1010 | pcre_tables.c module. */ | |
1011 | ||
1012 | extern const int _pcre_utf8_table1[]; |
1013 | extern const int _pcre_utf8_table2[]; |
1014 | extern const int _pcre_utf8_table3[]; |
1015 | extern const uschar _pcre_utf8_table4[]; |
1016 | |
1017 | extern const int _pcre_utf8_table1_size; /* presumably the number of entries in _pcre_utf8_table1 - confirm */ |
1018 | |
1019 | extern const ucp_type_table _pcre_utt[]; /* Table of ucp_type_table entries (property name, type, value) */ |
1020 | extern const int _pcre_utt_size; /* presumably the number of entries in _pcre_utt - confirm */ |
1021 | |
1022 | extern const uschar _pcre_default_tables[]; /* presumably laid out per the *_offset macros, tables_length long - confirm */ |
1023 | |
1024 | extern const uschar _pcre_OP_lengths[]; /* presumably initialised from OP_LENGTHS - confirm */ |
1025 | ||
1026 | ||
1027 | /* Internal shared functions. These are functions that are used by more than | |
1028 | one of the exported public functions. They have to be "external" in the C | |
1029 | sense, but are not part of the PCRE public API. */ | |
1030 | ||
6bf342e1 PH |
1031 | extern BOOL _pcre_is_newline(const uschar *, const uschar *, int *, |
1032 | BOOL); /* NOTE(review): parameter meanings are not visible here; same parameter list as _pcre_was_newline */ |
1033 | extern int _pcre_ord2utf8(int, uschar *); /* presumably encodes a character value as UTF-8 into the buffer - confirm */ |
1034 | extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, |
1035 | const pcre_study_data *, pcre_study_data *); /* takes const and non-const pointers of each block type; presumably input and output copies - confirm */ |
1036 | extern int _pcre_ucp_findprop(const unsigned int, int *, int *); |
1037 | extern unsigned int _pcre_ucp_othercase(const unsigned int); |
1038 | extern int _pcre_valid_utf8(const uschar *, int); |
1039 | extern BOOL _pcre_was_newline(const uschar *, const uschar *, int *, |
1040 | BOOL); /* same parameter list as _pcre_is_newline */ |
1041 | extern BOOL _pcre_xclass(int, const uschar *); |
8ac170f3 | 1042 | |
aa41d2de PH |
1043 | #endif |
1044 | ||
8ac170f3 | 1045 | /* End of pcre_internal.h */ |