Commit | Line | Data |
---|---|---|
64f2600a | 1 | /* $Cambridge: exim/src/src/pcre/pcre_internal.h,v 1.5 2007/06/26 11:16:54 ph10 Exp $ */ |
8ac170f3 PH |
2 | |
3 | /************************************************* | |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | ||
8 | /* PCRE is a library of functions to support regular expressions whose syntax | |
9 | and semantics are as close as possible to those of the Perl 5 language. | |
10 | ||
11 | Written by Philip Hazel | |
64f2600a | 12 | Copyright (c) 1997-2007 University of Cambridge |
8ac170f3 PH |
13 | |
14 | ----------------------------------------------------------------------------- | |
15 | Redistribution and use in source and binary forms, with or without | |
16 | modification, are permitted provided that the following conditions are met: | |
17 | ||
18 | * Redistributions of source code must retain the above copyright notice, | |
19 | this list of conditions and the following disclaimer. | |
20 | ||
21 | * Redistributions in binary form must reproduce the above copyright | |
22 | notice, this list of conditions and the following disclaimer in the | |
23 | documentation and/or other materials provided with the distribution. | |
24 | ||
25 | * Neither the name of the University of Cambridge nor the names of its | |
26 | contributors may be used to endorse or promote products derived from | |
27 | this software without specific prior written permission. | |
28 | ||
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
39 | POSSIBILITY OF SUCH DAMAGE. | |
40 | ----------------------------------------------------------------------------- | |
41 | */ | |
42 | ||
43 | /* This header contains definitions that are shared between the different | |
44 | modules, but which are not relevant to the exported API. This includes some | |
45 | functions whose names all begin with "_pcre_". */ | |
46 | ||
aa41d2de PH |
47 | #ifndef PCRE_INTERNAL_H |
48 | #define PCRE_INTERNAL_H | |
8ac170f3 PH |
49 | |
50 | /* Define DEBUG to get debugging output on stdout. */ | |
51 | ||
aa41d2de | 52 | #if 0 |
8ac170f3 | 53 | #define DEBUG |
aa41d2de | 54 | #endif |
8ac170f3 PH |
55 | |
56 | /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef | |
57 | inline, and there are *still* stupid compilers about that don't like indented | |
58 | pre-processor statements, or at least there were when I first wrote this. After | |
6bf342e1 | 59 | all, it had only been about 10 years then... |
8ac170f3 | 60 | |
6bf342e1 PH |
61 | It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so |
62 | be absolutely sure we get our version. */ | |
63 | ||
64 | #undef DPRINTF | |
8ac170f3 PH |
65 | #ifdef DEBUG |
66 | #define DPRINTF(p) printf p | |
67 | #else | |
6bf342e1 | 68 | #define DPRINTF(p) /* Nothing */ |
8ac170f3 PH |
69 | #endif |
70 | ||
71 | ||
72 | /* Get the definitions provided by running "configure" */ | |
73 | ||
74 | #include "config.h" | |
75 | ||
76 | /* Standard C headers plus the external interface definition. The only time | |
77 | setjmp and stdarg are used is when NO_RECURSE is set. */ | |
78 | ||
79 | #include <ctype.h> | |
80 | #include <limits.h> | |
81 | #include <setjmp.h> | |
82 | #include <stdarg.h> | |
83 | #include <stddef.h> | |
84 | #include <stdio.h> | |
85 | #include <stdlib.h> | |
86 | #include <string.h> | |
87 | ||
64f2600a PH |
88 | /* When compiling a DLL for Windows, the exported symbols have to be declared |
89 | using some MS magic. I found some useful information on this web page: | |
90 | http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the | |
91 | information there, using __declspec(dllexport) without "extern" we have a | |
92 | definition; with "extern" we have a declaration. The settings here override the | |
93 | setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL, | |
94 | which is all that is needed for applications (they just import the symbols). We | |
95 | use: | |
96 | ||
97 | PCRE_EXP_DECL for declarations | |
98 | PCRE_EXP_DEFN for definitions of exported functions | |
99 | PCRE_EXP_DATA_DEFN for definitions of exported variables | |
100 | ||
101 | The reason for the two DEFN macros is that in non-Windows environments, one | |
102 | does not want to have "extern" before variable definitions because it leads to | |
103 | compiler warnings. So we distinguish between functions and variables. In | |
104 | Windows, the two should always be the same. | |
105 | ||
106 | The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest, | |
107 | which is an application, but needs to import this file in order to "peek" at | |
108 | internals, can #include pcre.h first to get an application's-eye view. | |
109 | ||
110 | In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon, | |
111 | special-purpose environments) might want to stick other stuff in front of | |
112 | exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and | |
113 | PCRE_EXP_DATA_DEFN only if they are not already set. */ | |
114 | ||
115 | #ifndef PCRE_EXP_DECL | |
116 | # ifdef _WIN32 | |
117 | # ifdef DLL_EXPORT | |
118 | # define PCRE_EXP_DECL extern __declspec(dllexport) | |
119 | # define PCRE_EXP_DEFN __declspec(dllexport) | |
120 | # define PCRE_EXP_DATA_DEFN __declspec(dllexport) | |
121 | # else | |
122 | # define PCRE_EXP_DECL extern | |
123 | # define PCRE_EXP_DEFN | |
124 | # define PCRE_EXP_DATA_DEFN | |
125 | # endif | |
126 | # | |
127 | # else | |
128 | # ifdef __cplusplus | |
129 | # define PCRE_EXP_DECL extern "C" | |
130 | # else | |
131 | # define PCRE_EXP_DECL extern | |
132 | # endif | |
133 | # ifndef PCRE_EXP_DEFN | |
134 | # define PCRE_EXP_DEFN PCRE_EXP_DECL | |
135 | # endif | |
136 | # ifndef PCRE_EXP_DATA_DEFN | |
137 | # define PCRE_EXP_DATA_DEFN | |
138 | # endif | |
139 | # endif | |
8ac170f3 PH |
140 | #endif |
141 | ||
142 | /* We need to have types that specify unsigned 16-bit and 32-bit integers. We | |
143 | cannot determine these outside the compilation (e.g. by running a program as | |
144 | part of "configure") because PCRE is often cross-compiled for use on other | |
145 | systems. Instead we make use of the maximum sizes that are available at | |
146 | preprocessor time in standard C environments. */ | |
147 | ||
148 | #if USHRT_MAX == 65535 | |
149 | typedef unsigned short pcre_uint16; | |
150 | #elif UINT_MAX == 65535 | |
151 | typedef unsigned int pcre_uint16; | |
152 | #else | |
153 | #error Cannot determine a type for 16-bit unsigned integers | |
154 | #endif | |
155 | ||
156 | #if UINT_MAX == 4294967295 | |
157 | typedef unsigned int pcre_uint32; | |
158 | #elif ULONG_MAX == 4294967295 | |
159 | typedef unsigned long int pcre_uint32; | |
160 | #else | |
161 | #error Cannot determine a type for 32-bit unsigned integers | |
162 | #endif | |
163 | ||
164 | /* All character handling must be done as unsigned characters. Otherwise there | |
165 | are problems with top-bit-set characters and functions such as isspace(). | |
166 | However, we leave the interface to the outside world as char *, because that | |
167 | should make things easier for callers. We define a short type for unsigned char | |
168 | to save lots of typing. I tried "uchar", but it causes problems on Digital | |
169 | Unix, where it is defined in sys/types, so use "uschar" instead. */ | |
170 | ||
171 | typedef unsigned char uschar; | |
172 | ||
6bf342e1 PH |
173 | /* This is an unsigned int value that no character can ever have. UTF-8 |
174 | characters only go up to 0x7fffffff (though Unicode doesn't go beyond | |
175 | 0x0010ffff). */ | |
176 | ||
177 | #define NOTACHAR 0xffffffff | |
178 | ||
179 | /* PCRE is able to support several different kinds of newline (CR, LF, CRLF, | |
64f2600a PH |
180 | "any" and "anycrlf" at present). The following macros are used to package up |
181 | testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various | |
182 | modules to indicate in which datablock the parameters exist, and what the | |
183 | start/end of string field names are. */ | |
6bf342e1 | 184 | |
64f2600a PH |
185 | #define NLTYPE_FIXED 0 /* Newline is a fixed length string */ |
186 | #define NLTYPE_ANY 1 /* Newline is any Unicode line ending */ | |
187 | #define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */ | |
6bf342e1 PH |
188 | |
189 | /* This macro checks for a newline at the given position */ | |
aa41d2de PH |
190 | |
191 | #define IS_NEWLINE(p) \ | |
6bf342e1 PH |
192 | ((NLBLOCK->nltype != NLTYPE_FIXED)? \ |
193 | ((p) < NLBLOCK->PSEND && \ | |
64f2600a PH |
194 | _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, &(NLBLOCK->nllen),\ |
195 | utf8)) \ | |
6bf342e1 PH |
196 | : \ |
197 | ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ | |
198 | (p)[0] == NLBLOCK->nl[0] && \ | |
199 | (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \ | |
200 | ) \ | |
201 | ) | |
202 | ||
203 | /* This macro checks for a newline immediately preceding the given position */ | |
204 | ||
205 | #define WAS_NEWLINE(p) \ | |
206 | ((NLBLOCK->nltype != NLTYPE_FIXED)? \ | |
207 | ((p) > NLBLOCK->PSSTART && \ | |
64f2600a PH |
208 | _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \ |
209 | &(NLBLOCK->nllen), utf8)) \ | |
6bf342e1 PH |
210 | : \ |
211 | ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ | |
212 | (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ | |
213 | (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \ | |
214 | ) \ | |
215 | ) | |
aa41d2de PH |
216 | |
217 | /* When PCRE is compiled as a C++ library, the subject pointer can be replaced | |
218 | with a custom type. This makes it possible, for example, to allow pcre_exec() | |
219 | to process subject strings that are discontinuous by using a smart pointer | |
220 | class. It must always be possible to inspect all of the subject string in | |
221 | pcre_exec() because of the way it backtracks. Two macros are required in the | |
222 | normal case, for sign-unspecified and unsigned char pointers. The former is | |
223 | used for the external interface and appears in pcre.h, which is why its name | |
224 | must begin with PCRE_. */ | |
225 | ||
226 | #ifdef CUSTOM_SUBJECT_PTR | |
227 | #define PCRE_SPTR CUSTOM_SUBJECT_PTR | |
228 | #define USPTR CUSTOM_SUBJECT_PTR | |
229 | #else | |
230 | #define PCRE_SPTR const char * | |
231 | #define USPTR const unsigned char * | |
232 | #endif | |
8ac170f3 | 233 | |
64f2600a PH |
234 | |
235 | ||
aa41d2de PH |
236 | /* Include the public PCRE header and the definitions of UCP character property |
237 | values. */ | |
8ac170f3 | 238 | |
64f2600a | 239 | #include <pcre.h> |
8ac170f3 PH |
240 | #include "ucp.h" |
241 | ||
242 | /* When compiling for use with the Virtual Pascal compiler, these functions | |
243 | need to have their names changed. PCRE must be compiled with the -DVPCOMPAT | |
244 | option on the command line. */ | |
245 | ||
246 | #ifdef VPCOMPAT | |
64f2600a | 247 | #define strlen(s) _strlen(s) |
8ac170f3 | 248 | #define strncmp(s1,s2,m) _strncmp(s1,s2,m) |
64f2600a | 249 | #define memcmp(s,c,n) _memcmp(s,c,n) |
8ac170f3 PH |
250 | #define memcpy(d,s,n) _memcpy(d,s,n) |
251 | #define memmove(d,s,n) _memmove(d,s,n) | |
252 | #define memset(s,c,n) _memset(s,c,n) | |
253 | #else /* VPCOMPAT */ | |
254 | ||
255 | /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), | |
256 | define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY | |
257 | is set. Otherwise, include an emulating function for those systems that have | |
64f2600a | 258 | neither (there are some non-Unix environments where this is the case). */ |
8ac170f3 | 259 | |
64f2600a | 260 | #ifndef HAVE_MEMMOVE |
8ac170f3 | 261 | #undef memmove /* some systems may have a macro */ |
64f2600a | 262 | #ifdef HAVE_BCOPY |
8ac170f3 PH |
263 | #define memmove(a, b, c) bcopy(b, a, c) |
264 | #else /* HAVE_BCOPY */ | |
/* Fallback used when neither memmove() nor bcopy() is available. Copies n
bytes from s to d and returns d. The two regions may overlap: when the
destination lies above the source the bytes are copied from the end backwards,
otherwise from the start forwards, so that no source byte is overwritten before
it has been read. */

static void *
pcre_memmove(void *d, const void *s, size_t n)
{
unsigned char *dest = (unsigned char *)d;
const unsigned char *src = (const unsigned char *)s;

if (dest > src)
  {
  /* Destination is higher: start one past the end and walk downwards. */
  dest += n;
  src += n;
  while (n-- > 0) *(--dest) = *(--src);
  }
else
  {
  /* Destination is lower (or identical): a plain forward copy is safe. */
  while (n-- > 0) *dest++ = *src++;
  }

return d;
}
284 | #define memmove(a, b, c) pcre_memmove(a, b, c) | |
285 | #endif /* not HAVE_BCOPY */ | |
286 | #endif /* not HAVE_MEMMOVE */ | |
287 | #endif /* not VPCOMPAT */ | |
288 | ||
289 | ||
290 | /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored | |
291 | in big-endian order) by default. These are used, for example, to link from the | |
292 | start of a subpattern to its alternatives and its end. The use of 2 bytes per | |
293 | offset limits the size of the compiled regex to around 64K, which is big enough | |
294 | for almost everybody. However, I received a request for an even bigger limit. | |
295 | For this reason, and also to make the code easier to maintain, the storing and | |
296 | loading of offsets from the byte string is now handled by the macros that are | |
297 | defined here. | |
298 | ||
299 | The macros are controlled by the value of LINK_SIZE. This defaults to 2 in | |
300 | the config.h file, but can be overridden by using -D on the command line. This | |
301 | is automated on Unix systems via the "configure" command. */ | |
302 | ||
303 | #if LINK_SIZE == 2 | |
304 | ||
305 | #define PUT(a,n,d) \ | |
306 | (a[n] = (d) >> 8), \ | |
307 | (a[(n)+1] = (d) & 255) | |
308 | ||
309 | #define GET(a,n) \ | |
310 | (((a)[n] << 8) | (a)[(n)+1]) | |
311 | ||
312 | #define MAX_PATTERN_SIZE (1 << 16) | |
313 | ||
314 | ||
315 | #elif LINK_SIZE == 3 | |
316 | ||
317 | #define PUT(a,n,d) \ | |
318 | (a[n] = (d) >> 16), \ | |
319 | (a[(n)+1] = (d) >> 8), \ | |
320 | (a[(n)+2] = (d) & 255) | |
321 | ||
322 | #define GET(a,n) \ | |
323 | (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) | |
324 | ||
325 | #define MAX_PATTERN_SIZE (1 << 24) | |
326 | ||
327 | ||
328 | #elif LINK_SIZE == 4 | |
329 | ||
330 | #define PUT(a,n,d) \ | |
331 | (a[n] = (d) >> 24), \ | |
332 | (a[(n)+1] = (d) >> 16), \ | |
333 | (a[(n)+2] = (d) >> 8), \ | |
334 | (a[(n)+3] = (d) & 255) | |
335 | ||
336 | #define GET(a,n) \ | |
337 | (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) | |
338 | ||
339 | #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ | |
340 | ||
341 | ||
342 | #else | |
343 | #error LINK_SIZE must be either 2, 3, or 4 | |
344 | #endif | |
345 | ||
346 | ||
347 | /* Convenience macro defined in terms of the others */ | |
348 | ||
349 | #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE | |
350 | ||
351 | ||
352 | /* PCRE uses some other 2-byte quantities that do not change when the size of | |
353 | offsets changes. These are used for repeat counts and for other things such as | |
354 | capturing parenthesis numbers in back references. */ | |
355 | ||
/* Store the 16-bit value d in big-endian order at a[n] and a[n+1]. The
expansion is a single parenthesized comma expression, so the macro behaves as
one statement even as the body of an unbraced "if" or "else". All arguments may
be evaluated more than once and must therefore be free of side effects. */

#define PUT2(a,n,d) \
  (((a)[n] = (d) >> 8), \
  ((a)[(n)+1] = (d) & 255))

/* Fetch the big-endian 16-bit value stored at a[n] and a[n+1]. */

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

/* Store d as for PUT2 and then advance the pointer a past the two bytes; a
must be a modifiable lvalue. */

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
364 | ||
365 | ||
366 | /* When UTF-8 encoding is being used, a character is no longer just a single | |
367 | byte. The macros for character handling generate simple sequences when used in | |
368 | byte-mode, and more complicated ones for UTF-8 characters. */ | |
369 | ||
370 | #ifndef SUPPORT_UTF8 | |
371 | #define GETCHAR(c, eptr) c = *eptr; | |
372 | #define GETCHARTEST(c, eptr) c = *eptr; | |
373 | #define GETCHARINC(c, eptr) c = *eptr++; | |
374 | #define GETCHARINCTEST(c, eptr) c = *eptr++; | |
375 | #define GETCHARLEN(c, eptr, len) c = *eptr; | |
376 | #define BACKCHAR(eptr) | |
377 | ||
378 | #else /* SUPPORT_UTF8 */ | |
379 | ||
380 | /* Get the next UTF-8 character, not advancing the pointer. This is called when | |
381 | we know we are in UTF-8 mode. */ | |
382 | ||
383 | #define GETCHAR(c, eptr) \ | |
384 | c = *eptr; \ | |
6bf342e1 | 385 | if (c >= 0xc0) \ |
8ac170f3 PH |
386 | { \ |
387 | int gcii; \ | |
388 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
389 | int gcss = 6*gcaa; \ | |
390 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
391 | for (gcii = 1; gcii <= gcaa; gcii++) \ | |
392 | { \ | |
393 | gcss -= 6; \ | |
394 | c |= (eptr[gcii] & 0x3f) << gcss; \ | |
395 | } \ | |
396 | } | |
397 | ||
398 | /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the | |
399 | pointer. */ | |
400 | ||
401 | #define GETCHARTEST(c, eptr) \ | |
402 | c = *eptr; \ | |
6bf342e1 | 403 | if (utf8 && c >= 0xc0) \ |
8ac170f3 PH |
404 | { \ |
405 | int gcii; \ | |
406 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
407 | int gcss = 6*gcaa; \ | |
408 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
409 | for (gcii = 1; gcii <= gcaa; gcii++) \ | |
410 | { \ | |
411 | gcss -= 6; \ | |
412 | c |= (eptr[gcii] & 0x3f) << gcss; \ | |
413 | } \ | |
414 | } | |
415 | ||
416 | /* Get the next UTF-8 character, advancing the pointer. This is called when we | |
417 | know we are in UTF-8 mode. */ | |
418 | ||
419 | #define GETCHARINC(c, eptr) \ | |
420 | c = *eptr++; \ | |
6bf342e1 | 421 | if (c >= 0xc0) \ |
8ac170f3 PH |
422 | { \ |
423 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
424 | int gcss = 6*gcaa; \ | |
425 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
426 | while (gcaa-- > 0) \ | |
427 | { \ | |
428 | gcss -= 6; \ | |
429 | c |= (*eptr++ & 0x3f) << gcss; \ | |
430 | } \ | |
431 | } | |
432 | ||
433 | /* Get the next character, testing for UTF-8 mode, and advancing the pointer */ | |
434 | ||
435 | #define GETCHARINCTEST(c, eptr) \ | |
436 | c = *eptr++; \ | |
6bf342e1 | 437 | if (utf8 && c >= 0xc0) \ |
8ac170f3 PH |
438 | { \ |
439 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
440 | int gcss = 6*gcaa; \ | |
441 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
442 | while (gcaa-- > 0) \ | |
443 | { \ | |
444 | gcss -= 6; \ | |
445 | c |= (*eptr++ & 0x3f) << gcss; \ | |
446 | } \ | |
447 | } | |
448 | ||
449 | /* Get the next UTF-8 character, not advancing the pointer, incrementing length | |
450 | if there are extra bytes. This is called when we know we are in UTF-8 mode. */ | |
451 | ||
452 | #define GETCHARLEN(c, eptr, len) \ | |
453 | c = *eptr; \ | |
6bf342e1 | 454 | if (c >= 0xc0) \ |
8ac170f3 PH |
455 | { \ |
456 | int gcii; \ | |
457 | int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ | |
458 | int gcss = 6*gcaa; \ | |
459 | c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ | |
460 | for (gcii = 1; gcii <= gcaa; gcii++) \ | |
461 | { \ | |
462 | gcss -= 6; \ | |
463 | c |= (eptr[gcii] & 0x3f) << gcss; \ | |
464 | } \ | |
465 | len += gcaa; \ | |
466 | } | |
467 | ||
468 | /* If the pointer is not at the start of a character, move it back until | |
469 | it is. Called only in UTF-8 mode. */ | |
470 | ||
471 | #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--; | |
472 | ||
473 | #endif | |
474 | ||
475 | ||
476 | /* In case there is no definition of offsetof() provided - though any proper | |
477 | Standard C system should have one. */ | |
478 | ||
479 | #ifndef offsetof | |
480 | #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field)) | |
481 | #endif | |
482 | ||
483 | ||
484 | /* These are the public options that can change during matching. */ | |
485 | ||
486 | #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) | |
487 | ||
aa41d2de PH |
488 | /* Private options flags start at the most significant end of the four bytes. |
489 | The public options defined in pcre.h start at the least significant end. Make | |
490 | sure they don't overlap! The bits are getting a bit scarce now -- when we run | |
491 | out, there is a dummy word in the structure that could be used for the private | |
492 | bits. */ | |
8ac170f3 | 493 | |
aa41d2de | 494 | #define PCRE_NOPARTIAL 0x80000000 /* can't use partial with this regex */ |
8ac170f3 PH |
495 | #define PCRE_FIRSTSET 0x40000000 /* first_byte is set */ |
496 | #define PCRE_REQCHSET 0x20000000 /* req_byte is set */ | |
497 | #define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */ | |
aa41d2de | 498 | #define PCRE_JCHANGED 0x08000000 /* j option changes within regex */ |
8ac170f3 PH |
499 | |
500 | /* Options for the "extra" block produced by pcre_study(). */ | |
501 | ||
502 | #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ | |
503 | ||
504 | /* Masks for identifying the public options that are permitted at compile | |
505 | time, run time, or study time, respectively. */ | |
506 | ||
64f2600a PH |
507 | #define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \ |
508 | PCRE_NEWLINE_ANYCRLF) | |
6bf342e1 | 509 | |
8ac170f3 PH |
510 | #define PUBLIC_OPTIONS \ |
511 | (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ | |
512 | PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ | |
aa41d2de | 513 | PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ |
6bf342e1 | 514 | PCRE_DUPNAMES|PCRE_NEWLINE_BITS) |
8ac170f3 PH |
515 | |
516 | #define PUBLIC_EXEC_OPTIONS \ | |
517 | (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ | |
6bf342e1 | 518 | PCRE_PARTIAL|PCRE_NEWLINE_BITS) |
8ac170f3 PH |
519 | |
520 | #define PUBLIC_DFA_EXEC_OPTIONS \ | |
521 | (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ | |
6bf342e1 | 522 | PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS) |
8ac170f3 PH |
523 | |
524 | #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ | |
525 | ||
526 | /* Magic number to provide a small check against being handed junk. Also used | |
527 | to detect whether a pattern was compiled on a host of different endianness. */ | |
528 | ||
529 | #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ | |
530 | ||
531 | /* Negative values for the firstchar and reqchar variables */ | |
532 | ||
533 | #define REQ_UNSET (-2) | |
534 | #define REQ_NONE (-1) | |
535 | ||
536 | /* The maximum remaining length of subject we are prepared to search for a | |
537 | req_byte match. */ | |
538 | ||
539 | #define REQ_BYTE_MAX 1000 | |
540 | ||
541 | /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a | |
542 | variable-length repeat, or anything other than literal characters. */ | |
543 | ||
544 | #define REQ_CASELESS 0x0100 /* indicates caselessness */ | |
545 | #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ | |
546 | ||
547 | /* Miscellaneous definitions */ | |
548 | ||
549 | typedef int BOOL; | |
550 | ||
551 | #define FALSE 0 | |
552 | #define TRUE 1 | |
553 | ||
6bf342e1 | 554 | /* Escape items that are just an encoding of a particular data value. */ |
8ac170f3 PH |
555 | |
556 | #ifndef ESC_e | |
557 | #define ESC_e 27 | |
558 | #endif | |
559 | ||
560 | #ifndef ESC_f | |
561 | #define ESC_f '\f' | |
562 | #endif | |
563 | ||
564 | #ifndef ESC_n | |
6bf342e1 | 565 | #define ESC_n '\n' |
8ac170f3 PH |
566 | #endif |
567 | ||
568 | #ifndef ESC_r | |
569 | #define ESC_r '\r' | |
570 | #endif | |
571 | ||
572 | /* We can't officially use ESC_t because it is a POSIX reserved identifier | |
573 | (presumably because of all the others like size_t). */ | |
574 | ||
575 | #ifndef ESC_tee | |
576 | #define ESC_tee '\t' | |
577 | #endif | |
578 | ||
aa41d2de PH |
579 | /* Codes for different types of Unicode property */ |
580 | ||
581 | #define PT_ANY 0 /* Any property - matches all chars */ | |
582 | #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ | |
583 | #define PT_GC 2 /* General characteristic (e.g. L) */ | |
584 | #define PT_PC 3 /* Particular characteristic (e.g. Lu) */ | |
585 | #define PT_SC 4 /* Script (e.g. Han) */ | |
586 | ||
587 | /* Flag bits and data types for the extended class (OP_XCLASS) for classes that | |
588 | contain UTF-8 characters with values greater than 255. */ | |
589 | ||
590 | #define XCL_NOT 0x01 /* Flag: this is a negative class */ | |
591 | #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ | |
592 | ||
593 | #define XCL_END 0 /* Marks end of individual items */ | |
594 | #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ | |
595 | #define XCL_RANGE 2 /* A range (two multibyte chars) follows */ | |
596 | #define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ | |
597 | #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ | |
598 | ||
8ac170f3 PH |
599 | /* These are escaped items that aren't just an encoding of a particular data |
600 | value such as \n. They must have non-zero values, as check_escape() returns | |
601 | their negation. Also, they must appear in the same order as in the opcode | |
602 | definitions below, up to ESC_z. There's a dummy for OP_ANY because it | |
603 | corresponds to "." rather than an escape sequence. The final one must be | |
6bf342e1 PH |
604 | ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc). |
605 | There are two tests in the code for an escape greater than ESC_b and less than | |
606 | ESC_Z to detect the types that may be repeated. These are the types that | |
607 | consume characters. If any new escapes are put in between that don't consume a | |
8ac170f3 PH |
608 | character, that code will have to change. */ |
609 | ||
64f2600a PH |
610 | enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, |
611 | ESC_W, ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, ESC_h, | |
612 | ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_k, ESC_REF }; | |
6bf342e1 | 613 | |
8ac170f3 | 614 | |
8ac170f3 PH |
615 | /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets |
616 | that extract substrings. Starting from 1 (i.e. after OP_END), the values up to | |
617 | OP_EOD must correspond in order to the list of escapes immediately above. | |
6bf342e1 PH |
618 | |
619 | To keep stored, compiled patterns compatible, new opcodes should be added | |
620 | immediately before OP_BRA, where (since release 7.0) a gap is left for this | |
621 | purpose. | |
622 | ||
623 | *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions | |
624 | that follow must also be updated to match. There is also a table called | |
625 | "coptable" in pcre_dfa_exec.c that must be updated. */ | |
8ac170f3 PH |
626 | |
627 | enum { | |
628 | OP_END, /* 0 End of pattern */ | |
629 | ||
630 | /* Values corresponding to backslashed metacharacters */ | |
631 | ||
632 | OP_SOD, /* 1 Start of data: \A */ | |
633 | OP_SOM, /* 2 Start of match (subject + offset): \G */ | |
64f2600a PH |
634 | OP_SET_SOM, /* 3 Set start of match (\K) */ |
635 | OP_NOT_WORD_BOUNDARY, /* 4 \B */ | |
636 | OP_WORD_BOUNDARY, /* 5 \b */ | |
637 | OP_NOT_DIGIT, /* 6 \D */ | |
638 | OP_DIGIT, /* 7 \d */ | |
639 | OP_NOT_WHITESPACE, /* 8 \S */ | |
640 | OP_WHITESPACE, /* 9 \s */ | |
641 | OP_NOT_WORDCHAR, /* 10 \W */ | |
642 | OP_WORDCHAR, /* 11 \w */ | |
643 | OP_ANY, /* 12 Match any character */ | |
644 | OP_ANYBYTE, /* 13 Match any byte (\C); different to OP_ANY for UTF-8 */ | |
645 | OP_NOTPROP, /* 14 \P (not Unicode property) */ | |
646 | OP_PROP, /* 15 \p (Unicode property) */ | |
647 | OP_ANYNL, /* 16 \R (any newline sequence) */ | |
648 | OP_NOT_HSPACE, /* 17 \H (not horizontal whitespace) */ | |
649 | OP_HSPACE, /* 18 \h (horizontal whitespace) */ | |
650 | OP_NOT_VSPACE, /* 19 \V (not vertical whitespace) */ | |
651 | OP_VSPACE, /* 20 \v (vertical whitespace) */ | |
652 | OP_EXTUNI, /* 21 \X (extended Unicode sequence) */ | |
653 | OP_EODN, /* 22 End of data or \n at end of data: \Z. */ | |
654 | OP_EOD, /* 23 End of data: \z */ | |
655 | ||
656 | OP_OPT, /* 24 Set runtime options */ | |
657 | OP_CIRC, /* 25 Start of line - varies with multiline switch */ | |
658 | OP_DOLL, /* 26 End of line - varies with multiline switch */ | |
659 | OP_CHAR, /* 27 Match one character, casefully */ | |
660 | OP_CHARNC, /* 28 Match one character, caselessly */ | |
661 | OP_NOT, /* 29 Match one character, not the following one */ | |
662 | ||
663 | OP_STAR, /* 30 The maximizing and minimizing versions of */ | |
664 | OP_MINSTAR, /* 31 these six opcodes must come in pairs, with */ | |
665 | OP_PLUS, /* 32 the minimizing one second. */ | |
666 | OP_MINPLUS, /* 33 This first set applies to single characters.*/ | |
667 | OP_QUERY, /* 34 */ | |
668 | OP_MINQUERY, /* 35 */ | |
669 | ||
670 | OP_UPTO, /* 36 From 0 to n matches */ | |
671 | OP_MINUPTO, /* 37 */ | |
672 | OP_EXACT, /* 38 Exactly n matches */ | |
673 | ||
674 | OP_POSSTAR, /* 39 Possessified star */ | |
675 | OP_POSPLUS, /* 40 Possessified plus */ | |
676 | OP_POSQUERY, /* 41 Possessified query */ | |
677 | OP_POSUPTO, /* 42 Possessified upto */ | |
678 | ||
679 | OP_NOTSTAR, /* 43 The maximizing and minimizing versions of */ | |
680 | OP_NOTMINSTAR, /* 44 these six opcodes must come in pairs, with */ | |
681 | OP_NOTPLUS, /* 45 the minimizing one second. They must be in */ | |
682 | OP_NOTMINPLUS, /* 46 exactly the same order as those above. */ | |
683 | OP_NOTQUERY, /* 47 This set applies to "not" single characters. */ | |
684 | OP_NOTMINQUERY, /* 48 */ | |
685 | ||
686 | OP_NOTUPTO, /* 49 From 0 to n matches */ | |
687 | OP_NOTMINUPTO, /* 50 */ | |
688 | OP_NOTEXACT, /* 51 Exactly n matches */ | |
689 | ||
690 | OP_NOTPOSSTAR, /* 52 Possessified versions */ | |
691 | OP_NOTPOSPLUS, /* 53 */ | |
692 | OP_NOTPOSQUERY, /* 54 */ | |
693 | OP_NOTPOSUPTO, /* 55 */ | |
694 | ||
695 | OP_TYPESTAR, /* 56 The maximizing and minimizing versions of */ | |
696 | OP_TYPEMINSTAR, /* 57 these six opcodes must come in pairs, with */ | |
697 | OP_TYPEPLUS, /* 58 the minimizing one second. These codes must */ | |
698 | OP_TYPEMINPLUS, /* 59 be in exactly the same order as those above. */ | |
699 | OP_TYPEQUERY, /* 60 This set applies to character types such as \d */ | |
700 | OP_TYPEMINQUERY, /* 61 */ | |
701 | ||
702 | OP_TYPEUPTO, /* 62 From 0 to n matches */ | |
703 | OP_TYPEMINUPTO, /* 63 */ | |
704 | OP_TYPEEXACT, /* 64 Exactly n matches */ | |
705 | ||
706 | OP_TYPEPOSSTAR, /* 65 Possessified versions */ | |
707 | OP_TYPEPOSPLUS, /* 66 */ | |
708 | OP_TYPEPOSQUERY, /* 67 */ | |
709 | OP_TYPEPOSUPTO, /* 68 */ | |
710 | ||
711 | OP_CRSTAR, /* 69 The maximizing and minimizing versions of */ | |
712 | OP_CRMINSTAR, /* 70 all these opcodes must come in pairs, with */ | |
713 | OP_CRPLUS, /* 71 the minimizing one second. These codes must */ | |
714 | OP_CRMINPLUS, /* 72 be in exactly the same order as those above. */ | |
715 | OP_CRQUERY, /* 73 These are for character classes and back refs */ | |
716 | OP_CRMINQUERY, /* 74 */ | |
717 | OP_CRRANGE, /* 75 These are different to the three sets above. */ | |
718 | OP_CRMINRANGE, /* 76 */ | |
719 | ||
720 | OP_CLASS, /* 77 Match a character class, chars < 256 only */ | |
721 | OP_NCLASS, /* 78 Same, but the bitmap was created from a negative | |
8ac170f3 PH |
722 | class - the difference is relevant only when a UTF-8 |
723 | character > 255 is encountered. */ | |
724 | ||
64f2600a | 725 | OP_XCLASS, /* 79 Extended class for handling UTF-8 chars within the |
8ac170f3 PH |
726 | class. This does both positive and negative. */ |
727 | ||
64f2600a PH |
728 | OP_REF, /* 80 Match a back reference */ |
729 | OP_RECURSE, /* 81 Match a numbered subpattern (possibly recursive) */ | |
730 | OP_CALLOUT, /* 82 Call out to external function if provided */ | |
8ac170f3 | 731 | |
64f2600a PH |
732 | OP_ALT, /* 83 Start of alternation */ |
733 | OP_KET, /* 84 End of group that doesn't have an unbounded repeat */ | |
734 | OP_KETRMAX, /* 85 These two must remain together and in this */ | |
735 | OP_KETRMIN, /* 86 order. They are for groups that repeat for ever. */ | |
8ac170f3 | 736 | |
6bf342e1 | 737 | /* The assertions must come before BRA, CBRA, ONCE, and COND.*/ |
8ac170f3 | 738 | |
64f2600a PH |
739 | OP_ASSERT, /* 87 Positive lookahead */ |
740 | OP_ASSERT_NOT, /* 88 Negative lookahead */ | |
741 | OP_ASSERTBACK, /* 89 Positive lookbehind */ | |
742 | OP_ASSERTBACK_NOT, /* 90 Negative lookbehind */ | |
743 | OP_REVERSE, /* 91 Move pointer back - used in lookbehind assertions */ | |
8ac170f3 | 744 | |
6bf342e1 PH |
745 | /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, |
746 | as there's a test for >= ONCE for a subpattern that isn't an assertion. */ | |
8ac170f3 | 747 | |
64f2600a PH |
748 | OP_ONCE, /* 92 Atomic group */ |
749 | OP_BRA, /* 93 Start of non-capturing bracket */ | |
750 | OP_CBRA, /* 94 Start of capturing bracket */ | |
751 | OP_COND, /* 95 Conditional group */ | |
8ac170f3 | 752 | |
6bf342e1 PH |
753 | /* These three must follow the previous three, in the same order. There's a |
754 | check for >= SBRA to distinguish the two sets. */ | |
8ac170f3 | 755 | |
64f2600a PH |
756 | OP_SBRA, /* 96 Start of non-capturing bracket, check empty */ |
757 | OP_SCBRA, /* 97 Start of capturing bracket, check empty */ | |
758 | OP_SCOND, /* 98 Conditional group, check empty */ | |
8ac170f3 | 759 | |
64f2600a PH |
760 | OP_CREF, /* 99 Used to hold a capture number as condition */ |
761 | OP_RREF, /* 100 Used to hold a recursion number as condition */ | |
762 | OP_DEF, /* 101 The DEFINE condition */ | |
8ac170f3 | 763 | |
64f2600a PH |
764 | OP_BRAZERO, /* 102 These two must remain together and in this */ |
765 | OP_BRAMINZERO /* 103 order. */ | |
6bf342e1 | 766 | }; |
8ac170f3 PH |
767 | |
768 | ||
769 | /* This macro defines textual names for all the opcodes. These are used only | |
770 | for debugging. The macro is referenced only in pcre_printint.c. */ | |
771 | ||
772 | #define OP_NAME_LIST \ | |
64f2600a | 773 | "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ |
8ac170f3 | 774 | "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ |
64f2600a PH |
775 | "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ |
776 | "extuni", "\\Z", "\\z", \ | |
8ac170f3 PH |
777 | "Opt", "^", "$", "char", "charnc", "not", \ |
778 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ | |
6bf342e1 | 779 | "*+","++", "?+", "{", \ |
8ac170f3 | 780 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
6bf342e1 | 781 | "*+","++", "?+", "{", \ |
8ac170f3 | 782 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
6bf342e1 | 783 | "*+","++", "?+", "{", \ |
8ac170f3 PH |
784 | "*", "*?", "+", "+?", "?", "??", "{", "{", \ |
785 | "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \ | |
786 | "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ | |
6bf342e1 PH |
787 | "AssertB", "AssertB not", "Reverse", \ |
788 | "Once", "Bra 0", "Bra", "Cond", "SBra 0", "SBra", "SCond", \ | |
789 | "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero" | |
8ac170f3 PH |
790 | |
791 | ||
792 | /* This macro defines the length of fixed length operations in the compiled | |
793 | regex. The lengths are used when searching for specific things, and also in the | |
794 | debugging printing of a compiled regex. We use a macro so that it can be | |
795 | defined close to the definitions of the opcodes themselves. | |
796 | ||
797 | As things have been extended, some of these are no longer fixed lengths, but are | |
798 | minima instead. For example, the length of a single-character repeat may vary | |
799 | in UTF-8 mode. The code that uses this table must know about such things. */ | |
800 | ||
801 | #define OP_LENGTHS \ | |
802 | 1, /* End */ \ | |
64f2600a PH |
803 | 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ |
804 | 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ | |
8ac170f3 | 805 | 1, 1, /* Any, Anybyte */ \ |
64f2600a PH |
806 | 3, 3, 1, /* NOTPROP, PROP, \R */ \ |
807 | 1, 1, 1, 1, 1, /* \H, \h, \V, \v, EXTUNI */ \ | |
8ac170f3 PH |
808 | 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ |
809 | 2, /* Char - the minimum length */ \ | |
810 | 2, /* Charnc - the minimum length */ \ | |
811 | 2, /* not */ \ | |
812 | /* Positive single-char repeats ** These are */ \ | |
813 | 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ | |
814 | 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \ | |
6bf342e1 | 815 | 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \ |
8ac170f3 PH |
816 | /* Negative single-char repeats - only for chars < 256 */ \ |
817 | 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ | |
818 | 4, 4, 4, /* NOT upto, minupto, exact */ \ | |
6bf342e1 | 819 | 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \ |
8ac170f3 PH |
820 | /* Positive type repeats */ \ |
821 | 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ | |
822 | 4, 4, 4, /* Type upto, minupto, exact */ \ | |
6bf342e1 | 823 | 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \ |
8ac170f3 PH |
824 | /* Character class & ref repeats */ \ |
825 | 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ | |
826 | 5, 5, /* CRRANGE, CRMINRANGE */ \ | |
827 | 33, /* CLASS */ \ | |
828 | 33, /* NCLASS */ \ | |
829 | 0, /* XCLASS - variable length */ \ | |
830 | 3, /* REF */ \ | |
831 | 1+LINK_SIZE, /* RECURSE */ \ | |
832 | 2+2*LINK_SIZE, /* CALLOUT */ \ | |
833 | 1+LINK_SIZE, /* Alt */ \ | |
834 | 1+LINK_SIZE, /* Ket */ \ | |
835 | 1+LINK_SIZE, /* KetRmax */ \ | |
836 | 1+LINK_SIZE, /* KetRmin */ \ | |
837 | 1+LINK_SIZE, /* Assert */ \ | |
838 | 1+LINK_SIZE, /* Assert not */ \ | |
839 | 1+LINK_SIZE, /* Assert behind */ \ | |
840 | 1+LINK_SIZE, /* Assert behind not */ \ | |
841 | 1+LINK_SIZE, /* Reverse */ \ | |
6bf342e1 PH |
842 | 1+LINK_SIZE, /* ONCE */ \ |
843 | 1+LINK_SIZE, /* BRA */ \ | |
844 | 3+LINK_SIZE, /* CBRA */ \ | |
8ac170f3 | 845 | 1+LINK_SIZE, /* COND */ \ |
6bf342e1 PH |
846 | 1+LINK_SIZE, /* SBRA */ \ |
847 | 3+LINK_SIZE, /* SCBRA */ \ | |
848 | 1+LINK_SIZE, /* SCOND */ \ | |
8ac170f3 | 849 | 3, /* CREF */ \ |
6bf342e1 PH |
850 | 3, /* RREF */ \ |
851 | 1, /* DEF */ \ | |
8ac170f3 | 852 | 1, 1, /* BRAZERO, BRAMINZERO */ \ |
8ac170f3 PH |
853 | |
854 | ||
6bf342e1 | 855 | /* A magic value for OP_RREF to indicate the "any recursion" condition. */ |
8ac170f3 | 856 | |
6bf342e1 | 857 | #define RREF_ANY 0xffff |
8ac170f3 PH |
858 | |
859 | /* Error code numbers. They are given names so that they can more easily be | |
860 | tracked. */ | |
861 | ||
862 | enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, | |
863 | ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, | |
864 | ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, | |
865 | ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, | |
aa41d2de | 866 | ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, |
64f2600a | 867 | ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58 }; |
8ac170f3 PH |
868 | |
869 | /* The real format of the start of the pcre block; the index of names and the | |
870 | code vector run on as long as necessary after the end. We store an explicit | |
871 | offset to the name table so that if a regex is compiled on one host, saved, and | |
872 | then run on another where the size of pointers is different, all might still | |
873 | be well. For the case of compiled-on-4 and run-on-8, we include an extra | |
874 | pointer that is always NULL. For future-proofing, a few dummy fields were | |
875 | originally included - even though you can never get this planning right - but | |
876 | there is only one left now. | |
877 | ||
878 | NOTE NOTE NOTE: | |
879 | Because people can now save and re-use compiled patterns, any additions to this | |
880 | structure should be made at the end, and something earlier (e.g. a new | |
881 | flag in the options or one of the dummy fields) should indicate that the new | |
882 | fields are present. Currently PCRE always sets the dummy fields to zero. | |
883 | NOTE NOTE NOTE: | |
884 | */ | |
885 | ||
886 | typedef struct real_pcre { | |
887 | pcre_uint32 magic_number; | |
888 | pcre_uint32 size; /* Total that was malloced */ | |
889 | pcre_uint32 options; | |
890 | pcre_uint32 dummy1; /* For future use, maybe */ | |
891 | ||
892 | pcre_uint16 top_bracket; | |
893 | pcre_uint16 top_backref; | |
894 | pcre_uint16 first_byte; | |
895 | pcre_uint16 req_byte; | |
896 | pcre_uint16 name_table_offset; /* Offset to name table that follows */ | |
897 | pcre_uint16 name_entry_size; /* Size of any name items */ | |
898 | pcre_uint16 name_count; /* Number of name items */ | |
899 | pcre_uint16 ref_count; /* Reference count */ | |
900 | ||
901 | const unsigned char *tables; /* Pointer to tables or NULL for std */ | |
902 | const unsigned char *nullpad; /* NULL padding */ | |
903 | } real_pcre; | |
904 | ||
905 | /* The format of the block used to store data from pcre_study(). The same | |
906 | remark (see NOTE above) about extending this structure applies. */ | |
907 | ||
908 | typedef struct pcre_study_data { | |
909 | pcre_uint32 size; /* Total that was malloced */ | |
910 | pcre_uint32 options; | |
911 | uschar start_bits[32]; | |
912 | } pcre_study_data; | |
913 | ||
914 | /* Structure for passing "static" information around between the functions | |
915 | doing the compiling, so that they are thread-safe. */ | |
916 | ||
917 | typedef struct compile_data { | |
918 | const uschar *lcc; /* Points to lower casing table */ | |
919 | const uschar *fcc; /* Points to case-flipping table */ | |
920 | const uschar *cbits; /* Points to character type table */ | |
921 | const uschar *ctypes; /* Points to table of type maps */ | |
6bf342e1 | 922 | const uschar *start_workspace;/* The start of working space */ |
8ac170f3 PH |
923 | const uschar *start_code; /* The start of the compiled code */ |
924 | const uschar *start_pattern; /* The start of the pattern */ | |
6bf342e1 PH |
925 | const uschar *end_pattern; /* The end of the pattern */ |
926 | uschar *hwm; /* High watermark of workspace */ | |
8ac170f3 PH |
927 | uschar *name_table; /* The name/number table */ |
928 | int names_found; /* Number of entries so far */ | |
929 | int name_entry_size; /* Size of each entry */ | |
6bf342e1 | 930 | int bracount; /* Count of capturing parens */ |
8ac170f3 PH |
931 | int top_backref; /* Maximum back reference */ |
932 | unsigned int backref_map; /* Bitmap of low back refs */ | |
6bf342e1 | 933 | int external_options; /* External (initial) options */ |
8ac170f3 PH |
934 | int req_varyopt; /* "After variable item" flag for reqbyte */ |
935 | BOOL nopartial; /* Set TRUE if partial won't work */ | |
6bf342e1 PH |
936 | int nltype; /* Newline type */ |
937 | int nllen; /* Newline string length */ | |
938 | uschar nl[4]; /* Newline string when fixed length */ | |
8ac170f3 PH |
939 | } compile_data; |
940 | ||
941 | /* Structure for maintaining a chain of pointers to the currently incomplete | |
942 | branches, for testing for left recursion. */ | |
943 | ||
944 | typedef struct branch_chain { | |
945 | struct branch_chain *outer; | |
946 | uschar *current; | |
947 | } branch_chain; | |
948 | ||
949 | /* Structure for items in a linked list that represents an explicit recursive | |
950 | call within the pattern. */ | |
951 | ||
952 | typedef struct recursion_info { | |
953 | struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ | |
954 | int group_num; /* Number of group that was called */ | |
955 | const uschar *after_call; /* "Return value": points after the call in the expr */ | |
64f2600a | 956 | USPTR save_start; /* Old value of mstart */ |
8ac170f3 PH |
957 | int *offset_save; /* Pointer to start of saved offsets */ |
958 | int saved_max; /* Number of saved offsets */ | |
959 | } recursion_info; | |
960 | ||
6bf342e1 PH |
961 | /* Structure for building a chain of data for holding the values of the subject |
962 | pointer at the start of each subpattern, so as to detect when an empty string | |
963 | has been matched by a subpattern - to break infinite loops. */ | |
964 | ||
965 | typedef struct eptrblock { | |
966 | struct eptrblock *epb_prev; | |
967 | USPTR epb_saved_eptr; | |
968 | } eptrblock; | |
969 | ||
970 | ||
8ac170f3 PH |
971 | /* Structure for passing "static" information around between the functions |
972 | doing traditional NFA matching, so that they are thread-safe. */ | |
973 | ||
974 | typedef struct match_data { | |
aa41d2de PH |
975 | unsigned long int match_call_count; /* As it says */ |
976 | unsigned long int match_limit; /* As it says */ | |
977 | unsigned long int match_limit_recursion; /* As it says */ | |
8ac170f3 PH |
978 | int *offset_vector; /* Offset vector */ |
979 | int offset_end; /* One past the end */ | |
980 | int offset_max; /* The maximum usable for return data */ | |
6bf342e1 PH |
981 | int nltype; /* Newline type */ |
982 | int nllen; /* Newline string length */ | |
983 | uschar nl[4]; /* Newline string when fixed */ | |
8ac170f3 PH |
984 | const uschar *lcc; /* Points to lower casing table */ |
985 | const uschar *ctypes; /* Points to table of type maps */ | |
986 | BOOL offset_overflow; /* Set if too many extractions */ | |
987 | BOOL notbol; /* NOTBOL flag */ | |
988 | BOOL noteol; /* NOTEOL flag */ | |
989 | BOOL utf8; /* UTF8 flag */ | |
990 | BOOL endonly; /* Dollar not before final \n */ | |
991 | BOOL notempty; /* Empty string match not wanted */ | |
992 | BOOL partial; /* PARTIAL flag */ | |
993 | BOOL hitend; /* Hit the end of the subject at some point */ | |
994 | const uschar *start_code; /* For use when recursing */ | |
aa41d2de PH |
995 | USPTR start_subject; /* Start of the subject string */ |
996 | USPTR end_subject; /* End of the subject string */ | |
64f2600a | 997 | USPTR start_match_ptr; /* Start of matched string */ |
aa41d2de | 998 | USPTR end_match_ptr; /* Subject position at end match */ |
8ac170f3 PH |
999 | int end_offset_top; /* Highwater mark at end of match */ |
1000 | int capture_last; /* Most recent capture number */ | |
1001 | int start_offset; /* The start offset value */ | |
6bf342e1 PH |
1002 | eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ |
1003 | int eptrn; /* Next free eptrblock */ | |
8ac170f3 PH |
1004 | recursion_info *recursive; /* Linked list of recursion data */ |
1005 | void *callout_data; /* To pass back to callouts */ | |
8ac170f3 PH |
1006 | } match_data; |
1007 | ||
1008 | /* A similar structure is used for the same purpose by the DFA matching | |
1009 | functions. */ | |
1010 | ||
1011 | typedef struct dfa_match_data { | |
1012 | const uschar *start_code; /* Start of the compiled pattern */ | |
1013 | const uschar *start_subject; /* Start of the subject string */ | |
1014 | const uschar *end_subject; /* End of subject string */ | |
1015 | const uschar *tables; /* Character tables */ | |
1016 | int moptions; /* Match options */ | |
1017 | int poptions; /* Pattern options */ | |
6bf342e1 PH |
1018 | int nltype; /* Newline type */ |
1019 | int nllen; /* Newline string length */ | |
1020 | uschar nl[4]; /* Newline string when fixed */ | |
8ac170f3 PH |
1021 | void *callout_data; /* To pass back to callouts */ |
1022 | } dfa_match_data; | |
1023 | ||
1024 | /* Bit definitions for entries in the pcre_ctypes table. */ | |
1025 | ||
1026 | #define ctype_space 0x01 | |
1027 | #define ctype_letter 0x02 | |
1028 | #define ctype_digit 0x04 | |
1029 | #define ctype_xdigit 0x08 | |
1030 | #define ctype_word 0x10 /* alphameric or '_' */ | |
1031 | #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ | |
1032 | ||
1033 | /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set | |
1034 | of bits for a class map. Some classes are built by combining these tables. */ | |
1035 | ||
1036 | #define cbit_space 0 /* [:space:] or \s */ | |
1037 | #define cbit_xdigit 32 /* [:xdigit:] */ | |
1038 | #define cbit_digit 64 /* [:digit:] or \d */ | |
1039 | #define cbit_upper 96 /* [:upper:] */ | |
1040 | #define cbit_lower 128 /* [:lower:] */ | |
1041 | #define cbit_word 160 /* [:word:] or \w */ | |
1042 | #define cbit_graph 192 /* [:graph:] */ | |
1043 | #define cbit_print 224 /* [:print:] */ | |
1044 | #define cbit_punct 256 /* [:punct:] */ | |
1045 | #define cbit_cntrl 288 /* [:cntrl:] */ | |
1046 | #define cbit_length 320 /* Length of the cbits table */ | |
1047 | ||
1048 | /* Offsets of the various tables from the base tables pointer, and | |
1049 | total length. */ | |
1050 | ||
1051 | #define lcc_offset 0 | |
1052 | #define fcc_offset 256 | |
1053 | #define cbits_offset 512 | |
1054 | #define ctypes_offset (cbits_offset + cbit_length) | |
1055 | #define tables_length (ctypes_offset + 256) | |
1056 | ||
aa41d2de PH |
1057 | /* Layout of the UCP type table that translates property names into types and |
1058 | codes. */ | |
8ac170f3 PH |
1059 | |
1060 | typedef struct { | |
1061 | const char *name; | |
aa41d2de PH |
1062 | pcre_uint16 type; |
1063 | pcre_uint16 value; | |
8ac170f3 PH |
1064 | } ucp_type_table; |
1065 | ||
1066 | ||
1067 | /* Internal shared data tables. These are tables that are used by more than one | |
1068 | of the exported public functions. They have to be "external" in the C sense, | |
1069 | but are not part of the PCRE public API. The data for these tables is in the | |
1070 | pcre_tables.c module. */ | |
1071 | ||
1072 | extern const int _pcre_utf8_table1[]; | |
1073 | extern const int _pcre_utf8_table2[]; | |
1074 | extern const int _pcre_utf8_table3[]; | |
1075 | extern const uschar _pcre_utf8_table4[]; | |
1076 | ||
1077 | extern const int _pcre_utf8_table1_size; | |
1078 | ||
1079 | extern const ucp_type_table _pcre_utt[]; | |
1080 | extern const int _pcre_utt_size; | |
1081 | ||
1082 | extern const uschar _pcre_default_tables[]; | |
1083 | ||
1084 | extern const uschar _pcre_OP_lengths[]; | |
1085 | ||
1086 | ||
1087 | /* Internal shared functions. These are functions that are used by more than | |
1088 | one of the exported public functions. They have to be "external" in the C | |
1089 | sense, but are not part of the PCRE public API. */ | |
1090 | ||
64f2600a PH |
1091 | extern BOOL _pcre_is_newline(const uschar *, int, const uschar *, |
1092 | int *, BOOL); | |
6bf342e1 PH |
1093 | extern int _pcre_ord2utf8(int, uschar *); |
1094 | extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, | |
1095 | const pcre_study_data *, pcre_study_data *); | |
1096 | extern int _pcre_ucp_findprop(const unsigned int, int *, int *); | |
1097 | extern unsigned int _pcre_ucp_othercase(const unsigned int); | |
1098 | extern int _pcre_valid_utf8(const uschar *, int); | |
64f2600a PH |
1099 | extern BOOL _pcre_was_newline(const uschar *, int, const uschar *, |
1100 | int *, BOOL); | |
6bf342e1 | 1101 | extern BOOL _pcre_xclass(int, const uschar *); |
8ac170f3 | 1102 | |
aa41d2de PH |
1103 | #endif |
1104 | ||
8ac170f3 | 1105 | /* End of pcre_internal.h */ |