Commit | Line | Data |
---|---|---|
8ac170f3 PH |
1 | /* $Cambridge: exim/src/src/pcre/internal.h,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */ |
2 | ||
c86f6258 PH |
3 | /************************************************* |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | ||
8 | /* This is a library of functions to support regular expressions whose syntax | |
9 | and semantics are as close as possible to those of the Perl 5 language. See | |
10 | the file doc/Tech.Notes for some information on the internals. | |
11 | ||
12 | Written by: Philip Hazel <ph10@cam.ac.uk> | |
13 | ||
14 | Copyright (c) 1997-2004 University of Cambridge | |
15 | ||
16 | ----------------------------------------------------------------------------- | |
17 | Redistribution and use in source and binary forms, with or without | |
18 | modification, are permitted provided that the following conditions are met: | |
19 | ||
20 | * Redistributions of source code must retain the above copyright notice, | |
21 | this list of conditions and the following disclaimer. | |
22 | ||
23 | * Redistributions in binary form must reproduce the above copyright | |
24 | notice, this list of conditions and the following disclaimer in the | |
25 | documentation and/or other materials provided with the distribution. | |
26 | ||
27 | * Neither the name of the University of Cambridge nor the names of its | |
28 | contributors may be used to endorse or promote products derived from | |
29 | this software without specific prior written permission. | |
30 | ||
31 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
32 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
33 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
34 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
35 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
36 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
37 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
38 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
39 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
40 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
41 | POSSIBILITY OF SUCH DAMAGE. | |
42 | ----------------------------------------------------------------------------- | |
43 | */ | |
44 | ||
45 | /* This header contains definitions that are shared between the different | |
46 | modules, but which are not relevant to the outside. */ | |
47 | ||
48 | /* Get the definitions provided by running "configure" */ | |
49 | ||
50 | #include "config.h" | |
51 | ||
52 | /* Standard C headers plus the external interface definition. The only time | |
53 | setjmp and stdarg are used is when NO_RECURSE is set. */ | |
54 | ||
55 | #include <ctype.h> | |
56 | #include <limits.h> | |
57 | #include <setjmp.h> | |
58 | #include <stdarg.h> | |
59 | #include <stddef.h> | |
60 | #include <stdio.h> | |
61 | #include <stdlib.h> | |
62 | #include <string.h> | |
63 | ||
64 | #ifndef PCRE_SPY | |
65 | #define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */ | |
66 | #endif | |
67 | ||
68 | /* We need to have types that specify unsigned 16-bit and 32-bit integers. We | |
69 | cannot determine these outside the compilation (e.g. by running a program as | |
70 | part of "configure") because PCRE is often cross-compiled for use on other | |
71 | systems. Instead we make use of the maximum sizes that are available at | |
72 | preprocessor time in standard C environments. */ | |
73 | ||
/* pcre_uint16 is the first of unsigned short and unsigned int whose maximum
value, as reported by <limits.h>, is 65535. Compilation stops if neither
qualifies. */

#if USHRT_MAX == 0xffff
  typedef unsigned short pcre_uint16;
#else
#if UINT_MAX == 0xffff
  typedef unsigned int pcre_uint16;
#else
#error Cannot determine a type for 16-bit unsigned integers
#endif
#endif
81 | ||
/* pcre_uint32 is the first of unsigned int and unsigned long int whose
maximum value, as reported by <limits.h>, is 4294967295. Compilation stops if
neither qualifies.

The constants carry explicit unsigned suffixes: 4294967295 cannot be
represented in a signed 32-bit type, so an unsuffixed decimal constant relies
on how the preprocessor widens it (and draws warnings from some compilers). */

#if UINT_MAX == 4294967295U
  typedef unsigned int pcre_uint32;
#elif ULONG_MAX == 4294967295UL
  typedef unsigned long int pcre_uint32;
#else
#error Cannot determine a type for 32-bit unsigned integers
#endif
89 | ||
90 | /* All character handling must be done as unsigned characters. Otherwise there | |
91 | are problems with top-bit-set characters and functions such as isspace(). | |
92 | However, we leave the interface to the outside world as char *, because that | |
93 | should make things easier for callers. We define a short type for unsigned char | |
94 | to save lots of typing. I tried "uchar", but it causes problems on Digital | |
95 | Unix, where it is defined in sys/types, so use "uschar" instead. */ | |
96 | ||
97 | typedef unsigned char uschar; | |
98 | ||
99 | /* Include the public PCRE header */ | |
100 | ||
101 | #include "pcre.h" | |
102 | ||
103 | /* When compiling for use with the Virtual Pascal compiler, these functions | |
104 | need to have their names changed. PCRE must be compiled with the -DVPCOMPAT | |
105 | option on the command line. */ | |
106 | ||
107 | #ifdef VPCOMPAT | |
108 | #define strncmp(s1,s2,m) _strncmp(s1,s2,m) | |
109 | #define memcpy(d,s,n) _memcpy(d,s,n) | |
110 | #define memmove(d,s,n) _memmove(d,s,n) | |
111 | #define memset(s,c,n) _memset(s,c,n) | |
112 | #else /* VPCOMPAT */ | |
113 | ||
114 | /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), | |
115 | define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY | |
116 | is set. Otherwise, include an emulating function for those systems that have | |
117 | neither (there are some non-Unix environments where this is the case). This assumes | |
118 | that all calls to memmove are moving strings upwards in store, which is the | |
119 | case in PCRE. */ | |
120 | ||
121 | #if ! HAVE_MEMMOVE | |
122 | #undef memmove /* some systems may have a macro */ | |
123 | #if HAVE_BCOPY | |
124 | #define memmove(a, b, c) bcopy(b, a, c) | |
125 | #else /* HAVE_BCOPY */ | |
/* Emulation of memmove() for systems that provide neither memmove() nor
bcopy(). The bytes are copied from the highest address downwards, so for
overlapping regions the result is only correct when dest lies above src (data
moving upwards in store).

Arguments:
  dest    start of the destination region
  src     start of the source region
  n       number of bytes to copy

Returns:  dest, as the standard memmove() does

NOTE(review): this is a non-static function definition inside a header; if
the header is included from more than one translation unit on such a system
the definitions would clash - confirm whether it should be made static. */

void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
  size_t remaining;

  /* Start one past the end of each region and walk backwards. */
  dest += n;
  src += n;
  for (remaining = n; remaining > 0; --remaining) *(--dest) = *(--src);

  /* After n decrements dest is back at the start of the destination. */
  return dest;
}
134 | #define memmove(a, b, c) pcre_memmove(a, b, c) | |
135 | #endif /* not HAVE_BCOPY */ | |
136 | #endif /* not HAVE_MEMMOVE */ | |
137 | #endif /* not VPCOMPAT */ | |
138 | ||
139 | ||
140 | /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored | |
141 | in big-endian order) by default. These are used, for example, to link from the | |
142 | start of a subpattern to its alternatives and its end. The use of 2 bytes per | |
143 | offset limits the size of the compiled regex to around 64K, which is big enough | |
144 | for almost everybody. However, I received a request for an even bigger limit. | |
145 | For this reason, and also to make the code easier to maintain, the storing and | |
146 | loading of offsets from the byte string is now handled by the macros that are | |
147 | defined here. | |
148 | ||
149 | The macros are controlled by the value of LINK_SIZE. This defaults to 2 in | |
150 | the config.h file, but can be overridden by using -D on the command line. This | |
151 | is automated on Unix systems via the "configure" command. */ | |
152 | ||
153 | #if LINK_SIZE == 2 | |
154 | ||
155 | #define PUT(a,n,d) \ | |
156 | (a[n] = (d) >> 8), \ | |
157 | (a[(n)+1] = (d) & 255) | |
158 | ||
159 | #define GET(a,n) \ | |
160 | (((a)[n] << 8) | (a)[(n)+1]) | |
161 | ||
162 | #define MAX_PATTERN_SIZE (1 << 16) | |
163 | ||
164 | ||
165 | #elif LINK_SIZE == 3 | |
166 | ||
167 | #define PUT(a,n,d) \ | |
168 | (a[n] = (d) >> 16), \ | |
169 | (a[(n)+1] = (d) >> 8), \ | |
170 | (a[(n)+2] = (d) & 255) | |
171 | ||
172 | #define GET(a,n) \ | |
173 | (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) | |
174 | ||
175 | #define MAX_PATTERN_SIZE (1 << 24) | |
176 | ||
177 | ||
178 | #elif LINK_SIZE == 4 | |
179 | ||
180 | #define PUT(a,n,d) \ | |
181 | (a[n] = (d) >> 24), \ | |
182 | (a[(n)+1] = (d) >> 16), \ | |
183 | (a[(n)+2] = (d) >> 8), \ | |
184 | (a[(n)+3] = (d) & 255) | |
185 | ||
186 | #define GET(a,n) \ | |
187 | (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) | |
188 | ||
189 | #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ | |
190 | ||
191 | ||
192 | #else | |
193 | #error LINK_SIZE must be either 2, 3, or 4 | |
194 | #endif | |
195 | ||
196 | ||
197 | /* Convenience macro defined in terms of the others */ | |
198 | ||
199 | #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE | |
200 | ||
201 | ||
202 | /* PCRE uses some other 2-byte quantities that do not change when the size of | |
203 | offsets changes. These are used for repeat counts and for other things such as | |
204 | capturing parenthesis numbers in back references. */ | |
205 | ||
/* PUT2(a,n,d) stores d at a[n] and a[n+1], most significant byte first.
It expands to a single parenthesized comma expression (not two statements),
so it is safe as the body of an unbraced if/else/loop and composes correctly
inside PUT2INC. Arguments are evaluated more than once, so they must be free
of side effects. */

#define PUT2(a,n,d) \
  (((a)[n] = (d) >> 8), \
   ((a)[(n)+1] = (d) & 255))

/* GET2(a,n) reads back a 2-byte big-endian value stored at a[n]. */

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

/* PUT2INC(a,n,d) stores a 2-byte value and then advances the pointer a by 2.
The argument a must be a modifiable pointer. */

#define PUT2INC(a,n,d) (PUT2(a,n,d), (a) += 2)
214 | ||
215 | ||
216 | /* In case there is no definition of offsetof() provided - though any proper | |
217 | Standard C system should have one. */ | |
218 | ||
/* Fallback used only when the system headers have not already supplied
offsetof: the byte offset of field within p_type, computed from a null
pointer cast. */
#if !defined(offsetof)
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
#endif
222 | ||
223 | ||
224 | /* These are the public options that can change during matching. */ | |
225 | ||
/* The public options that are allowed to change while matching is under
way. */

#define PCRE_IMS \
  (PCRE_CASELESS | \
   PCRE_MULTILINE | \
   PCRE_DOTALL)

/* Private option flags. These are allocated from the most significant end of
the four option bytes, leaving the top bit unused so that the options can be
held in an int without becoming negative. The public options in pcre.h are
allocated from the least significant end; the two sets must not overlap. */

#define PCRE_FIRSTSET    0x40000000   /* first_byte is set */
#define PCRE_REQCHSET    0x20000000   /* req_byte is set */
#define PCRE_STARTLINE   0x10000000   /* start after \n for multiline */
#define PCRE_ICHANGED    0x08000000   /* i option changes within regex */
#define PCRE_NOPARTIAL   0x04000000   /* can't use partial with this regex */

/* Option bits kept in the "extra" block that pcre_study() produces. */

#define PCRE_STUDY_MAPPED   0x01      /* a map of starting chars exists */

/* Masks of the public options that are accepted at compile time, at run
time, and at study time, in that order. */

#define PUBLIC_OPTIONS \
  (PCRE_CASELESS | \
   PCRE_EXTENDED | \
   PCRE_ANCHORED | \
   PCRE_MULTILINE | \
   PCRE_DOTALL | \
   PCRE_DOLLAR_ENDONLY | \
   PCRE_EXTRA | \
   PCRE_UNGREEDY | \
   PCRE_UTF8 | \
   PCRE_NO_AUTO_CAPTURE | \
   PCRE_NO_UTF8_CHECK | \
   PCRE_AUTO_CALLOUT)

#define PUBLIC_EXEC_OPTIONS \
  (PCRE_ANCHORED | \
   PCRE_NOTBOL | \
   PCRE_NOTEOL | \
   PCRE_NOTEMPTY | \
   PCRE_NO_UTF8_CHECK | \
   PCRE_PARTIAL)

#define PUBLIC_STUDY_OPTIONS 0   /* None defined */
257 | ||
258 | /* Magic number to provide a small check against being handed junk. */ | |
259 | ||
#define MAGIC_NUMBER   0x50435245UL   /* 'PCRE' */

/* Negative values for the firstchar and reqchar variables. */

#define REQ_UNSET   (-2)
#define REQ_NONE    (-1)

/* Flag bits that are added to firstbyte or reqbyte. A "non-literal" item is
either a variable-length repeat or anything other than literal characters. */

#define REQ_CASELESS   0x0100   /* indicates caselessness */
#define REQ_VARY       0x0200   /* reqbyte followed non-literal item */

/* Miscellaneous definitions: a Boolean type held in an int, and its two
values. */

typedef int BOOL;

#define FALSE   0
#define TRUE    1
279 | ||
280 | /* Escape items that are just an encoding of a particular data value. Note that | |
281 | ESC_n is defined as yet another macro, which is set in config.h to either \n | |
282 | (the default) or \r (which some people want). */ | |
283 | ||
/* Each value below is defined only if no definition is already in force. */

#if !defined(ESC_e)
#define ESC_e 27
#endif

#if !defined(ESC_f)
#define ESC_f '\f'
#endif

/* ESC_n is routed through the NEWLINE macro rather than being a literal. */

#if !defined(ESC_n)
#define ESC_n NEWLINE
#endif

#if !defined(ESC_r)
#define ESC_r '\r'
#endif

/* The name ESC_t cannot officially be used because it is a POSIX reserved
identifier (presumably because of all the others like size_t), hence the
spelling ESC_tee. */

#if !defined(ESC_tee)
#define ESC_tee '\t'
#endif
306 | ||
307 | /* These are escaped items that aren't just an encoding of a particular data | |
308 | value such as \n. They must have non-zero values, as check_escape() returns | |
309 | their negation. Also, they must appear in the same order as in the opcode | |
310 | definitions below, up to ESC_z. There's a dummy for OP_ANY because it | |
311 | corresponds to "." rather than an escape sequence. The final one must be | |
312 | ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two | |
313 | tests in the code for an escape greater than ESC_b and less than ESC_Z to | |
314 | detect the types that may be repeated. These are the types that consume | |
315 | characters. If any new escapes are put in between that don't consume a | |
316 | character, that code will have to change. */ | |
317 | ||
enum {
  ESC_A = 1,   /* Numbering starts at 1 so that no value is zero */
  ESC_G,
  ESC_B,
  ESC_b,
  ESC_D,
  ESC_d,
  ESC_S,
  ESC_s,
  ESC_W,
  ESC_w,
  ESC_dum1,    /* Dummy that lines up with OP_ANY, which is "." not an escape */
  ESC_C,
  ESC_P,
  ESC_p,
  ESC_X,
  ESC_Z,
  ESC_z,       /* Last value that has a matching opcode */
  ESC_E,
  ESC_Q,
  ESC_REF      /* Must be last: later values stand for \1, \2, \3, etc. */
};
321 | ||
322 | /* Flag bits and data types for the extended class (OP_XCLASS) for classes that | |
323 | contain UTF-8 characters with values greater than 255. */ | |
324 | ||
/* Flag bits */

#define XCL_NOT       0x01   /* Flag: this is a negative class */
#define XCL_MAP       0x02   /* Flag: a 32-byte map is present */

/* Data item types */

#define XCL_END       0      /* Marks end of individual items */
#define XCL_SINGLE    1      /* Single item (one multibyte char) follows */
#define XCL_RANGE     2      /* A range (two multibyte chars) follows */
#define XCL_PROP      3      /* Unicode property (one property code) follows */
#define XCL_NOTPROP   4      /* Unicode inverted property (ditto) */
333 | ||
334 | ||
335 | /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets | |
336 | that extract substrings. Starting from 1 (i.e. after OP_END), the values up to | |
337 | OP_EOD must correspond in order to the list of escapes immediately above. | |
338 | Note that whenever this list is updated, the two macro definitions that follow | |
339 | must also be updated to match. */ | |
340 | ||
enum {
  OP_END = 0,            /*  0  End of pattern */

  /* Opcodes for backslashed metacharacters. Values 1 to 17 follow the order
  of the ESC_xxx escape numbers. */

  OP_SOD,                /*  1  \A - start of data */
  OP_SOM,                /*  2  \G - start of match (subject + offset) */
  OP_NOT_WORD_BOUNDARY,  /*  3  \B */
  OP_WORD_BOUNDARY,      /*  4  \b */
  OP_NOT_DIGIT,          /*  5  \D */
  OP_DIGIT,              /*  6  \d */
  OP_NOT_WHITESPACE,     /*  7  \S */
  OP_WHITESPACE,         /*  8  \s */
  OP_NOT_WORDCHAR,       /*  9  \W */
  OP_WORDCHAR,           /* 10  \w */
  OP_ANY,                /* 11  Match any character */
  OP_ANYBYTE,            /* 12  \C - match any byte; different to OP_ANY for UTF-8 */
  OP_NOTPROP,            /* 13  \P - not Unicode property */
  OP_PROP,               /* 14  \p - Unicode property */
  OP_EXTUNI,             /* 15  \X - extended Unicode sequence */
  OP_EODN,               /* 16  \Z - end of data, or \n at end of data */
  OP_EOD,                /* 17  \z - end of data */

  OP_OPT,                /* 18  Set runtime options */
  OP_CIRC,               /* 19  Start of line - varies with multiline switch */
  OP_DOLL,               /* 20  End of line - varies with multiline switch */
  OP_CHAR,               /* 21  Match one character, casefully */
  OP_CHARNC,             /* 22  Match one character, caselessly */
  OP_NOT,                /* 23  Match anything but the following char */

  /* Repeats of single characters. In this set and the three that follow, the
  maximizing and minimizing versions of each opcode must come in pairs, with
  the minimizing one second. */

  OP_STAR,               /* 24 */
  OP_MINSTAR,            /* 25 */
  OP_PLUS,               /* 26 */
  OP_MINPLUS,            /* 27 */
  OP_QUERY,              /* 28 */
  OP_MINQUERY,           /* 29 */
  OP_UPTO,               /* 30  From 0 to n matches */
  OP_MINUPTO,            /* 31 */
  OP_EXACT,              /* 32  Exactly n matches */

  /* Repeats of "not" single characters. */

  OP_NOTSTAR,            /* 33 */
  OP_NOTMINSTAR,         /* 34 */
  OP_NOTPLUS,            /* 35 */
  OP_NOTMINPLUS,         /* 36 */
  OP_NOTQUERY,           /* 37 */
  OP_NOTMINQUERY,        /* 38 */
  OP_NOTUPTO,            /* 39  From 0 to n matches */
  OP_NOTMINUPTO,         /* 40 */
  OP_NOTEXACT,           /* 41  Exactly n matches */

  /* Repeats of character types such as \d. These codes must be in exactly
  the same order as those above. */

  OP_TYPESTAR,           /* 42 */
  OP_TYPEMINSTAR,        /* 43 */
  OP_TYPEPLUS,           /* 44 */
  OP_TYPEMINPLUS,        /* 45 */
  OP_TYPEQUERY,          /* 46 */
  OP_TYPEMINQUERY,       /* 47 */
  OP_TYPEUPTO,           /* 48  From 0 to n matches */
  OP_TYPEMINUPTO,        /* 49 */
  OP_TYPEEXACT,          /* 50  Exactly n matches */

  /* Repeats of character classes and back references. The first six must be
  in exactly the same order as those above; the two range opcodes are
  different to the three sets above. */

  OP_CRSTAR,             /* 51 */
  OP_CRMINSTAR,          /* 52 */
  OP_CRPLUS,             /* 53 */
  OP_CRMINPLUS,          /* 54 */
  OP_CRQUERY,            /* 55 */
  OP_CRMINQUERY,         /* 56 */
  OP_CRRANGE,            /* 57 */
  OP_CRMINRANGE,         /* 58 */

  OP_CLASS,              /* 59  Match a character class, chars < 256 only */
  OP_NCLASS,             /* 60  Same, but the bitmap was created from a
                                negative class - the difference is relevant
                                only when a UTF-8 character > 255 is
                                encountered. */
  OP_XCLASS,             /* 61  Extended class for handling UTF-8 chars within
                                the class. This does both positive and
                                negative. */

  OP_REF,                /* 62  Match a back reference */
  OP_RECURSE,            /* 63  Match a numbered subpattern (possibly recursive) */
  OP_CALLOUT,            /* 64  Call out to external function if provided */

  OP_ALT,                /* 65  Start of alternation */
  OP_KET,                /* 66  End of group that doesn't have an unbounded repeat */
  OP_KETRMAX,            /* 67  These two must remain together and in this */
  OP_KETRMIN,            /* 68  order. They are for groups that repeat for ever. */

  /* The assertions must come before ONCE and COND. */

  OP_ASSERT,             /* 69  Positive lookahead */
  OP_ASSERT_NOT,         /* 70  Negative lookahead */
  OP_ASSERTBACK,         /* 71  Positive lookbehind */
  OP_ASSERTBACK_NOT,     /* 72  Negative lookbehind */
  OP_REVERSE,            /* 73  Move pointer back - used in lookbehind assertions */

  /* ONCE and COND must come after the assertions, with ONCE first, because
  there is a test for >= ONCE for a subpattern that isn't an assertion. */

  OP_ONCE,               /* 74  Once matched, don't back up into the subpattern */
  OP_COND,               /* 75  Conditional group */
  OP_CREF,               /* 76  Used to hold an extraction string number (cond ref) */

  OP_BRAZERO,            /* 77  These two must remain together and in this */
  OP_BRAMINZERO,         /* 78  order. */

  OP_BRANUMBER,          /* 79  Used for extracting brackets whose number is
                                greater than can fit into an opcode. */

  OP_BRA                 /* 80  Must be last. This and greater values are used
                                for brackets that extract substrings up to
                                EXTRACT_BASIC_MAX. After that, use is made of
                                OP_BRANUMBER. */
};
452 | ||
453 | /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and | |
454 | study.c that all opcodes are less than 128 in value. This makes handling UTF-8 | |
455 | character sequences easier. */ | |
456 | ||
457 | /* The highest extraction number before we have to start using additional | |
458 | bytes. (Originally PCRE didn't have support for extraction counts higher than | |
459 | this number.) The value is limited by the number of opcodes left after OP_BRA, | |
460 | i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional | |
461 | opcodes. */ | |
462 | ||
463 | #define EXTRACT_BASIC_MAX 100 | |
464 | ||
465 | ||
466 | /* This macro defines textual names for all the opcodes. These are used only | |
467 | for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The | |
468 | macro is referenced only in printint.c. */ | |
469 | ||
#define OP_NAME_LIST \
  "End",                                                   /* 0      */ \
  "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                /* 1-6    */ \
  "\\S", "\\s", "\\W", "\\w",                              /* 7-10   */ \
  "Any", "Anybyte", "notprop", "prop", "extuni",           /* 11-15  */ \
  "\\Z", "\\z",                                            /* 16-17  */ \
  "Opt", "^", "$", "char", "charnc", "not",                /* 18-23  */ \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",          /* 24-32  */ \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",          /* 33-41  */ \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",          /* 42-50  */ \
  "*", "*?", "+", "+?", "?", "??", "{", "{",               /* 51-58  */ \
  "class", "nclass", "xclass",                             /* 59-61  */ \
  "Ref", "Recurse", "Callout",                             /* 62-64  */ \
  "Alt", "Ket", "KetRmax", "KetRmin",                      /* 65-68  */ \
  "Assert", "Assert not", "AssertB", "AssertB not",        /* 69-72  */ \
  "Reverse", "Once", "Cond", "Cond ref",                   /* 73-76  */ \
  "Brazero", "Braminzero", "Branumber", "Bra"              /* 77-80  */
484 | ||
485 | ||
486 | /* This macro defines the length of fixed length operations in the compiled | |
487 | regex. The lengths are used when searching for specific things, and also in the | |
488 | debugging printing of a compiled regex. We use a macro so that it can be | |
489 | incorporated both into pcre.c and pcretest.c without being publicly exposed. | |
490 | ||
491 | As things have been extended, some of these are no longer fixed lengths, but are | |
492 | minima instead. For example, the length of a single-character repeat may vary | |
493 | in UTF-8 mode. The code that uses this table must know about such things. */ | |
494 | ||
/* One entry per opcode, in opcode order, giving the length in bytes of the
item in the compiled code (or the minimum length where the comment says so).
The final entry deliberately has no continuation backslash, so the macro ends
on its own last line and does not absorb whatever follows it in the file. */

#define OP_LENGTHS \
  1,                             /* End                                      */ \
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w   */ \
  1, 1,                          /* Any, Anybyte                             */ \
  2, 2, 1,                       /* NOTPROP, PROP, EXTUNI                    */ \
  1, 1, 2, 1, 1,                 /* \Z, \z, Opt, ^, $                        */ \
  2,                             /* Char  - the minimum length               */ \
  2,                             /* Charnc  - the minimum length             */ \
  2,                             /* not                                      */ \
  /* Positive single-char repeats - these are minima in UTF-8 mode           */ \
  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??                      */ \
  4, 4, 4,                       /* upto, minupto, exact                     */ \
  /* Negative single-char repeats - only for chars < 256                     */ \
  2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ??                  */ \
  4, 4, 4,                       /* NOT upto, minupto, exact                 */ \
  /* Positive type repeats                                                   */ \
  2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ??                 */ \
  4, 4, 4,                       /* Type upto, minupto, exact                */ \
  /* Character class & ref repeats                                           */ \
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                      */ \
  5, 5,                          /* CRRANGE, CRMINRANGE                      */ \
 33,                             /* CLASS                                    */ \
 33,                             /* NCLASS                                   */ \
  0,                             /* XCLASS - variable length                 */ \
  3,                             /* REF                                      */ \
  1+LINK_SIZE,                   /* RECURSE                                  */ \
  2+2*LINK_SIZE,                 /* CALLOUT                                  */ \
  1+LINK_SIZE,                   /* Alt                                      */ \
  1+LINK_SIZE,                   /* Ket                                      */ \
  1+LINK_SIZE,                   /* KetRmax                                  */ \
  1+LINK_SIZE,                   /* KetRmin                                  */ \
  1+LINK_SIZE,                   /* Assert                                   */ \
  1+LINK_SIZE,                   /* Assert not                               */ \
  1+LINK_SIZE,                   /* Assert behind                            */ \
  1+LINK_SIZE,                   /* Assert behind not                        */ \
  1+LINK_SIZE,                   /* Reverse                                  */ \
  1+LINK_SIZE,                   /* Once                                     */ \
  1+LINK_SIZE,                   /* COND                                     */ \
  3,                             /* CREF                                     */ \
  1, 1,                          /* BRAZERO, BRAMINZERO                      */ \
  3,                             /* BRANUMBER                                */ \
  1+LINK_SIZE                    /* BRA                                      */
537 | ||
538 | ||
539 | /* A magic value for OP_CREF to indicate the "in recursion" condition. */ | |
540 | ||
541 | #define CREF_RECURSE 0xffff | |
542 | ||
543 | /* The texts of compile-time error messages are defined as macros here so that | |
544 | they can be accessed by the POSIX wrapper and converted into error codes. Yes, | |
545 | I could have used error codes in the first place, but didn't feel like changing | |
546 | just to accommodate the POSIX wrapper. */ | |
547 | ||
/* One macro per compile-time error text, numbered consecutively from 1. */

#define ERR1    "\\ at end of pattern"
#define ERR2    "\\c at end of pattern"
#define ERR3    "unrecognized character follows \\"
#define ERR4    "numbers out of order in {} quantifier"
#define ERR5    "number too big in {} quantifier"
#define ERR6    "missing terminating ] for character class"
#define ERR7    "invalid escape sequence in character class"
#define ERR8    "range out of order in character class"
#define ERR9    "nothing to repeat"

#define ERR10   "operand of unlimited repeat could match the empty string"
#define ERR11   "internal error: unexpected repeat"
#define ERR12   "unrecognized character after (?"
#define ERR13   "POSIX named classes are supported only within a class"
#define ERR14   "missing )"
#define ERR15   "reference to non-existent subpattern"
#define ERR16   "erroffset passed as NULL"
#define ERR17   "unknown option bit(s) set"
#define ERR18   "missing ) after comment"
#define ERR19   "parentheses nested too deeply"

#define ERR20   "regular expression too large"
#define ERR21   "failed to get memory"
#define ERR22   "unmatched parentheses"
#define ERR23   "internal error: code overflow"
#define ERR24   "unrecognized character after (?<"
#define ERR25   "lookbehind assertion is not fixed length"
#define ERR26   "malformed number after (?("
#define ERR27   "conditional group contains more than two branches"
#define ERR28   "assertion expected after (?("
#define ERR29   "(?R or (?digits must be followed by )"

#define ERR30   "unknown POSIX class name"
#define ERR31   "POSIX collating elements are not supported"
#define ERR32   "this version of PCRE is not compiled with PCRE_UTF8 support"
#define ERR33   "spare error"
#define ERR34   "character value in \\x{...} sequence is too large"
#define ERR35   "invalid condition (?(0)"
#define ERR36   "\\C not allowed in lookbehind assertion"
#define ERR37   "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
#define ERR38   "number after (?C is > 255"
#define ERR39   "closing ) for (?C expected"

#define ERR40   "recursive call could loop indefinitely"
#define ERR41   "unrecognized character after (?P"
#define ERR42   "syntax error after (?P"
#define ERR43   "two named groups have the same name"
#define ERR44   "invalid UTF-8 string"
#define ERR45   "support for \\P, \\p, and \\X has not been compiled"
#define ERR46   "malformed \\P or \\p sequence"
#define ERR47   "unknown property name after \\P or \\p"
595 | ||
596 | /* The real format of the start of the pcre block; the index of names and the | |
597 | code vector run on as long as necessary after the end. We store an explicit | |
598 | offset to the name table so that if a regex is compiled on one host, saved, and | |
599 | then run on another where the size of pointers is different, all might still | |
600 | be well. For the case of compiled-on-4 and run-on-8, we include an extra | |
601 | pointer that is always NULL. For future-proofing, we also include a few dummy | |
602 | fields - even though you can never get this planning right! | |
603 | ||
604 | NOTE NOTE NOTE: | |
605 | Because people can now save and re-use compiled patterns, any additions to this | |
606 | structure should be made at the end, and something earlier (e.g. a new | |
607 | flag in the options or one of the dummy fields) should indicate that the new | |
608 | fields are present. Currently PCRE always sets the dummy fields to zero. | |
609 | NOTE NOTE NOTE: | |
610 | */ | |
611 | ||
612 | typedef struct real_pcre { | |
613 | pcre_uint32 magic_number; | |
614 | pcre_uint32 size; /* Total that was malloced */ | |
615 | pcre_uint32 options; | |
616 | pcre_uint32 dummy1; /* For future use, maybe */ | |
617 | ||
618 | pcre_uint16 top_bracket; | |
619 | pcre_uint16 top_backref; | |
620 | pcre_uint16 first_byte; | |
621 | pcre_uint16 req_byte; | |
622 | pcre_uint16 name_table_offset; /* Offset to name table that follows */ | |
623 | pcre_uint16 name_entry_size; /* Size of any name items */ | |
624 | pcre_uint16 name_count; /* Number of name items */ | |
625 | pcre_uint16 dummy2; /* For future use, maybe */ | |
626 | ||
627 | const unsigned char *tables; /* Pointer to tables or NULL for std */ | |
628 | const unsigned char *nullpad; /* NULL padding */ | |
629 | } real_pcre; | |
630 | ||
631 | /* The format of the block used to store data from pcre_study(). The same | |
632 | remark (see NOTE above) about extending this structure applies. */ | |
633 | ||
634 | typedef struct pcre_study_data { | |
635 | pcre_uint32 size; /* Total that was malloced */ | |
636 | pcre_uint32 options; | |
637 | uschar start_bits[32]; | |
638 | } pcre_study_data; | |
639 | ||
640 | /* Structure for passing "static" information around between the functions | |
641 | doing the compiling, so that they are thread-safe. */ | |
642 | ||
643 | typedef struct compile_data { | |
644 | const uschar *lcc; /* Points to lower casing table */ | |
645 | const uschar *fcc; /* Points to case-flipping table */ | |
646 | const uschar *cbits; /* Points to character type table */ | |
647 | const uschar *ctypes; /* Points to table of type maps */ | |
648 | const uschar *start_code; /* The start of the compiled code */ | |
649 | const uschar *start_pattern; /* The start of the pattern */ | |
650 | uschar *name_table; /* The name/number table */ | |
651 | int names_found; /* Number of entries so far */ | |
652 | int name_entry_size; /* Size of each entry */ | |
653 | int top_backref; /* Maximum back reference */ | |
654 | unsigned int backref_map; /* Bitmap of low back refs */ | |
655 | int req_varyopt; /* "After variable item" flag for reqbyte */ | |
656 | BOOL nopartial; /* Set TRUE if partial won't work */ | |
657 | } compile_data; | |
658 | ||
659 | /* Structure for maintaining a chain of pointers to the currently incomplete | |
660 | branches, for testing for left recursion. */ | |
661 | ||
662 | typedef struct branch_chain { | |
663 | struct branch_chain *outer; | |
664 | uschar *current; | |
665 | } branch_chain; | |
666 | ||
667 | /* Structure for items in a linked list that represents an explicit recursive | |
668 | call within the pattern. Records are linked through prevrec, which is NULL when there is no previous record. */ | |
669 | ||
670 | typedef struct recursion_info { | |
671 | struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ | |
672 | int group_num; /* Number of group that was called */ | |
673 | const uschar *after_call; /* "Return value": points after the call in the expr */ | |
674 | const uschar *save_start; /* Old value of md->start_match (md presumably being the match_data block) */ | |
675 | int *offset_save; /* Pointer to start of saved offsets */ | |
676 | int saved_max; /* Number of saved offsets */ | |
677 | } recursion_info; | |
678 | ||
679 | /* When compiling in a mode that doesn't use recursive calls to match(), | |
680 | a structure is used to remember local variables on the heap. It is defined in | |
681 | pcre.c, close to the match() function, so that it is easy to keep it in step | |
682 | with any changes to the local variables. However, the pointer to the current frame | |
683 | must be saved in some "static" place over a longjmp(). We declare the | |
684 | structure here, as an incomplete type only, so that we can put a pointer in the match_data structure. | |
685 | NOTE: This isn't used for a "normal" compilation of pcre. */ | |
686 | ||
687 | struct heapframe; | |
688 | ||
689 | /* Per-match context. It carries what would otherwise be "static" state | |
690 | between the functions doing the matching, so that they are thread-safe. */ | |
691 | ||
692 | typedef struct match_data { | |
693 | unsigned long int match_call_count; /* Running call count (presumably calls to match() - TODO confirm) */ | |
694 | unsigned long int match_limit;/* Limit value (presumably the cap applied to match_call_count - TODO confirm) */ | |
695 | int *offset_vector; /* Offset vector */ | |
696 | int offset_end; /* One past the end (presumably of offset_vector) */ | |
697 | int offset_max; /* The maximum usable for return data */ | |
698 | const uschar *lcc; /* Points to lower casing table */ | |
699 | const uschar *ctypes; /* Points to table of type maps */ | |
700 | BOOL offset_overflow; /* Set if too many extractions */ | |
701 | BOOL notbol; /* NOTBOL flag */ | |
702 | BOOL noteol; /* NOTEOL flag */ | |
703 | BOOL utf8; /* UTF8 flag */ | |
704 | BOOL endonly; /* Dollar not before final \n */ | |
705 | BOOL notempty; /* Empty string match not wanted */ | |
706 | BOOL partial; /* PARTIAL flag */ | |
707 | BOOL hitend; /* Hit the end of the subject at some point */ | |
708 | const uschar *start_code; /* For use when recursing */ | |
709 | const uschar *start_subject; /* Start of the subject string */ | |
710 | const uschar *end_subject; /* End of the subject string */ | |
711 | const uschar *start_match; /* Start of this match attempt */ | |
712 | const uschar *end_match_ptr; /* Subject position at end match */ | |
713 | int end_offset_top; /* Highwater mark at end of match */ | |
714 | int capture_last; /* Most recent capture number */ | |
715 | int start_offset; /* The start offset value */ | |
716 | recursion_info *recursive; /* Linked list of recursion data */ | |
717 | void *callout_data; /* To pass back to callouts */ | |
718 | struct heapframe *thisframe; /* Pointer to a heapframe; used only when compiling for no recursion */ | |
719 | } match_data; | |
720 | ||
721 | /* Bit definitions for entries in the pcre_ctypes table. Each value is a distinct single bit, so one entry can carry several of them; 0x20 and 0x40 are not assigned here. */ | |
722 | ||
723 | #define ctype_space 0x01 | |
724 | #define ctype_letter 0x02 | |
725 | #define ctype_digit 0x04 | |
726 | #define ctype_xdigit 0x08 | |
727 | #define ctype_word 0x10 /* alphanumeric or '_' */ | |
728 | #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ | |
729 | ||
730 | /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set | |
731 | of bits for a class map. Some classes are built by combining these tables. The offsets are 32 apart, and the ten tables together account for cbit_length (10 x 32 = 320). */ | |
732 | ||
733 | #define cbit_space 0 /* [:space:] or \s */ | |
734 | #define cbit_xdigit 32 /* [:xdigit:] */ | |
735 | #define cbit_digit 64 /* [:digit:] or \d */ | |
736 | #define cbit_upper 96 /* [:upper:] */ | |
737 | #define cbit_lower 128 /* [:lower:] */ | |
738 | #define cbit_word 160 /* [:word:] or \w */ | |
739 | #define cbit_graph 192 /* [:graph:] */ | |
740 | #define cbit_print 224 /* [:print:] */ | |
741 | #define cbit_punct 256 /* [:punct:] */ | |
742 | #define cbit_cntrl 288 /* [:cntrl:] */ | |
743 | #define cbit_length 320 /* Length of the cbits table */ | |
744 | ||
745 | /* Offsets of the various tables from the base tables pointer, and | |
746 | total length. The first two tables are 256 apart (lcc at 0, fcc at 256), the cbits tables start at 512 and span cbit_length (320), and ctypes takes the final 256, so ctypes_offset is 832 and tables_length is 1088. */ | |
747 | ||
748 | #define lcc_offset 0 | |
749 | #define fcc_offset 256 | |
750 | #define cbits_offset 512 | |
751 | #define ctypes_offset (cbits_offset + cbit_length) | |
752 | #define tables_length (ctypes_offset + 256) | |
753 | ||
754 | /* End of internal.h */ |