Commit | Line | Data |
---|---|---|
c86f6258 PH |
1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * | |
3 | *************************************************/ | |
4 | ||
5 | ||
6 | /* This is a library of functions to support regular expressions whose syntax | |
7 | and semantics are as close as possible to those of the Perl 5 language. See | |
8 | the file doc/Tech.Notes for some information on the internals. | |
9 | ||
10 | Written by: Philip Hazel <ph10@cam.ac.uk> | |
11 | ||
12 | Copyright (c) 1997-2004 University of Cambridge | |
13 | ||
14 | ----------------------------------------------------------------------------- | |
15 | Redistribution and use in source and binary forms, with or without | |
16 | modification, are permitted provided that the following conditions are met: | |
17 | ||
18 | * Redistributions of source code must retain the above copyright notice, | |
19 | this list of conditions and the following disclaimer. | |
20 | ||
21 | * Redistributions in binary form must reproduce the above copyright | |
22 | notice, this list of conditions and the following disclaimer in the | |
23 | documentation and/or other materials provided with the distribution. | |
24 | ||
25 | * Neither the name of the University of Cambridge nor the names of its | |
26 | contributors may be used to endorse or promote products derived from | |
27 | this software without specific prior written permission. | |
28 | ||
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
39 | POSSIBILITY OF SUCH DAMAGE. | |
40 | ----------------------------------------------------------------------------- | |
41 | */ | |
42 | ||
43 | /* This header contains definitions that are shared between the different | |
44 | modules, but which are not relevant to the outside. */ | |
45 | ||
46 | /* Get the definitions provided by running "configure" */ | |
47 | ||
48 | #include "config.h" | |
49 | ||
50 | /* Standard C headers plus the external interface definition. The only time | |
51 | setjmp and stdarg are used is when NO_RECURSE is set. */ | |
52 | ||
53 | #include <ctype.h> | |
54 | #include <limits.h> | |
55 | #include <setjmp.h> | |
56 | #include <stdarg.h> | |
57 | #include <stddef.h> | |
58 | #include <stdio.h> | |
59 | #include <stdlib.h> | |
60 | #include <string.h> | |
61 | ||
/* PCRE_DEFINITION is the Win32 __declspec(export) trigger for building a .dll.
It is defined here unless PCRE_SPY has been set. */

#if !defined(PCRE_SPY)
#define PCRE_DEFINITION
#endif
65 | ||
/* Types for unsigned 16-bit and 32-bit integers are required. They cannot be
worked out by running a test program from "configure", because PCRE is often
cross-compiled for another system. The choice is therefore made from the
maximum values that <limits.h> makes visible to the preprocessor in any
standard C environment. */

/* 16 bits: unsigned short is preferred, then unsigned int. */

#if USHRT_MAX != 65535
#  if UINT_MAX != 65535
#    error Cannot determine a type for 16-bit unsigned integers
#  else
typedef unsigned int pcre_uint16;
#  endif
#else
typedef unsigned short pcre_uint16;
#endif
79 | ||
/* 32 bits: unsigned int is preferred, then unsigned long int. The limits come
from <limits.h>, so the test is made entirely by the preprocessor. */

#if UINT_MAX != 4294967295
#  if ULONG_MAX != 4294967295
#    error Cannot determine a type for 32-bit unsigned integers
#  else
typedef unsigned long int pcre_uint32;
#  endif
#else
typedef unsigned int pcre_uint32;
#endif
87 | ||
/* Characters are always processed as unsigned values; with plain char there
are problems with top-bit-set characters and functions such as isspace(). The
external interface nevertheless stays as char *, which should be easier for
callers. A short name for unsigned char saves a lot of typing. "uchar" was
tried first, but Digital Unix already defines it in sys/types, hence
"uschar". */

typedef unsigned char uschar;
96 | ||
97 | /* Include the public PCRE header */ | |
98 | ||
99 | #include "pcre.h" | |
100 | ||
/* When compiling for use with the Virtual Pascal compiler, these functions
need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
option on the command line. */

#ifdef VPCOMPAT
#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
#define memcpy(d,s,n)    _memcpy(d,s,n)
#define memmove(d,s,n)   _memmove(d,s,n)
#define memset(s,c,n)    _memset(s,c,n)
#else  /* VPCOMPAT */

/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
is set. Otherwise, include an emulating function for those systems that have
neither (there are some non-Unix environments where this is the case). */

#if ! HAVE_MEMMOVE
#undef  memmove                  /* some systems may have a macro */
#if HAVE_BCOPY
#define memmove(a, b, c) bcopy(b, a, c)
#else  /* HAVE_BCOPY */

/* Emulation of memmove() for systems that have neither memmove() nor bcopy().

Arguments:
  dest    where to copy to
  src     where to copy from
  n       number of bytes to copy

Returns:  dest, as the real memmove() does

The regions may overlap. When dest is above src the bytes are copied from the
top downwards, otherwise from the bottom upwards, so that no source byte is
overwritten before it has been read. The function is static because this header
is included by more than one source file; an external definition here would be
defined once per including file and clash at link time. */

static void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
  size_t i;                      /* size_t, to match the type of n */

  if (dest > src)
    {
    /* Moving upwards in store: start at the end and work backwards. */
    dest += n;
    src += n;
    for (i = 0; i < n; ++i) *(--dest) = *(--src);
    return (void *)dest;         /* dest is back at its starting value */
    }

  /* Moving downwards (or not at all): a forward copy is safe. */
  for (i = 0; i < n; ++i) dest[i] = src[i];
  return (void *)dest;
}
#define memmove(a, b, c) pcre_memmove(a, b, c)
#endif   /* not HAVE_BCOPY */
#endif   /* not HAVE_MEMMOVE */
#endif   /* not VPCOMPAT */
136 | ||
137 | ||
/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
in big-endian order) by default. These are used, for example, to link from the
start of a subpattern to its alternatives and its end. The use of 2 bytes per
offset limits the size of the compiled regex to around 64K, which is big enough
for almost everybody. However, I received a request for an even bigger limit.
For this reason, and also to make the code easier to maintain, the storing and
loading of offsets from the byte string is now handled by the macros that are
defined here.

The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
the config.h file, but can be overridden by using -D on the command line. This
is automated on Unix systems via the "configure" command. If nothing has
defined LINK_SIZE by the time this point is reached, the same default of 2 is
applied here.

PUT(a,n,d) stores the offset d at a[n] onwards, most significant byte first;
GET(a,n) reads it back. Each PUT expands to one fully parenthesized expression
and parenthesizes its pointer argument, so it can be given an expression such
as "code + 1" and can be used wherever an expression is allowed. The arguments
are evaluated more than once, so they must not have side effects. */

#ifndef LINK_SIZE
#define LINK_SIZE 2
#endif

#if LINK_SIZE == 2

#define PUT(a,n,d)   \
  ((a)[n] = (d) >> 8, \
   (a)[(n)+1] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define MAX_PATTERN_SIZE (1 << 16)


#elif LINK_SIZE == 3

#define PUT(a,n,d)       \
  ((a)[n] = (d) >> 16,    \
   (a)[(n)+1] = (d) >> 8, \
   (a)[(n)+2] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])

#define MAX_PATTERN_SIZE (1 << 24)


#elif LINK_SIZE == 4

#define PUT(a,n,d)        \
  ((a)[n] = (d) >> 24,     \
   (a)[(n)+1] = (d) >> 16, \
   (a)[(n)+2] = (d) >> 8,  \
   (a)[(n)+3] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])

#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */


#else
#error LINK_SIZE must be either 2, 3, or 4
#endif
193 | ||
194 | ||
/* Convenience macro defined in terms of the others: store the offset d at
a[n] with PUT, then advance the pointer a by LINK_SIZE. The expansion is a
single parenthesized expression whose value is the advanced pointer. The
argument a must be a modifiable pointer and is evaluated more than once. */

#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
198 | ||
199 | ||
/* PCRE uses some other 2-byte quantities that do not change when the size of
offsets changes. These are used for repeat counts and for other things such as
capturing parenthesis numbers in back references.

PUT2(a,n,d) stores d at a[n] and a[n+1], most significant byte first; GET2(a,n)
reads it back; PUT2INC also advances the pointer a by 2. PUT2 and PUT2INC each
expand to one parenthesized expression, so both byte stores stay under the
control of an unbraced "if" or "else". The arguments are evaluated more than
once, so they must not have side effects. */

#define PUT2(a,n,d)   \
  ((a)[n] = (d) >> 8, \
   (a)[(n)+1] = (d) & 255)

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
212 | ||
213 | ||
/* Any proper Standard C system supplies offsetof(); this fallback is used only
when no such macro has been defined. */

#if !defined(offsetof)
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
#endif
220 | ||
221 | ||
/* The set of public options whose values can change in the course of
matching. */

#define PCRE_IMS \
  (PCRE_CASELESS | PCRE_MULTILINE | PCRE_DOTALL)
225 | ||
/* Private option flags. They are allocated from the most significant end of
the four bytes, leaving out the top bit so that ints can be used for
convenience without getting tangled with negative values. The public options
defined in pcre.h are allocated from the least significant end. The two sets
must not overlap, though with four bytes available there is plenty of space.
The list below is in ascending order of value. */

#define PCRE_NOPARTIAL     0x04000000  /* can't use partial with this regex */
#define PCRE_ICHANGED      0x08000000  /* i option changes within regex */
#define PCRE_STARTLINE     0x10000000  /* start after \n for multiline */
#define PCRE_REQCHSET      0x20000000  /* req_byte is set */
#define PCRE_FIRSTSET      0x40000000  /* first_byte is set */
237 | ||
/* Option bits used in the "extra" block that pcre_study() produces. */

#define PCRE_STUDY_MAPPED  0x01   /* a map of starting chars exists */
241 | ||
/* Masks of the public options that are permitted at compile time, at run
time, and at study time, respectively. */

#define PUBLIC_OPTIONS      \
  ( PCRE_CASELESS           \
  | PCRE_EXTENDED           \
  | PCRE_ANCHORED           \
  | PCRE_MULTILINE          \
  | PCRE_DOTALL             \
  | PCRE_DOLLAR_ENDONLY     \
  | PCRE_EXTRA              \
  | PCRE_UNGREEDY           \
  | PCRE_UTF8               \
  | PCRE_NO_AUTO_CAPTURE    \
  | PCRE_NO_UTF8_CHECK      \
  | PCRE_AUTO_CALLOUT )

#define PUBLIC_EXEC_OPTIONS \
  ( PCRE_ANCHORED           \
  | PCRE_NOTBOL             \
  | PCRE_NOTEOL             \
  | PCRE_NOTEMPTY           \
  | PCRE_NO_UTF8_CHECK      \
  | PCRE_PARTIAL )

#define PUBLIC_STUDY_OPTIONS 0   /* None defined */
255 | ||
/* A magic number that gives a small amount of protection against being handed
junk. Its four bytes are the character codes of 'P', 'C', 'R', 'E' in ASCII. */

#define MAGIC_NUMBER  0x50435245UL
259 | ||
/* Negative values for the firstchar and reqchar variables */

#define REQ_NONE   (-1)
#define REQ_UNSET  (-2)

/* Flags that are added to firstbyte or reqbyte. A "non-literal" item is either
a variable-length repeat, or anything other than literal characters. */

#define REQ_CASELESS  0x0100    /* indicates caselessness */
#define REQ_VARY      0x0200    /* reqbyte followed non-literal item */
270 | ||
/* Miscellaneous definitions: a boolean type and its two values */

typedef int BOOL;

#define TRUE   1
#define FALSE  0
277 | ||
/* Escape items that are just an encoding of a particular data value. Each one
is defined only if nothing has defined it already. ESC_n is defined as yet
another macro, NEWLINE, which config.h sets to either \n (the default) or \r
(which some people want). */

#if !defined(ESC_e)
#define ESC_e 27
#endif

#if !defined(ESC_f)
#define ESC_f '\f'
#endif

#if !defined(ESC_n)
#define ESC_n NEWLINE
#endif

#if !defined(ESC_r)
#define ESC_r '\r'
#endif

/* ESC_t cannot officially be used, because it is a POSIX reserved identifier
(presumably because of all the others like size_t). */

#if !defined(ESC_tee)
#define ESC_tee '\t'
#endif
304 | ||
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
corresponds to "." rather than an escape sequence. The final one must be
ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
tests in the code for an escape greater than ESC_b and less than ESC_Z to
detect the types that may be repeated. These are the types that consume
characters. If any new escapes are put in between that don't consume a
character, that code will have to change. */

enum {
  ESC_A = 1,
  ESC_G,
  ESC_B,
  ESC_b,
  ESC_D,
  ESC_d,
  ESC_S,
  ESC_s,
  ESC_W,
  ESC_w,
  ESC_dum1,      /* the dummy that lines up with OP_ANY */
  ESC_C,
  ESC_P,
  ESC_p,
  ESC_X,
  ESC_Z,
  ESC_z,
  ESC_E,
  ESC_Q,
  ESC_REF        /* must be last */
};
319 | ||
/* Definitions for the extended class (OP_XCLASS), which is used for classes
that contain UTF-8 characters with values greater than 255. */

/* Flag bits */

#define XCL_NOT    0x01    /* Flag: this is a negative class */
#define XCL_MAP    0x02    /* Flag: a 32-byte map is present */

/* Data types */

#define XCL_END       0    /* Marks end of individual items */
#define XCL_SINGLE    1    /* Single item (one multibyte char) follows */
#define XCL_RANGE     2    /* A range (two multibyte chars) follows */
#define XCL_PROP      3    /* Unicode property (one property code) follows */
#define XCL_NOTPROP   4    /* Unicode inverted property (ditto) */
331 | ||
332 | ||
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
OP_EOD must correspond in order to the list of escapes immediately above.
Note that whenever this list is updated, the two macro definitions that follow
must also be updated to match. */

enum {
  OP_END,               /* 0 End of pattern */

  /* Values corresponding to backslashed metacharacters */

  OP_SOD,               /* 1 \A: start of data */
  OP_SOM,               /* 2 \G: start of match (subject + offset) */
  OP_NOT_WORD_BOUNDARY, /* 3 \B */
  OP_WORD_BOUNDARY,     /* 4 \b */
  OP_NOT_DIGIT,         /* 5 \D */
  OP_DIGIT,             /* 6 \d */
  OP_NOT_WHITESPACE,    /* 7 \S */
  OP_WHITESPACE,        /* 8 \s */
  OP_NOT_WORDCHAR,      /* 9 \W */
  OP_WORDCHAR,          /* 10 \w */
  OP_ANY,               /* 11 Match any character */
  OP_ANYBYTE,           /* 12 \C: match any byte; different to OP_ANY for UTF-8 */
  OP_NOTPROP,           /* 13 \P (not Unicode property) */
  OP_PROP,              /* 14 \p (Unicode property) */
  OP_EXTUNI,            /* 15 \X (extended Unicode sequence) */
  OP_EODN,              /* 16 \Z: end of data or \n at end of data */
  OP_EOD,               /* 17 \z: end of data */

  OP_OPT,               /* 18 Set runtime options */
  OP_CIRC,              /* 19 Start of line - varies with multiline switch */
  OP_DOLL,              /* 20 End of line - varies with multiline switch */
  OP_CHAR,              /* 21 Match one character, casefully */
  OP_CHARNC,            /* 22 Match one character, caselessly */
  OP_NOT,               /* 23 Match anything but the following char */

  /* Repeats of single characters. In this set and in each of the repeat sets
  below, the maximizing and minimizing versions of the opcodes must come in
  pairs, with the minimizing one second. */

  OP_STAR,              /* 24 */
  OP_MINSTAR,           /* 25 */
  OP_PLUS,              /* 26 */
  OP_MINPLUS,           /* 27 */
  OP_QUERY,             /* 28 */
  OP_MINQUERY,          /* 29 */
  OP_UPTO,              /* 30 From 0 to n matches */
  OP_MINUPTO,           /* 31 */
  OP_EXACT,             /* 32 Exactly n matches */

  /* Repeats of "not" single characters */

  OP_NOTSTAR,           /* 33 */
  OP_NOTMINSTAR,        /* 34 */
  OP_NOTPLUS,           /* 35 */
  OP_NOTMINPLUS,        /* 36 */
  OP_NOTQUERY,          /* 37 */
  OP_NOTMINQUERY,       /* 38 */
  OP_NOTUPTO,           /* 39 From 0 to n matches */
  OP_NOTMINUPTO,        /* 40 */
  OP_NOTEXACT,          /* 41 Exactly n matches */

  /* Repeats of character types such as \d. These codes must be in exactly the
  same order as those above. */

  OP_TYPESTAR,          /* 42 */
  OP_TYPEMINSTAR,       /* 43 */
  OP_TYPEPLUS,          /* 44 */
  OP_TYPEMINPLUS,       /* 45 */
  OP_TYPEQUERY,         /* 46 */
  OP_TYPEMINQUERY,      /* 47 */
  OP_TYPEUPTO,          /* 48 From 0 to n matches */
  OP_TYPEMINUPTO,       /* 49 */
  OP_TYPEEXACT,         /* 50 Exactly n matches */

  /* Repeats of character classes and back refs. These codes must be in exactly
  the same order as those above, except for the two range opcodes. */

  OP_CRSTAR,            /* 51 */
  OP_CRMINSTAR,         /* 52 */
  OP_CRPLUS,            /* 53 */
  OP_CRMINPLUS,         /* 54 */
  OP_CRQUERY,           /* 55 */
  OP_CRMINQUERY,        /* 56 */
  OP_CRRANGE,           /* 57 These are different to the three sets above. */
  OP_CRMINRANGE,        /* 58 */

  OP_CLASS,             /* 59 Match a character class, chars < 256 only */
  OP_NCLASS,            /* 60 Same, but the bitmap was created from a negative
                              class - the difference is relevant only when a
                              UTF-8 character > 255 is encountered. */

  OP_XCLASS,            /* 61 Extended class for handling UTF-8 chars within
                              the class. This does both positive and
                              negative. */

  OP_REF,               /* 62 Match a back reference */
  OP_RECURSE,           /* 63 Match a numbered subpattern (possibly recursive) */
  OP_CALLOUT,           /* 64 Call out to external function if provided */

  OP_ALT,               /* 65 Start of alternation */
  OP_KET,               /* 66 End of group that doesn't have an unbounded repeat */
  OP_KETRMAX,           /* 67 These two must remain together and in this */
  OP_KETRMIN,           /* 68 order. They are for groups that repeat for ever. */

  /* The assertions must come before ONCE and COND */

  OP_ASSERT,            /* 69 Positive lookahead */
  OP_ASSERT_NOT,        /* 70 Negative lookahead */
  OP_ASSERTBACK,        /* 71 Positive lookbehind */
  OP_ASSERTBACK_NOT,    /* 72 Negative lookbehind */
  OP_REVERSE,           /* 73 Move pointer back - used in lookbehind assertions */

  /* ONCE and COND must come after the assertions, with ONCE first, as there's
  a test for >= ONCE for a subpattern that isn't an assertion. */

  OP_ONCE,              /* 74 Once matched, don't back up into the subpattern */
  OP_COND,              /* 75 Conditional group */
  OP_CREF,              /* 76 Used to hold an extraction string number (cond ref) */

  OP_BRAZERO,           /* 77 These two must remain together and in this */
  OP_BRAMINZERO,        /* 78 order. */

  OP_BRANUMBER,         /* 79 Used for extracting brackets whose number is
                              greater than can fit into an opcode. */

  OP_BRA                /* 80 This and greater values are used for brackets
                              that extract substrings up to EXTRACT_BASIC_MAX.
                              After that, use is made of OP_BRANUMBER. */
};
450 | ||
451 | /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and | |
452 | study.c that all opcodes are less than 128 in value. This makes handling UTF-8 | |
453 | character sequences easier. */ | |
454 | ||
/* The highest extraction number before we have to start using additional
bytes. (Originally PCRE didn't have support for extraction counts higher than
this number.) The value is limited by the number of opcodes left after OP_BRA,
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
opcodes. */

#define EXTRACT_BASIC_MAX  100
462 | ||
463 | ||
/* This macro defines textual names for all the opcodes. These are used only
for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The
macro is referenced only in printint.c. There is one string for each opcode,
in opcode order. */

#define OP_NAME_LIST \
  "End",                                                          \
  "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                       \
  "\\S", "\\s", "\\W", "\\w",                                     \
  "Any", "Anybyte",                                               \
  "notprop", "prop", "extuni",                                    \
  "\\Z", "\\z",                                                   \
  "Opt", "^", "$",                                                \
  "char", "charnc", "not",                                        \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{",                      \
  "class", "nclass", "xclass",                                    \
  "Ref", "Recurse", "Callout",                                    \
  "Alt", "Ket", "KetRmax", "KetRmin",                             \
  "Assert", "Assert not", "AssertB", "AssertB not",               \
  "Reverse", "Once", "Cond", "Cond ref",                          \
  "Brazero", "Braminzero", "Branumber", "Bra"
482 | ||
483 | ||
/* This macro defines the length of fixed length operations in the compiled
regex. The lengths are used when searching for specific things, and also in the
debugging printing of a compiled regex. We use a macro so that it can be
incorporated both into pcre.c and pcretest.c without being publicly exposed.

As things have been extended, some of these are no longer fixed lengths, but
are minima instead. For example, the length of a single-character repeat may
vary in UTF-8 mode. The code that uses this table must know about such things.
There is one entry for each opcode, in opcode order. */

#define OP_LENGTHS \
  1,                             /* End                                    */ \
  1, 1, 1, 1, 1,                 /* \A, \G, \B, \b, \D                     */ \
  1, 1, 1, 1, 1,                 /* \d, \S, \s, \W, \w                     */ \
  1, 1,                          /* Any, Anybyte                           */ \
  2, 2, 1,                       /* NOTPROP, PROP, EXTUNI                  */ \
  1, 1, 2, 1, 1,                 /* \Z, \z, Opt, ^, $                      */ \
  2,                             /* Char  - the minimum length             */ \
  2,                             /* Charnc  - the minimum length           */ \
  2,                             /* not                                    */ \
  /* Positive single-char repeats - these are minima in UTF-8 mode         */ \
  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??                    */ \
  4, 4, 4,                       /* upto, minupto, exact                   */ \
  /* Negative single-char repeats - only for chars < 256                   */ \
  2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ??                */ \
  4, 4, 4,                       /* NOT upto, minupto, exact               */ \
  /* Positive type repeats                                                 */ \
  2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ??               */ \
  4, 4, 4,                       /* Type upto, minupto, exact              */ \
  /* Character class & ref repeats                                         */ \
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */ \
  5, 5,                          /* CRRANGE, CRMINRANGE                    */ \
  33,                            /* CLASS                                  */ \
  33,                            /* NCLASS                                 */ \
  0,                             /* XCLASS - variable length               */ \
  3,                             /* REF                                    */ \
  1+LINK_SIZE,                   /* RECURSE                                */ \
  2+2*LINK_SIZE,                 /* CALLOUT                                */ \
  1+LINK_SIZE,                   /* Alt                                    */ \
  1+LINK_SIZE,                   /* Ket                                    */ \
  1+LINK_SIZE,                   /* KetRmax                                */ \
  1+LINK_SIZE,                   /* KetRmin                                */ \
  1+LINK_SIZE,                   /* Assert                                 */ \
  1+LINK_SIZE,                   /* Assert not                             */ \
  1+LINK_SIZE,                   /* Assert behind                          */ \
  1+LINK_SIZE,                   /* Assert behind not                      */ \
  1+LINK_SIZE,                   /* Reverse                                */ \
  1+LINK_SIZE,                   /* Once                                   */ \
  1+LINK_SIZE,                   /* COND                                   */ \
  3,                             /* CREF                                   */ \
  1, 1,                          /* BRAZERO, BRAMINZERO                    */ \
  3,                             /* BRANUMBER                              */ \
  1+LINK_SIZE                    /* BRA                                    */

535 | ||
536 | ||
/* The magic value that OP_CREF uses to indicate the "in recursion"
condition. */

#define CREF_RECURSE  0xffff
540 | ||
/* The texts of compile-time error messages are defined as macros here so that
they can be accessed by the POSIX wrapper and converted into error codes. Yes,
I could have used error codes in the first place, but didn't feel like changing
just to accommodate the POSIX wrapper. */

#define ERR1   "\\ at end of pattern"
#define ERR2   "\\c at end of pattern"
#define ERR3   "unrecognized character follows \\"
#define ERR4   "numbers out of order in {} quantifier"
#define ERR5   "number too big in {} quantifier"
#define ERR6   "missing terminating ] for character class"
#define ERR7   "invalid escape sequence in character class"
#define ERR8   "range out of order in character class"
#define ERR9   "nothing to repeat"
#define ERR10  "operand of unlimited repeat could match the empty string"
#define ERR11  "internal error: unexpected repeat"
#define ERR12  "unrecognized character after (?"
#define ERR13  "POSIX named classes are supported only within a class"
#define ERR14  "missing )"
#define ERR15  "reference to non-existent subpattern"
#define ERR16  "erroffset passed as NULL"
#define ERR17  "unknown option bit(s) set"
#define ERR18  "missing ) after comment"
#define ERR19  "parentheses nested too deeply"
#define ERR20  "regular expression too large"
#define ERR21  "failed to get memory"
#define ERR22  "unmatched parentheses"
#define ERR23  "internal error: code overflow"
#define ERR24  "unrecognized character after (?<"
#define ERR25  "lookbehind assertion is not fixed length"
#define ERR26  "malformed number after (?("
#define ERR27  "conditional group contains more than two branches"
#define ERR28  "assertion expected after (?("
#define ERR29  "(?R or (?digits must be followed by )"
#define ERR30  "unknown POSIX class name"
#define ERR31  "POSIX collating elements are not supported"
#define ERR32  "this version of PCRE is not compiled with PCRE_UTF8 support"
#define ERR33  "spare error"
#define ERR34  "character value in \\x{...} sequence is too large"
#define ERR35  "invalid condition (?(0)"
#define ERR36  "\\C not allowed in lookbehind assertion"
#define ERR37  "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
#define ERR38  "number after (?C is > 255"
#define ERR39  "closing ) for (?C expected"
#define ERR40  "recursive call could loop indefinitely"
#define ERR41  "unrecognized character after (?P"
#define ERR42  "syntax error after (?P"
#define ERR43  "two named groups have the same name"
#define ERR44  "invalid UTF-8 string"
#define ERR45  "support for \\P, \\p, and \\X has not been compiled"
#define ERR46  "malformed \\P or \\p sequence"
#define ERR47  "unknown property name after \\P or \\p"
593 | ||
594 | /* The real format of the start of the pcre block; the index of names and the | |
595 | code vector run on as long as necessary after the end. We store an explicit | |
596 | offset to the name table so that if a regex is compiled on one host, saved, and | |
597 | then run on another where the size of pointers is different, all might still | |
598 | be well. For the case of compiled-on-4 and run-on-8, we include an extra | |
599 | pointer that is always NULL. For future-proofing, we also include a few dummy | |
600 | fields - even though you can never get this planning right! | |
601 | ||
602 | NOTE NOTE NOTE: | |
603 | Because people can now save and re-use compiled patterns, any additions to this | |
604 | structure should be made at the end, and something earlier (e.g. a new | |
605 | flag in the options or one of the dummy fields) should indicate that the new | |
606 | fields are present. Currently PCRE always sets the dummy fields to zero. | |
607 | NOTE NOTE NOTE: | |
608 | */ | |
609 | ||
610 | typedef struct real_pcre { | |
611 | pcre_uint32 magic_number; | |
612 | pcre_uint32 size; /* Total that was malloced */ | |
613 | pcre_uint32 options; | |
614 | pcre_uint32 dummy1; /* For future use, maybe */ | |
615 | ||
616 | pcre_uint16 top_bracket; | |
617 | pcre_uint16 top_backref; | |
618 | pcre_uint16 first_byte; | |
619 | pcre_uint16 req_byte; | |
620 | pcre_uint16 name_table_offset; /* Offset to name table that follows */ | |
621 | pcre_uint16 name_entry_size; /* Size of any name items */ | |
622 | pcre_uint16 name_count; /* Number of name items */ | |
623 | pcre_uint16 dummy2; /* For future use, maybe */ | |
624 | ||
625 | const unsigned char *tables; /* Pointer to tables or NULL for std */ | |
626 | const unsigned char *nullpad; /* NULL padding */ | |
627 | } real_pcre; | |
628 | ||
629 | /* The format of the block used to store data from pcre_study(). The same | |
630 | remark (see NOTE above) about extending this structure applies. */ | |
631 | ||
632 | typedef struct pcre_study_data { | |
633 | pcre_uint32 size; /* Total that was malloced */ | |
634 | pcre_uint32 options; | |
635 | uschar start_bits[32]; | |
636 | } pcre_study_data; | |
637 | ||
638 | /* Structure for passing "static" information around between the functions | |
639 | doing the compiling, so that they are thread-safe. */ | |
640 | ||
641 | typedef struct compile_data { | |
642 | const uschar *lcc; /* Points to lower casing table */ | |
643 | const uschar *fcc; /* Points to case-flipping table */ | |
644 | const uschar *cbits; /* Points to character type table */ | |
645 | const uschar *ctypes; /* Points to table of type maps */ | |
646 | const uschar *start_code; /* The start of the compiled code */ | |
647 | const uschar *start_pattern; /* The start of the pattern */ | |
648 | uschar *name_table; /* The name/number table */ | |
649 | int names_found; /* Number of entries so far */ | |
650 | int name_entry_size; /* Size of each entry */ | |
651 | int top_backref; /* Maximum back reference */ | |
652 | unsigned int backref_map; /* Bitmap of low back refs */ | |
653 | int req_varyopt; /* "After variable item" flag for reqbyte */ | |
654 | BOOL nopartial; /* Set TRUE if partial won't work */ | |
655 | } compile_data; | |
656 | ||
657 | /* Structure for maintaining a chain of pointers to the currently incomplete | |
658 | branches, for testing for left recursion. */ | |
659 | ||
660 | typedef struct branch_chain { | |
661 | struct branch_chain *outer; | |
662 | uschar *current; | |
663 | } branch_chain; | |
664 | ||
665 | /* Structure for one item in a linked list of recursion records. Each item represents an explicit recursive | |
666 | call within the pattern, and is linked to the previous record through prevrec. */ | |
667 | ||
668 | typedef struct recursion_info { | |
669 | struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ | |
670 | int group_num; /* Number of the group that was called */ | |
671 | const uschar *after_call; /* "Return value": points after the call in the expression */ | |
672 | const uschar *save_start; /* Old value of md->start_match */ | |
673 | int *offset_save; /* Pointer to the start of the saved offsets */ | |
674 | int saved_max; /* Number of saved offsets */ | |
675 | } recursion_info; | |
676 | ||
677 | /* When compiling in a mode that doesn't use recursive calls to match(), | |
678 | a structure is used to remember local variables on the heap. It is defined in | |
679 | pcre.c, close to the match() function, so that it is easy to keep it in step | |
680 | with any changes to the local variables. However, the pointer to the current frame | |
681 | must be saved in some "static" place over a longjmp(). We declare the | |
682 | structure here so that we can put a pointer in the match_data structure. | |
683 | NOTE: This isn't used for a "normal" compilation of pcre. */ | |
684 | ||
685 | struct heapframe; /* Declaration only; match_data holds a pointer to it (thisframe) */ | |
686 | ||
687 | /* Structure for passing "static" information around between the functions | |
688 | doing the matching, so that they are thread-safe. It holds the call counters, the offset vector bookkeeping, table pointers, option flags, subject and match pointers, and the recursion, callout and heap frame pointers listed below. */ | |
689 | ||
690 | typedef struct match_data { | |
691 | unsigned long int match_call_count; /* As it says; presumably a count of match() calls - confirm in pcre.c */ | |
692 | unsigned long int match_limit;/* As it says; presumably the limit applied to match_call_count - confirm in pcre.c */ | |
693 | int *offset_vector; /* Offset vector */ | |
694 | int offset_end; /* One past the end */ | |
695 | int offset_max; /* The maximum usable for return data */ | |
696 | const uschar *lcc; /* Points to the lower casing table */ | |
697 | const uschar *ctypes; /* Points to the table of type maps */ | |
698 | BOOL offset_overflow; /* Set if too many extractions */ | |
699 | BOOL notbol; /* NOTBOL flag */ | |
700 | BOOL noteol; /* NOTEOL flag */ | |
701 | BOOL utf8; /* UTF8 flag */ | |
702 | BOOL endonly; /* Dollar not before final \n */ | |
703 | BOOL notempty; /* Empty string match not wanted */ | |
704 | BOOL partial; /* PARTIAL flag */ | |
705 | BOOL hitend; /* Hit the end of the subject at some point */ | |
706 | const uschar *start_code; /* For use when recursing */ | |
707 | const uschar *start_subject; /* Start of the subject string */ | |
708 | const uschar *end_subject; /* End of the subject string */ | |
709 | const uschar *start_match; /* Start of this match attempt */ | |
710 | const uschar *end_match_ptr; /* Subject position at end match */ | |
711 | int end_offset_top; /* Highwater mark at end of match */ | |
712 | int capture_last; /* Most recent capture number */ | |
713 | int start_offset; /* The start offset value */ | |
714 | recursion_info *recursive; /* Linked list of recursion data (recursion_info records) */ | |
715 | void *callout_data; /* To pass back to callouts */ | |
716 | struct heapframe *thisframe; /* Pointer to the current heap frame; used only when compiling for no recursion */ | |
717 | } match_data; | |
718 | ||
719 | /* Bit definitions for entries in the pcre_ctypes table. Each value is a distinct single bit, so one entry can carry several of them at once; 0x20 and 0x40 are not defined here. */ | |
720 | ||
721 | #define ctype_space 0x01 | |
722 | #define ctype_letter 0x02 | |
723 | #define ctype_digit 0x04 | |
724 | #define ctype_xdigit 0x08 | |
725 | #define ctype_word 0x10 /* alphanumeric or '_' */ | |
726 | #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ | |
727 | ||
728 | /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set | |
729 | of bits for a class map. Some classes are built by combining these tables. The offsets are spaced 32 apart, and the ten tables give the total of 320 in cbit_length. */ | |
730 | ||
731 | #define cbit_space 0 /* [:space:] or \s */ | |
732 | #define cbit_xdigit 32 /* [:xdigit:] */ | |
733 | #define cbit_digit 64 /* [:digit:] or \d */ | |
734 | #define cbit_upper 96 /* [:upper:] */ | |
735 | #define cbit_lower 128 /* [:lower:] */ | |
736 | #define cbit_word 160 /* [:word:] or \w */ | |
737 | #define cbit_graph 192 /* [:graph:] */ | |
738 | #define cbit_print 224 /* [:print:] */ | |
739 | #define cbit_punct 256 /* [:punct:] */ | |
740 | #define cbit_cntrl 288 /* [:cntrl:] */ | |
741 | #define cbit_length 320 /* Length of the cbits table (10 tables of 32) */ | |
742 | ||
743 | /* Offsets of the various tables from the base tables pointer, and | |
744 | total length. With cbit_length defined as 320, ctypes_offset works out to 832 and tables_length to 1088. */ | |
745 | ||
746 | #define lcc_offset 0 /* First table, at the base pointer */ | |
747 | #define fcc_offset 256 /* 256 past lcc_offset */ | |
748 | #define cbits_offset 512 /* 256 past fcc_offset */ | |
749 | #define ctypes_offset (cbits_offset + cbit_length) /* Directly after the cbits tables */ | |
750 | #define tables_length (ctypes_offset + 256) /* 256 more after ctypes_offset */ | |
751 | ||
752 | /* End of internal.h */ |