Install PCRE 6.2.
[exim.git] / src / src / pcre / pcre_internal.h
1 /* $Cambridge: exim/src/src/pcre/pcre_internal.h,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7
8 /* PCRE is a library of functions to support regular expressions whose syntax
9 and semantics are as close as possible to those of the Perl 5 language.
10
11 Written by Philip Hazel
12 Copyright (c) 1997-2005 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Redistribution and use in source and binary forms, with or without
16 modification, are permitted provided that the following conditions are met:
17
18 * Redistributions of source code must retain the above copyright notice,
19 this list of conditions and the following disclaimer.
20
21 * Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
24
25 * Neither the name of the University of Cambridge nor the names of its
26 contributors may be used to endorse or promote products derived from
27 this software without specific prior written permission.
28
29 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 POSSIBILITY OF SUCH DAMAGE.
40 -----------------------------------------------------------------------------
41 */
42
43 /* This header contains definitions that are shared between the different
44 modules, but which are not relevant to the exported API. This includes some
45 functions whose names all begin with "_pcre_". */
46
47
48 /* Define DEBUG to get debugging output on stdout. */
49
50 /****
51 #define DEBUG
52 ****/
53
54 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
55 inline, and there are *still* stupid compilers about that don't like indented
56 pre-processor statements, or at least there were when I first wrote this. After
57 all, it had only been about 10 years then... */
58
59 #ifdef DEBUG
60 #define DPRINTF(p) printf p /* p must be a complete parenthesized printf argument list, so calls need double parentheses: DPRINTF(("n=%d\n", n)) */
61 #else
62 #define DPRINTF(p) /* nothing: without DEBUG the whole call, arguments included, expands to no code */
63 #endif
64
65
66 /* Get the definitions provided by running "configure" */
67
68 #include "config.h"
69
70 /* Standard C headers plus the external interface definition. The only time
71 setjmp and stdarg are used is when NO_RECURSE is set. */
72
73 #include <ctype.h>
74 #include <limits.h>
75 #include <setjmp.h>
76 #include <stdarg.h>
77 #include <stddef.h>
78 #include <stdio.h>
79 #include <stdlib.h>
80 #include <string.h>
81
82 #ifndef PCRE_SPY
83 #define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
84 #endif
85
86 /* We need to have types that specify unsigned 16-bit and 32-bit integers. We
87 cannot determine these outside the compilation (e.g. by running a program as
88 part of "configure") because PCRE is often cross-compiled for use on other
89 systems. Instead we make use of the maximum sizes that are available at
90 preprocessor time in standard C environments. */
91
92 #if USHRT_MAX == 65535 /* unsigned short has exactly the range 0..65535 */
93 typedef unsigned short pcre_uint16;
94 #elif UINT_MAX == 65535 /* otherwise accept an unsigned int of that range */
95 typedef unsigned int pcre_uint16;
96 #else
97 #error Cannot determine a type for 16-bit unsigned integers
98 #endif
99
100 #if UINT_MAX == 4294967295 /* unsigned int has exactly the range 0..2^32-1 */
101 typedef unsigned int pcre_uint32;
102 #elif ULONG_MAX == 4294967295 /* otherwise accept an unsigned long of that range */
103 typedef unsigned long int pcre_uint32;
104 #else
105 #error Cannot determine a type for 32-bit unsigned integers
106 #endif
107
108 /* All character handling must be done as unsigned characters. Otherwise there
109 are problems with top-bit-set characters and functions such as isspace().
110 However, we leave the interface to the outside world as char *, because that
111 should make things easier for callers. We define a short type for unsigned char
112 to save lots of typing. I tried "uchar", but it causes problems on Digital
113 Unix, where it is defined in sys/types, so use "uschar" instead. */
114
115 typedef unsigned char uschar;
116
117 /* Include the public PCRE header */
118
119 #include "pcre.h"
120
121 /* Include (a copy of) the public ucp header, changing the external name into
122 a private one. This does no harm, even if we aren't compiling UCP support. */
123
124 #define ucp_findchar _pcre_ucp_findchar
125 #include "ucp.h"
126
127 /* When compiling for use with the Virtual Pascal compiler, these functions
128 need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
129 option on the command line. */
130
131 #ifdef VPCOMPAT
132 #define strncmp(s1,s2,m) _strncmp(s1,s2,m)
133 #define memcpy(d,s,n) _memcpy(d,s,n)
134 #define memmove(d,s,n) _memmove(d,s,n)
135 #define memset(s,c,n) _memset(s,c,n)
136 #else /* VPCOMPAT */
137
138 /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
139 define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
140 is set. Otherwise, include an emulating function for those systems that have
141 neither (there are some non-Unix environments where this is the case). This assumes
142 that all calls to memmove are moving strings upwards in store, which is the
143 case in PCRE. */
144
145 #if ! HAVE_MEMMOVE
146 #undef memmove /* some systems may have a macro */
147 #if HAVE_BCOPY
148 #define memmove(a, b, c) bcopy(b, a, c)
149 #else /* HAVE_BCOPY */
/* Fallback memmove() for systems that have neither memmove() nor bcopy().
The bytes are copied from the last one down to the first, so overlapping
regions are handled correctly only when dest is above src ("moving upwards in
store"), which this file states is the case for every call in PCRE.

The function is static because this header is shared between the different
modules; an external definition here would be repeated in each of them.

Arguments:
  dest    start of the destination area
  src     start of the source area
  n       number of bytes to copy

Returns:  the original value of dest, as memmove() does
*/

static void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
size_t i;                 /* same type as n, so no signed/unsigned comparison */
dest += n;
src += n;
for (i = 0; i < n; ++i) *(--dest) = *(--src);
return dest;              /* n decrements have brought dest back to its start */
}
158 #define memmove(a, b, c) pcre_memmove(a, b, c)
159 #endif /* not HAVE_BCOPY */
160 #endif /* not HAVE_MEMMOVE */
161 #endif /* not VPCOMPAT */
162
163
164 /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
165 in big-endian order) by default. These are used, for example, to link from the
166 start of a subpattern to its alternatives and its end. The use of 2 bytes per
167 offset limits the size of the compiled regex to around 64K, which is big enough
168 for almost everybody. However, I received a request for an even bigger limit.
169 For this reason, and also to make the code easier to maintain, the storing and
170 loading of offsets from the byte string is now handled by the macros that are
171 defined here.
172
173 The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
174 the config.h file, but can be overridden by using -D on the command line. This
175 is automated on Unix systems via the "configure" command. */
176
/* LINK_SIZE normally comes from config.h or from -D on the command line. Fall
back to the documented default of 2 when neither has set it, so that a missing
definition is not reported as an invalid value. */

#ifndef LINK_SIZE
#define LINK_SIZE 2
#endif

/* PUT(a,n,d) stores the offset d at a[n] in LINK_SIZE bytes, most significant
byte first; GET(a,n) fetches it back. Every macro argument is parenthesized, so
a may be any pointer expression, and each PUT is one parenthesized comma
expression. d is evaluated once per byte stored and n once per byte accessed,
so neither may have side effects. MAX_PATTERN_SIZE is the number of distinct
offsets that fit in LINK_SIZE bytes (held at 1 << 30 for LINK_SIZE 4). */

#if LINK_SIZE == 2

#define PUT(a,n,d)   \
  ((a)[n] = (d) >> 8, \
  (a)[(n)+1] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define MAX_PATTERN_SIZE (1 << 16)


#elif LINK_SIZE == 3

#define PUT(a,n,d)       \
  ((a)[n] = (d) >> 16,    \
  (a)[(n)+1] = (d) >> 8, \
  (a)[(n)+2] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])

#define MAX_PATTERN_SIZE (1 << 24)


#elif LINK_SIZE == 4

#define PUT(a,n,d)        \
  ((a)[n] = (d) >> 24,     \
  (a)[(n)+1] = (d) >> 16, \
  (a)[(n)+2] = (d) >> 8,  \
  (a)[(n)+3] = (d) & 255)

/* (a)[n] is promoted to int before the shift by 24, so the result is well
defined only while the first byte is below 0x80. MAX_PATTERN_SIZE keeps offsets
inside that range. */

#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])

#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */


#else
#error LINK_SIZE must be either 2, 3, or 4
#endif


/* Convenience macro defined in terms of the others: store an offset as for
PUT, then advance the pointer a past it. a must be a modifiable lvalue. */

#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
224
225
226 /* PCRE uses some other 2-byte quantities that do not change when the size of
227 offsets changes. These are used for repeat counts and for other things such as
228 capturing parenthesis numbers in back references. */
229
/* Store the 16-bit value d at a[n], high byte first. The expansion is one
parenthesized comma expression rather than two statements, so both stores stay
together when the macro is the body of an unbraced if, else or loop, and when
it is the first operand in PUT2INC. d is evaluated twice and n twice, so
neither may have side effects. */

#define PUT2(a,n,d)   \
  ((a)[n] = (d) >> 8, \
  (a)[(n)+1] = (d) & 255)

/* Fetch the 16-bit value that is stored high byte first at a[n]. */

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

/* Store a 16-bit value as for PUT2, then advance the pointer a by the two
bytes just written. a must be a modifiable lvalue. */

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
238
239
240 /* When UTF-8 encoding is being used, a character is no longer just a single
241 byte. The macros for character handling generate simple sequences when used in
242 byte-mode, and more complicated ones for UTF-8 characters. */
243
244 #ifndef SUPPORT_UTF8 /* byte mode: each macro reads exactly one byte; the len argument of GETCHARLEN is not touched */
245 #define GETCHAR(c, eptr) c = *eptr;
246 #define GETCHARTEST(c, eptr) c = *eptr;
247 #define GETCHARINC(c, eptr) c = *eptr++;
248 #define GETCHARINCTEST(c, eptr) c = *eptr++;
249 #define GETCHARLEN(c, eptr, len) c = *eptr;
250 #define BACKCHAR(eptr) /* expands to nothing in byte mode */
251
252 #else /* SUPPORT_UTF8 */
253
254 /* Get the next UTF-8 character, not advancing the pointer. This is called when
255 we know we are in UTF-8 mode. A first byte with its top two bits set starts a multibyte character: _pcre_utf8_table4 gives the number of additional bytes, _pcre_utf8_table3 supplies the mask applied to the first byte, and each additional byte contributes its low six bits. The expansion is an assignment followed by an if block, not a single statement, and it makes no check that the additional bytes exist. */
256
257 #define GETCHAR(c, eptr) \
258 c = *eptr; \
259 if ((c & 0xc0) == 0xc0) \
260 { \
261 int gcii; \
262 int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
263 int gcss = 6*gcaa; \
264 c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
265 for (gcii = 1; gcii <= gcaa; gcii++) \
266 { \
267 gcss -= 6; \
268 c |= (eptr[gcii] & 0x3f) << gcss; \
269 } \
270 }
271
272 /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
273 pointer. Same decoding as GETCHAR, but the multibyte path is taken only when a variable named utf8, which must be in scope at the point of use, is non-zero. */
274
275 #define GETCHARTEST(c, eptr) \
276 c = *eptr; \
277 if (utf8 && (c & 0xc0) == 0xc0) \
278 { \
279 int gcii; \
280 int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
281 int gcss = 6*gcaa; \
282 c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
283 for (gcii = 1; gcii <= gcaa; gcii++) \
284 { \
285 gcss -= 6; \
286 c |= (eptr[gcii] & 0x3f) << gcss; \
287 } \
288 }
289
290 /* Get the next UTF-8 character, advancing the pointer. This is called when we
291 know we are in UTF-8 mode. Same decoding as GETCHAR, but eptr is incremented past every byte that is read, so it must be a modifiable lvalue. */
292
293 #define GETCHARINC(c, eptr) \
294 c = *eptr++; \
295 if ((c & 0xc0) == 0xc0) \
296 { \
297 int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
298 int gcss = 6*gcaa; \
299 c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
300 while (gcaa-- > 0) \
301 { \
302 gcss -= 6; \
303 c |= (*eptr++ & 0x3f) << gcss; \
304 } \
305 }
306
307 /* Get the next character, testing for UTF-8 mode, and advancing the pointer. As for GETCHARTEST, a variable named utf8 must be in scope at the point of use. */
308
309 #define GETCHARINCTEST(c, eptr) \
310 c = *eptr++; \
311 if (utf8 && (c & 0xc0) == 0xc0) \
312 { \
313 int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
314 int gcss = 6*gcaa; \
315 c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
316 while (gcaa-- > 0) \
317 { \
318 gcss -= 6; \
319 c |= (*eptr++ & 0x3f) << gcss; \
320 } \
321 }
322
323 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
324 if there are extra bytes. This is called when we know we are in UTF-8 mode. len is increased by the number of additional bytes only; presumably the caller has already counted the first byte - confirm at the call sites. */
325
326 #define GETCHARLEN(c, eptr, len) \
327 c = *eptr; \
328 if ((c & 0xc0) == 0xc0) \
329 { \
330 int gcii; \
331 int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
332 int gcss = 6*gcaa; \
333 c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
334 for (gcii = 1; gcii <= gcaa; gcii++) \
335 { \
336 gcss -= 6; \
337 c |= (eptr[gcii] & 0x3f) << gcss; \
338 } \
339 len += gcaa; \
340 }
341
342 /* If the pointer is not at the start of a character, move it back until
343 it is. Called only in UTF-8 mode. It steps back over bytes of the form 10xxxxxx and makes no check against the start of the subject. */
344
345 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
346
347 #endif
348
349
350 /* In case there is no definition of offsetof() provided - though any proper
351 Standard C system should have one. */
352
353 #ifndef offsetof
354 #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
355 #endif
356
357
358 /* These are the public options that can change during matching. */
359
360 #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
361
362 /* Private options flags start at the most significant end of the four bytes,
363 but skip the top bit so we can use ints for convenience without getting tangled
364 with negative values. The public options defined in pcre.h start at the least
365 significant end. Make sure they don't overlap! */
366
367 #define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
368 #define PCRE_REQCHSET 0x20000000 /* req_byte is set */
369 #define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
370 #define PCRE_ICHANGED 0x08000000 /* i option changes within regex */
371 #define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */
372
373 /* Options for the "extra" block produced by pcre_study(). */
374
375 #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
376
377 /* Masks for identifying the public options that are permitted at compile
378 time, run time, or study time, respectively. */
379
380 #define PUBLIC_OPTIONS \
381 (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
382 PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
383 PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE)
384
385 #define PUBLIC_EXEC_OPTIONS \
386 (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
387 PCRE_PARTIAL)
388
389 #define PUBLIC_DFA_EXEC_OPTIONS \
390 (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
391 PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART)
392
393 #define PUBLIC_STUDY_OPTIONS 0 /* None defined */
394
395 /* Magic number to provide a small check against being handed junk. Also used
396 to detect whether a pattern was compiled on a host of different endianness. */
397
398 #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
399
400 /* Negative values for the firstchar and reqchar variables */
401
402 #define REQ_UNSET (-2)
403 #define REQ_NONE (-1)
404
405 /* The maximum remaining length of subject we are prepared to search for a
406 req_byte match. */
407
408 #define REQ_BYTE_MAX 1000
409
410 /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
411 variable-length repeat, or anything other than literal characters. */
412
413 #define REQ_CASELESS 0x0100 /* indicates caselessness */
414 #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
415
416 /* Miscellaneous definitions */
417
418 typedef int BOOL;
419
420 #define FALSE 0
421 #define TRUE 1
422
423 /* Escape items that are just an encoding of a particular data value. Note that
424 ESC_n is defined as yet another macro, which is set in config.h to either \n
425 (the default) or \r (which some people want). */
426
427 #ifndef ESC_e
428 #define ESC_e 27
429 #endif
430
431 #ifndef ESC_f
432 #define ESC_f '\f'
433 #endif
434
435 #ifndef ESC_n
436 #define ESC_n NEWLINE
437 #endif
438
439 #ifndef ESC_r
440 #define ESC_r '\r'
441 #endif
442
443 /* We can't officially use ESC_t because it is a POSIX reserved identifier
444 (presumably because of all the others like size_t). */
445
446 #ifndef ESC_tee
447 #define ESC_tee '\t'
448 #endif
449
450 /* These are escaped items that aren't just an encoding of a particular data
451 value such as \n. They must have non-zero values, as check_escape() returns
452 their negation. Also, they must appear in the same order as in the opcode
453 definitions below, up to ESC_z. There's a dummy for OP_ANY because it
454 corresponds to "." rather than an escape sequence. The final one must be
455 ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
456 tests in the code for an escape greater than ESC_b and less than ESC_Z to
457 detect the types that may be repeated. These are the types that consume
458 characters. If any new escapes are put in between that don't consume a
459 character, that code will have to change. */
460
461 enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
462 ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
463 ESC_Q, ESC_REF };
464
465 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
466 contain UTF-8 characters with values greater than 255. */
467
468 #define XCL_NOT 0x01 /* Flag: this is a negative class */
469 #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
470
471 #define XCL_END 0 /* Marks end of individual items */
472 #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
473 #define XCL_RANGE 2 /* A range (two multibyte chars) follows */
474 #define XCL_PROP 3 /* Unicode property (one property code) follows */
475 #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
476
477
478 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
479 that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
480 OP_EOD must correspond in order to the list of escapes immediately above.
481 Note that whenever this list is updated, the two macro definitions that follow
482 must also be updated to match. */
483
484 enum {
485 OP_END, /* 0 End of pattern */
486
487 /* Values corresponding to backslashed metacharacters */
488
489 OP_SOD, /* 1 Start of data: \A */
490 OP_SOM, /* 2 Start of match (subject + offset): \G */
491 OP_NOT_WORD_BOUNDARY, /* 3 \B */
492 OP_WORD_BOUNDARY, /* 4 \b */
493 OP_NOT_DIGIT, /* 5 \D */
494 OP_DIGIT, /* 6 \d */
495 OP_NOT_WHITESPACE, /* 7 \S */
496 OP_WHITESPACE, /* 8 \s */
497 OP_NOT_WORDCHAR, /* 9 \W */
498 OP_WORDCHAR, /* 10 \w */
499 OP_ANY, /* 11 Match any character */
500 OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
501 OP_NOTPROP, /* 13 \P (not Unicode property) */
502 OP_PROP, /* 14 \p (Unicode property) */
503 OP_EXTUNI, /* 15 \X (extended Unicode sequence) */
504 OP_EODN, /* 16 End of data or \n at end of data: \Z. */
505 OP_EOD, /* 17 End of data: \z */
506
507 OP_OPT, /* 18 Set runtime options */
508 OP_CIRC, /* 19 Start of line - varies with multiline switch */
509 OP_DOLL, /* 20 End of line - varies with multiline switch */
510 OP_CHAR, /* 21 Match one character, casefully */
511 OP_CHARNC, /* 22 Match one character, caselessly */
512 OP_NOT, /* 23 Match anything but the following char */
513
514 OP_STAR, /* 24 The maximizing and minimizing versions of */
515 OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
516 OP_PLUS, /* 26 the minimizing one second. */
517 OP_MINPLUS, /* 27 This first set applies to single characters */
518 OP_QUERY, /* 28 */
519 OP_MINQUERY, /* 29 */
520 OP_UPTO, /* 30 From 0 to n matches */
521 OP_MINUPTO, /* 31 */
522 OP_EXACT, /* 32 Exactly n matches */
523
524 OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
525 OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
526 OP_NOTPLUS, /* 35 the minimizing one second. */
527 OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
528 OP_NOTQUERY, /* 37 */
529 OP_NOTMINQUERY, /* 38 */
530 OP_NOTUPTO, /* 39 From 0 to n matches */
531 OP_NOTMINUPTO, /* 40 */
532 OP_NOTEXACT, /* 41 Exactly n matches */
533
534 OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
535 OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
536 OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
537 OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
538 OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
539 OP_TYPEMINQUERY, /* 47 */
540 OP_TYPEUPTO, /* 48 From 0 to n matches */
541 OP_TYPEMINUPTO, /* 49 */
542 OP_TYPEEXACT, /* 50 Exactly n matches */
543
544 OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
545 OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
546 OP_CRPLUS, /* 53 the minimizing one second. These codes must */
547 OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
548 OP_CRQUERY, /* 55 These are for character classes and back refs */
549 OP_CRMINQUERY, /* 56 */
550 OP_CRRANGE, /* 57 These are different to the three sets above. */
551 OP_CRMINRANGE, /* 58 */
552
553 OP_CLASS, /* 59 Match a character class, chars < 256 only */
554 OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
555 class - the difference is relevant only when a UTF-8
556 character > 255 is encountered. */
557
558 OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
559 class. This does both positive and negative. */
560
561 OP_REF, /* 62 Match a back reference */
562 OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
563 OP_CALLOUT, /* 64 Call out to external function if provided */
564
565 OP_ALT, /* 65 Start of alternation */
566 OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
567 OP_KETRMAX, /* 67 These two must remain together and in this */
568 OP_KETRMIN, /* 68 order. They are for groups that repeat for ever. */
569
570 /* The assertions must come before ONCE and COND */
571
572 OP_ASSERT, /* 69 Positive lookahead */
573 OP_ASSERT_NOT, /* 70 Negative lookahead */
574 OP_ASSERTBACK, /* 71 Positive lookbehind */
575 OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
576 OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
577
578 /* ONCE and COND must come after the assertions, with ONCE first, as there's
579 a test for >= ONCE for a subpattern that isn't an assertion. */
580
581 OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
582 OP_COND, /* 75 Conditional group */
583 OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
584
585 OP_BRAZERO, /* 77 These two must remain together and in this */
586 OP_BRAMINZERO, /* 78 order. */
587
588 OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
589 than can fit into an opcode. */
590
591 OP_BRA /* 80 This and greater values are used for brackets that
592 extract substrings up to EXTRACT_BASIC_MAX. After
593 that, use is made of OP_BRANUMBER. */
594 };
595
596 /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
597 study.c that all opcodes are less than 128 in value. This makes handling UTF-8
598 character sequences easier. */
599
600 /* The highest extraction number before we have to start using additional
601 bytes. (Originally PCRE didn't have support for extraction counts higher than
602 this number.) The value is limited by the number of opcodes left after OP_BRA,
603 i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
604 opcodes. */
605
606 #define EXTRACT_BASIC_MAX 100
607
608
609 /* This macro defines textual names for all the opcodes. These are used only
610 for debugging. The macro is referenced only in pcre_printint.c. */
611
612 #define OP_NAME_LIST \
613 "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
614 "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
615 "notprop", "prop", "extuni", \
616 "\\Z", "\\z", \
617 "Opt", "^", "$", "char", "charnc", "not", \
618 "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
619 "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
621 "*", "*?", "+", "+?", "?", "??", "{", "{", \
622 "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
623 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
624 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
625 "Brazero", "Braminzero", "Branumber", "Bra"
626
627
628 /* This macro defines the length of fixed length operations in the compiled
629 regex. The lengths are used when searching for specific things, and also in the
630 debugging printing of a compiled regex. We use a macro so that it can be
631 defined close to the definitions of the opcodes themselves.
632
633 As things have been extended, some of these are no longer fixed lengths, but are
634 minima instead. For example, the length of a single-character repeat may vary
635 in UTF-8 mode. The code that uses this table must know about such things. */
636
637 #define OP_LENGTHS \
638 1, /* End */ \
639 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
640 1, 1, /* Any, Anybyte */ \
641 2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \
642 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
643 2, /* Char - the minimum length */ \
644 2, /* Charnc - the minimum length */ \
645 2, /* not */ \
646 /* Positive single-char repeats ** These are */ \
647 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
648 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
649 /* Negative single-char repeats - only for chars < 256 */ \
650 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
651 4, 4, 4, /* NOT upto, minupto, exact */ \
652 /* Positive type repeats */ \
653 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
654 4, 4, 4, /* Type upto, minupto, exact */ \
655 /* Character class & ref repeats */ \
656 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
657 5, 5, /* CRRANGE, CRMINRANGE */ \
658 33, /* CLASS */ \
659 33, /* NCLASS */ \
660 0, /* XCLASS - variable length */ \
661 3, /* REF */ \
662 1+LINK_SIZE, /* RECURSE */ \
663 2+2*LINK_SIZE, /* CALLOUT */ \
664 1+LINK_SIZE, /* Alt */ \
665 1+LINK_SIZE, /* Ket */ \
666 1+LINK_SIZE, /* KetRmax */ \
667 1+LINK_SIZE, /* KetRmin */ \
668 1+LINK_SIZE, /* Assert */ \
669 1+LINK_SIZE, /* Assert not */ \
670 1+LINK_SIZE, /* Assert behind */ \
671 1+LINK_SIZE, /* Assert behind not */ \
672 1+LINK_SIZE, /* Reverse */ \
673 1+LINK_SIZE, /* Once */ \
674 1+LINK_SIZE, /* COND */ \
675 3, /* CREF */ \
676 1, 1, /* BRAZERO, BRAMINZERO */ \
677 3, /* BRANUMBER */ \
678 1+LINK_SIZE /* BRA */ \
679
680
681 /* A magic value for OP_CREF to indicate the "in recursion" condition. */
682
683 #define CREF_RECURSE 0xffff
684
685 /* Error code numbers. They are given names so that they can more easily be
686 tracked. */
687
688 enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
689 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
690 ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
691 ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
692 ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 };
693
694 /* The real format of the start of the pcre block; the index of names and the
695 code vector run on as long as necessary after the end. We store an explicit
696 offset to the name table so that if a regex is compiled on one host, saved, and
697 then run on another where the size of pointers is different, all might still
698 be well. For the case of compiled-on-4 and run-on-8, we include an extra
699 pointer that is always NULL. For future-proofing, a few dummy fields were
700 originally included - even though you can never get this planning right - but
701 there is only one left now.
702
703 NOTE NOTE NOTE:
704 Because people can now save and re-use compiled patterns, any additions to this
705 structure should be made at the end, and something earlier (e.g. a new
706 flag in the options or one of the dummy fields) should indicate that the new
707 fields are present. Currently PCRE always sets the dummy fields to zero.
708 NOTE NOTE NOTE:
709 */
710
711 typedef struct real_pcre {
712 pcre_uint32 magic_number; /* presumably holds MAGIC_NUMBER in a valid block - confirm in the compile code */
713 pcre_uint32 size; /* Total that was malloced */
714 pcre_uint32 options; /* presumably the public option bits plus the private PCRE_xxx flags defined in this header - confirm */
715 pcre_uint32 dummy1; /* For future use, maybe */
716
717 pcre_uint16 top_bracket; /* NOTE(review): looks like the highest capturing bracket number - confirm */
718 pcre_uint16 top_backref; /* NOTE(review): looks like the highest back reference number - confirm */
719 pcre_uint16 first_byte; /* see the PCRE_FIRSTSET flag ("first_byte is set") */
720 pcre_uint16 req_byte; /* see the PCRE_REQCHSET flag ("req_byte is set") */
721 pcre_uint16 name_table_offset; /* Offset to name table that follows */
722 pcre_uint16 name_entry_size; /* Size of any name items */
723 pcre_uint16 name_count; /* Number of name items */
724 pcre_uint16 ref_count; /* Reference count */
725
726 const unsigned char *tables; /* Pointer to tables or NULL for std */
727 const unsigned char *nullpad; /* NULL padding: the extra always-NULL pointer for the compiled-on-4, run-on-8 case */
728 } real_pcre;
729
730 /* The format of the block used to store data from pcre_study(). The same
731 remark (see NOTE above) about extending this structure applies. */
732
733 typedef struct pcre_study_data {
734 pcre_uint32 size; /* Total that was malloced */
735 pcre_uint32 options; /* presumably the study flags such as PCRE_STUDY_MAPPED - confirm */
736 uschar start_bits[32]; /* 32 bytes = 256 bits; presumably the map of starting chars that PCRE_STUDY_MAPPED refers to - confirm */
737 } pcre_study_data;
738
739 /* Structure for passing "static" information around between the functions
740 doing the compiling, so that they are thread-safe. */
741
742 typedef struct compile_data {
743 const uschar *lcc; /* Points to lower casing table */
744 const uschar *fcc; /* Points to case-flipping table */
745 const uschar *cbits; /* Points to character type table */
746 const uschar *ctypes; /* Points to table of type maps */
747 const uschar *start_code; /* The start of the compiled code */
748 const uschar *start_pattern; /* The start of the pattern */
749 uschar *name_table; /* The name/number table */
750 int names_found; /* Number of entries so far */
751 int name_entry_size; /* Size of each entry */
752 int top_backref; /* Maximum back reference */
753 unsigned int backref_map; /* Bitmap of low back refs */
754 int req_varyopt; /* "After variable item" flag for reqbyte */
755 BOOL nopartial; /* Set TRUE if partial won't work */
756 } compile_data;
757
758 /* Structure for maintaining a chain of pointers to the currently incomplete
759 branches, for testing for left recursion. */
760
761 typedef struct branch_chain {
762 struct branch_chain *outer; /* next entry in the chain; presumably the enclosing incomplete branch - confirm */
763 uschar *current; /* NOTE(review): presumably points at this incomplete branch in the compiled code - confirm */
764 } branch_chain;
765
766 /* Structure for items in a linked list that represents an explicit recursive
767 call within the pattern. */
768
769 typedef struct recursion_info {
770 struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
771 int group_num; /* Number of group that was called */
772 const uschar *after_call; /* "Return value": points after the call in the expr */
773 const uschar *save_start; /* Old value of md->start_match */
774 int *offset_save; /* Pointer to start of saved offsets */
775 int saved_max; /* Number of saved offsets */
776 } recursion_info;
777
778 /* When compiling in a mode that doesn't use recursive calls to match(),
779 a structure is used to remember local variables on the heap. It is defined in
780 pcre.c, close to the match() function, so that it is easy to keep it in step
781 with any changes of local variable. However, the pointer to the current frame
782 must be saved in some "static" place over a longjmp(). We declare the
783 structure here so that we can put a pointer in the match_data structure.
784 NOTE: This isn't used for a "normal" compilation of pcre. */
785
786 struct heapframe;
787
788 /* Structure for passing "static" information around between the functions
789 doing traditional NFA matching, so that they are thread-safe. */
790
791 typedef struct match_data {
792 unsigned long int match_call_count; /* As it says */
793 unsigned long int match_limit;/* As it says */
794 int *offset_vector; /* Offset vector */
795 int offset_end; /* One past the end */
796 int offset_max; /* The maximum usable for return data */
797 const uschar *lcc; /* Points to lower casing table */
798 const uschar *ctypes; /* Points to table of type maps */
799 BOOL offset_overflow; /* Set if too many extractions */
800 BOOL notbol; /* NOTBOL flag */
801 BOOL noteol; /* NOTEOL flag */
802 BOOL utf8; /* UTF8 flag */
803 BOOL endonly; /* Dollar not before final \n */
804 BOOL notempty; /* Empty string match not wanted */
805 BOOL partial; /* PARTIAL flag */
806 BOOL hitend; /* Hit the end of the subject at some point */
807 const uschar *start_code; /* For use when recursing */
808 const uschar *start_subject; /* Start of the subject string */
809 const uschar *end_subject; /* End of the subject string */
810 const uschar *start_match; /* Start of this match attempt */
811 const uschar *end_match_ptr; /* Subject position at end match */
812 int end_offset_top; /* Highwater mark at end of match */
813 int capture_last; /* Most recent capture number */
814 int start_offset; /* The start offset value */
815 recursion_info *recursive; /* Linked list of recursion data */
816 void *callout_data; /* To pass back to callouts */
817 struct heapframe *thisframe; /* Used only when compiling for no recursion */
818 } match_data;
819
820 /* A similar structure is used for the same purpose by the DFA matching
821 functions. */
822
823 typedef struct dfa_match_data {
824 const uschar *start_code; /* Start of the compiled pattern */
825 const uschar *start_subject; /* Start of the subject string */
826 const uschar *end_subject; /* End of subject string */
827 const uschar *tables; /* Character tables */
828 int moptions; /* Match options */
829 int poptions; /* Pattern options */
830 void *callout_data; /* To pass back to callouts */
831 } dfa_match_data;
832
833 /* Bit definitions for entries in the pcre_ctypes table. */
834
835 #define ctype_space 0x01
836 #define ctype_letter 0x02
837 #define ctype_digit 0x04
838 #define ctype_xdigit 0x08
839 #define ctype_word 0x10 /* alphameric or '_' */
840 #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
841
842 /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
843 of bits for a class map. Some classes are built by combining these tables. */
844
845 #define cbit_space 0 /* [:space:] or \s */
846 #define cbit_xdigit 32 /* [:xdigit:] */
847 #define cbit_digit 64 /* [:digit:] or \d */
848 #define cbit_upper 96 /* [:upper:] */
849 #define cbit_lower 128 /* [:lower:] */
850 #define cbit_word 160 /* [:word:] or \w */
851 #define cbit_graph 192 /* [:graph:] */
852 #define cbit_print 224 /* [:print:] */
853 #define cbit_punct 256 /* [:punct:] */
854 #define cbit_cntrl 288 /* [:cntrl:] */
855 #define cbit_length 320 /* Length of the cbits table */
856
857 /* Offsets of the various tables from the base tables pointer, and
858 total length. */
859
860 #define lcc_offset 0
861 #define fcc_offset 256
862 #define cbits_offset 512
863 #define ctypes_offset (cbits_offset + cbit_length)
864 #define tables_length (ctypes_offset + 256)
865
866 /* Layout of the UCP type table that translates property names into codes for
867 ucp_findchar(). */
868
869 typedef struct {
870 const char *name; /* property name */
871 int value; /* code that the name is translated into */
872 } ucp_type_table;
873
874
875 /* Internal shared data tables. These are tables that are used by more than one
876 of the exported public functions. They have to be "external" in the C sense,
877 but are not part of the PCRE public API. The data for these tables is in the
878 pcre_tables.c module. */
879
880 extern const int _pcre_utf8_table1[];
881 extern const int _pcre_utf8_table2[];
882 extern const int _pcre_utf8_table3[];
883 extern const uschar _pcre_utf8_table4[];
884
885 extern const int _pcre_utf8_table1_size;
886
887 extern const ucp_type_table _pcre_utt[];
888 extern const int _pcre_utt_size;
889
890 extern const uschar _pcre_default_tables[];
891
892 extern const uschar _pcre_OP_lengths[];
893
894
895 /* Internal shared functions. These are functions that are used by more than
896 one of the exported public functions. They have to be "external" in the C
897 sense, but are not part of the PCRE public API. */
898
899 extern int _pcre_ord2utf8(int, uschar *);
900 extern void _pcre_printint(pcre *, FILE *);
901 extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *,
902 const pcre_study_data *, pcre_study_data *);
903 extern int _pcre_ucp_findchar(const int, int *, int *);
904 extern int _pcre_valid_utf8(const uschar *, int);
905 extern BOOL _pcre_xclass(int, const uschar *);
906
907 /* End of pcre_internal.h */