Commit | Line | Data |
---|---|---|
c86f6258 PH |
1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * | |
3 | *************************************************/ | |
4 | ||
5 | /* | |
6 | This is a library of functions to support regular expressions whose syntax | |
7 | and semantics are as close as possible to those of the Perl 5 language. See | |
8 | the file Tech.Notes for some information on the internals. | |
9 | ||
10 | Written by: Philip Hazel <ph10@cam.ac.uk> | |
11 | ||
12 | Copyright (c) 1997-2004 University of Cambridge | |
13 | ||
14 | ----------------------------------------------------------------------------- | |
15 | Redistribution and use in source and binary forms, with or without | |
16 | modification, are permitted provided that the following conditions are met: | |
17 | ||
18 | * Redistributions of source code must retain the above copyright notice, | |
19 | this list of conditions and the following disclaimer. | |
20 | ||
21 | * Redistributions in binary form must reproduce the above copyright | |
22 | notice, this list of conditions and the following disclaimer in the | |
23 | documentation and/or other materials provided with the distribution. | |
24 | ||
25 | * Neither the name of the University of Cambridge nor the names of its | |
26 | contributors may be used to endorse or promote products derived from | |
27 | this software without specific prior written permission. | |
28 | ||
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
39 | POSSIBILITY OF SUCH DAMAGE. | |
40 | ----------------------------------------------------------------------------- | |
41 | */ | |
42 | ||
43 | ||
44 | /* Define DEBUG to get debugging output on stdout. */ | |
45 | /* #define DEBUG */ | |
46 | ||
47 | /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef | |
48 | inline, and there are *still* stupid compilers about that don't like indented | |
49 | pre-processor statements. I suppose it's only been 10 years... */ | |
50 | ||
51 | #ifdef DEBUG | |
52 | #define DPRINTF(p) printf p | |
53 | #else | |
54 | #define DPRINTF(p) /*nothing*/ | |
55 | #endif | |
56 | ||
57 | /* Include the internals header, which itself includes "config.h", the Standard | |
58 | C headers, and the external pcre header. */ | |
59 | ||
60 | #include "internal.h" | |
61 | ||
62 | /* If Unicode Property support is wanted, include a private copy of the | |
63 | function that does it, and the table that translates names to numbers. */ | |
64 | ||
65 | #ifdef SUPPORT_UCP | |
66 | #include "ucp.c" | |
67 | #include "ucptypetable.c" | |
68 | #endif | |
69 | ||
70 | /* Maximum number of items on the nested bracket stacks at compile time. This | |
71 | applies to the nesting of all kinds of parentheses. It does not limit | |
72 | un-nested, non-capturing parentheses. This number can be made bigger if | |
73 | necessary - it is used to dimension one int and one unsigned char vector at | |
74 | compile time. */ | |
75 | ||
76 | #define BRASTACK_SIZE 200 | |
77 | ||
78 | ||
79 | /* Maximum number of ints of offset to save on the stack for recursive calls. | |
80 | If the offset vector is bigger, malloc is used. This should be a multiple of 3, | |
81 | because the offset vector is always a multiple of 3 long. */ | |
82 | ||
83 | #define REC_STACK_SAVE_MAX 30 | |
84 | ||
85 | ||
86 | /* The maximum remaining length of subject we are prepared to search for a | |
87 | req_byte match. */ | |
88 | ||
89 | #define REQ_BYTE_MAX 1000 | |
90 | ||
91 | ||
92 | /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that | |
93 | the definition is next to the definition of the opcodes in internal.h. */ | |
94 | ||
95 | static const uschar OP_lengths[] = { OP_LENGTHS }; | |
96 | ||
97 | /* Min and max values for the common repeats; for the maxima, 0 => infinity */ | |
98 | ||
99 | static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; | |
100 | static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; | |
101 | ||
102 | /* Table for handling escaped characters in the range '0'-'z'. Positive returns | |
103 | are simple data values; negative values are for special things like \d and so | |
104 | on. Zero means further processing is needed (for things like \x), or the escape | |
105 | is invalid. */ | |
106 | ||
107 | #if !EBCDIC /* This is the "normal" table for ASCII systems */ | |
108 | static const short int escapes[] = { | |
109 | 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ | |
110 | 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ | |
111 | '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ | |
112 | 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ | |
113 | -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ | |
114 | -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ | |
115 | '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ | |
116 | 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */ | |
117 | -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ | |
118 | 0, 0, -ESC_z /* x - z */ | |
119 | }; | |
120 | ||
121 | #else /* This is the "abnormal" table for EBCDIC systems */ | |
122 | static const short int escapes[] = { | |
123 | /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', | |
124 | /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, | |
125 | /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', | |
126 | /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, | |
127 | /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', | |
128 | /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
129 | /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', | |
130 | /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, | |
131 | /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, | |
132 | /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p, | |
133 | /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, | |
134 | /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, | |
135 | /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, | |
136 | /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
137 | /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', | |
138 | /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, | |
139 | /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
140 | /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, | |
141 | /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0, | |
142 | /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, | |
143 | /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, | |
144 | /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
145 | /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 | |
146 | }; | |
147 | #endif | |
148 | ||
149 | ||
150 | /* Tables of names of POSIX character classes and their lengths. The list is | |
151 | terminated by a zero length entry. The first three must be alpha, upper, lower, | |
152 | as this is assumed for handling case independence. */ | |
153 | ||
154 | static const char *const posix_names[] = { | |
155 | "alpha", "lower", "upper", | |
156 | "alnum", "ascii", "blank", "cntrl", "digit", "graph", | |
157 | "print", "punct", "space", "word", "xdigit" }; | |
158 | ||
159 | static const uschar posix_name_lengths[] = { | |
160 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; | |
161 | ||
162 | /* Table of class bit maps for each POSIX class; up to three may be combined | |
163 | to form the class. The table for [:blank:] is dynamically modified to remove | |
164 | the vertical space characters. */ | |
165 | ||
166 | static const int posix_class_maps[] = { | |
167 | cbit_lower, cbit_upper, -1, /* alpha */ | |
168 | cbit_lower, -1, -1, /* lower */ | |
169 | cbit_upper, -1, -1, /* upper */ | |
170 | cbit_digit, cbit_lower, cbit_upper, /* alnum */ | |
171 | cbit_print, cbit_cntrl, -1, /* ascii */ | |
172 | cbit_space, -1, -1, /* blank - a GNU extension */ | |
173 | cbit_cntrl, -1, -1, /* cntrl */ | |
174 | cbit_digit, -1, -1, /* digit */ | |
175 | cbit_graph, -1, -1, /* graph */ | |
176 | cbit_print, -1, -1, /* print */ | |
177 | cbit_punct, -1, -1, /* punct */ | |
178 | cbit_space, -1, -1, /* space */ | |
179 | cbit_word, -1, -1, /* word - a Perl extension */ | |
180 | cbit_xdigit,-1, -1 /* xdigit */ | |
181 | }; | |
182 | ||
183 | /* Table to identify digits and hex digits. This is used when compiling | |
184 | patterns. Note that the tables in chartables are dependent on the locale, and | |
185 | may mark arbitrary characters as digits - but the PCRE compiling code expects | |
186 | to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have | |
187 | a private table here. It costs 256 bytes, but it is a lot faster than doing | |
188 | character value tests (at least in some simple cases I timed), and in some | |
189 | applications one wants PCRE to compile efficiently as well as match | |
190 | efficiently. | |
191 | ||
192 | For convenience, we use the same bit definitions as in chartables: | |
193 | ||
194 | 0x04 decimal digit | |
195 | 0x08 hexadecimal digit | |
196 | ||
197 | Then we can use ctype_digit and ctype_xdigit in the code. */ | |
198 | ||
199 | #if !EBCDIC /* This is the "normal" case, for ASCII systems */ | |
200 | static const unsigned char digitab[] = | |
201 | { | |
202 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ | |
203 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
204 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ | |
205 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
206 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ | |
207 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ | |
208 | 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ | |
209 | 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ | |
210 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ | |
211 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ | |
212 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ | |
213 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ | |
214 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ | |
215 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ | |
216 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ | |
217 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ | |
218 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ | |
219 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ | |
220 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ | |
221 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ | |
222 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ | |
223 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ | |
224 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ | |
225 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ | |
226 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ | |
227 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ | |
228 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ | |
229 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ | |
230 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ | |
231 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ | |
232 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ | |
233 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ | |
234 | ||
235 | #else /* This is the "abnormal" case, for EBCDIC systems */ | |
236 | static const unsigned char digitab[] = | |
237 | { | |
238 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ | |
239 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
240 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ | |
241 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
242 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ | |
243 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ | |
244 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ | |
245 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ | |
246 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ | |
247 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ | |
248 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ | |
249 |