Commit | Line | Data |
---|---|---|
6bf342e1 | 1 | /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */ |
8ac170f3 PH |
2 | |
3 | /************************************************* | |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | /* PCRE is a library of functions to support regular expressions whose syntax | |
8 | and semantics are as close as possible to those of the Perl 5 language. | |
9 | ||
10 | Written by Philip Hazel | |
aa41d2de | 11 | Copyright (c) 1997-2006 University of Cambridge |
8ac170f3 PH |
12 | |
13 | ----------------------------------------------------------------------------- | |
14 | Redistribution and use in source and binary forms, with or without | |
15 | modification, are permitted provided that the following conditions are met: | |
16 | ||
17 | * Redistributions of source code must retain the above copyright notice, | |
18 | this list of conditions and the following disclaimer. | |
19 | ||
20 | * Redistributions in binary form must reproduce the above copyright | |
21 | notice, this list of conditions and the following disclaimer in the | |
22 | documentation and/or other materials provided with the distribution. | |
23 | ||
24 | * Neither the name of the University of Cambridge nor the names of its | |
25 | contributors may be used to endorse or promote products derived from | |
26 | this software without specific prior written permission. | |
27 | ||
28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
29 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
30 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
31 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
32 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
33 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
35 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
36 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
37 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
38 | POSSIBILITY OF SUCH DAMAGE. | |
39 | ----------------------------------------------------------------------------- | |
40 | */ | |
41 | ||
42 | ||
43 | /* This module contains the external function pcre_compile(), along with | |
44 | supporting internal functions that are not used by other modules. */ | |
45 | ||
46 | ||
6bf342e1 PH |
47 | #define NLBLOCK cd /* Block containing newline information */ |
48 | #define PSSTART start_pattern /* Field containing processed string start */ | |
49 | #define PSEND end_pattern /* Field containing processed string end */ | |
50 | ||
51 | ||
8ac170f3 PH |
52 | #include "pcre_internal.h" |
53 | ||
54 | ||
aa41d2de PH |
55 | /* When DEBUG is defined, we need the pcre_printint() function, which is also |
56 | used by pcretest. DEBUG is not defined when building a production library. */ | |
57 | ||
58 | #ifdef DEBUG | |
59 | #include "pcre_printint.src" | |
60 | #endif | |
61 | ||
62 | ||
8ac170f3 PH |
63 | /************************************************* |
64 | * Code parameters and static tables * | |
65 | *************************************************/ | |
66 | ||
6bf342e1 PH |
67 | /* This value specifies the size of stack workspace that is used during the |
68 | first pre-compile phase that determines how much memory is required. The regex | |
69 | is partly compiled into this space, but the compiled parts are discarded as | |
70 | soon as they can be, so that hopefully there will never be an overrun. The code | |
71 | does, however, check for an overrun. The largest amount I've seen used is 218, | |
72 | so this number is very generous. | |
73 | ||
74 | The same workspace is used during the second, actual compile phase for | |
75 | remembering forward references to groups so that they can be filled in at the | |
76 | end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE | |
77 | is 4 there is plenty of room. */ | |
8ac170f3 | 78 | |
6bf342e1 | 79 | #define COMPILE_WORK_SIZE (4096) |
8ac170f3 PH |
80 | |
81 | ||
82 | /* Table for handling escaped characters in the range '0'-'z'. Positive returns | |
83 | are simple data values; negative values are for special things like \d and so | |
84 | on. Zero means further processing is needed (for things like \x), or the escape | |
85 | is invalid. */ | |
86 | ||
87 | #if !EBCDIC /* This is the "normal" table for ASCII systems */ | |
88 | static const short int escapes[] = { | |
89 | 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ | |
90 | 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ | |
91 | '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ | |
92 | 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ | |
6bf342e1 | 93 | -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ |
8ac170f3 PH |
94 | -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ |
95 | '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ | |
6bf342e1 | 96 | 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ |
8ac170f3 PH |
97 | -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ |
98 | 0, 0, -ESC_z /* x - z */ | |
99 | }; | |
100 | ||
101 | #else /* This is the "abnormal" table for EBCDIC systems */ | |
102 | static const short int escapes[] = { | |
103 | /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', | |
104 | /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, | |
105 | /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', | |
106 | /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, | |
107 | /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', | |
108 | /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
109 | /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', | |
110 | /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, | |
111 | /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, | |
6bf342e1 | 112 | /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, |
8ac170f3 PH |
113 | /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, |
114 | /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, | |
115 | /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, | |
116 | /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
117 | /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', | |
118 | /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, | |
119 | /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
120 | /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, | |
6bf342e1 | 121 | /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, |
8ac170f3 PH |
122 | /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, |
123 | /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, | |
124 | /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
125 | /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 | |
126 | }; | |
127 | #endif | |
128 | ||
129 | ||
130 | /* Tables of names of POSIX character classes and their lengths. The list is | |
aa41d2de | 131 | terminated by a zero length entry. The first three must be alpha, lower, upper, |
8ac170f3 PH |
132 | as this is assumed for handling case independence. */ |
133 | ||
134 | static const char *const posix_names[] = { | |
135 | "alpha", "lower", "upper", | |
136 | "alnum", "ascii", "blank", "cntrl", "digit", "graph", | |
137 | "print", "punct", "space", "word", "xdigit" }; | |
138 | ||
139 | static const uschar posix_name_lengths[] = { | |
140 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; | |
141 | ||
aa41d2de PH |
142 | /* Table of class bit maps for each POSIX class. Each class is formed from a |
143 | base map, with an optional addition or removal of another map. Then, for some | |
144 | classes, there is some additional tweaking: for [:blank:] the vertical space | |
145 | characters are removed, and for [:alpha:] and [:alnum:] the underscore | |
146 | character is removed. The triples in the table consist of the base map offset, | |
147 | second map offset or -1 if no second map, and a non-negative value for map | |
148 | addition or a negative value for map subtraction (if there are two maps). The | |
149 | absolute value of the third field has these meanings: 0 => no tweaking, 1 => | |
150 | remove vertical space characters, 2 => remove underscore. */ | |
8ac170f3 PH |
151 | |
152 | static const int posix_class_maps[] = { | |
aa41d2de PH |
153 | cbit_word, cbit_digit, -2, /* alpha */ |
154 | cbit_lower, -1, 0, /* lower */ | |
155 | cbit_upper, -1, 0, /* upper */ | |
156 | cbit_word, -1, 2, /* alnum - word without underscore */ | |
157 | cbit_print, cbit_cntrl, 0, /* ascii */ | |
158 | cbit_space, -1, 1, /* blank - a GNU extension */ | |
159 | cbit_cntrl, -1, 0, /* cntrl */ | |
160 | cbit_digit, -1, 0, /* digit */ | |
161 | cbit_graph, -1, 0, /* graph */ | |
162 | cbit_print, -1, 0, /* print */ | |
163 | cbit_punct, -1, 0, /* punct */ | |
164 | cbit_space, -1, 0, /* space */ | |
165 | cbit_word, -1, 0, /* word - a Perl extension */ | |
166 | cbit_xdigit,-1, 0 /* xdigit */ | |
8ac170f3 PH |
167 | }; |
168 | ||
169 | ||
6bf342e1 PH |
170 | #define STRING(a) # a |
171 | #define XSTRING(s) STRING(s) | |
172 | ||
8ac170f3 | 173 | /* The texts of compile-time error messages. These are "char *" because they |
6bf342e1 PH |
174 | are passed to the outside world. Do not ever re-use any error number, because |
175 | they are documented. Always add a new error instead. Messages marked DEAD below | |
176 | are no longer used. */ | |
8ac170f3 PH |
177 | |
178 | static const char *error_texts[] = { | |
179 | "no error", | |
180 | "\\ at end of pattern", | |
181 | "\\c at end of pattern", | |
182 | "unrecognized character follows \\", | |
183 | "numbers out of order in {} quantifier", | |
184 | /* 5 */ | |
185 | "number too big in {} quantifier", | |
186 | "missing terminating ] for character class", | |
187 | "invalid escape sequence in character class", | |
188 | "range out of order in character class", | |
189 | "nothing to repeat", | |
190 | /* 10 */ | |
6bf342e1 | 191 | "operand of unlimited repeat could match the empty string", /** DEAD **/ |
8ac170f3 PH |
192 | "internal error: unexpected repeat", |
193 | "unrecognized character after (?", | |
194 | "POSIX named classes are supported only within a class", | |
195 | "missing )", | |
196 | /* 15 */ | |
197 | "reference to non-existent subpattern", | |
198 | "erroffset passed as NULL", | |
199 | "unknown option bit(s) set", | |
200 | "missing ) after comment", | |
6bf342e1 | 201 | "parentheses nested too deeply", /** DEAD **/ |
8ac170f3 PH |
202 | /* 20 */ |
203 | "regular expression too large", | |
204 | "failed to get memory", | |
205 | "unmatched parentheses", | |
206 | "internal error: code overflow", | |
207 | "unrecognized character after (?<", | |
208 | /* 25 */ | |
209 | "lookbehind assertion is not fixed length", | |
aa41d2de | 210 | "malformed number or name after (?(", |
8ac170f3 PH |
211 | "conditional group contains more than two branches", |
212 | "assertion expected after (?(", | |
213 | "(?R or (?digits must be followed by )", | |
214 | /* 30 */ | |
215 | "unknown POSIX class name", | |
216 | "POSIX collating elements are not supported", | |
217 | "this version of PCRE is not compiled with PCRE_UTF8 support", | |
6bf342e1 | 218 | "spare error", /** DEAD **/ |
8ac170f3 PH |
219 | "character value in \\x{...} sequence is too large", |
220 | /* 35 */ | |
221 | "invalid condition (?(0)", | |
222 | "\\C not allowed in lookbehind assertion", | |
223 | "PCRE does not support \\L, \\l, \\N, \\U, or \\u", | |
224 | "number after (?C is > 255", | |
225 | "closing ) for (?C expected", | |
226 | /* 40 */ | |
227 | "recursive call could loop indefinitely", | |
228 | "unrecognized character after (?P", | |
6bf342e1 | 229 | "syntax error in subpattern name (missing terminator)", |
aa41d2de | 230 | "two named subpatterns have the same name", |
8ac170f3 PH |
231 | "invalid UTF-8 string", |
232 | /* 45 */ | |
233 | "support for \\P, \\p, and \\X has not been compiled", | |
234 | "malformed \\P or \\p sequence", | |
aa41d2de | 235 | "unknown property name after \\P or \\p", |
6bf342e1 PH |
236 | "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)", |
237 | "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")", | |
aa41d2de PH |
238 | /* 50 */ |
239 | "repeated subpattern is too long", | |
6bf342e1 PH |
240 | "octal value is greater than \\377 (not in UTF-8 mode)", |
241 | "internal error: overran compiling workspace", | |
242 | "internal error: previously-checked referenced subpattern not found", | |
243 | "DEFINE group contains more than one branch", | |
244 | /* 55 */ | |
245 | "repeating a DEFINE group is not allowed", | |
246 | "inconsistent NEWLINE options", | |
247 | "\\g is not followed by an (optionally braced) non-zero number" | |
8ac170f3 PH |
248 | }; |
249 | ||
250 | ||
251 | /* Table to identify digits and hex digits. This is used when compiling | |
252 | patterns. Note that the tables in chartables are dependent on the locale, and | |
253 | may mark arbitrary characters as digits - but the PCRE compiling code expects | |
254 | to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have | |
255 | a private table here. It costs 256 bytes, but it is a lot faster than doing | |
256 | character value tests (at least in some simple cases I timed), and in some | |
257 | applications one wants PCRE to compile efficiently as well as match | |
258 | efficiently. | |
259 | ||
260 | For convenience, we use the same bit definitions as in chartables: | |
261 | ||
262 | 0x04 decimal digit | |
263 | 0x08 hexadecimal digit | |
264 | ||
265 | Then we can use ctype_digit and ctype_xdigit in the code. */ | |
266 | ||
267 | #if !EBCDIC /* This is the "normal" case, for ASCII systems */ | |
268 | static const unsigned char digitab[] = | |
269 | { | |
270 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ | |
271 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
272 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ | |
273 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
274 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ | |
275 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ | |
276 | 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ | |
277 | 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ | |
278 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ | |
279 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ | |
280 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ | |
281 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ | |
282 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ | |
283 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ | |
284 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ | |
285 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ | |
286 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ | |
287 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ | |
288 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ | |
289 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ | |
290 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ | |
291 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ | |
292 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ | |
293 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ | |
294 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ | |
295 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ | |
296 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ | |
297 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ | |
298 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ | |
299 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ | |
300 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ | |
301 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ | |
302 | ||
303 | #else /* This is the "abnormal" case, for EBCDIC systems */ | |
304 | static const unsigned char digitab[] = | |
305 | { | |
306 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ | |
307 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
308 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ | |
309 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
310 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ | |
311 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ | |
312 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ | |
313 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ | |
314 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ | |
315 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ | |
316 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ | |
317 |