Commit | Line | Data |
---|---|---|
aa41d2de | 1 | /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.3 2006/11/07 16:50:36 ph10 Exp $ */ |
8ac170f3 PH |
2 | |
3 | /************************************************* | |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | /* PCRE is a library of functions to support regular expressions whose syntax | |
8 | and semantics are as close as possible to those of the Perl 5 language. | |
9 | ||
10 | Written by Philip Hazel | |
aa41d2de | 11 | Copyright (c) 1997-2006 University of Cambridge |
8ac170f3 PH |
12 | |
13 | ----------------------------------------------------------------------------- | |
14 | Redistribution and use in source and binary forms, with or without | |
15 | modification, are permitted provided that the following conditions are met: | |
16 | ||
17 | * Redistributions of source code must retain the above copyright notice, | |
18 | this list of conditions and the following disclaimer. | |
19 | ||
20 | * Redistributions in binary form must reproduce the above copyright | |
21 | notice, this list of conditions and the following disclaimer in the | |
22 | documentation and/or other materials provided with the distribution. | |
23 | ||
24 | * Neither the name of the University of Cambridge nor the names of its | |
25 | contributors may be used to endorse or promote products derived from | |
26 | this software without specific prior written permission. | |
27 | ||
28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
29 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
30 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
31 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
32 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
33 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
35 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
36 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
37 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
38 | POSSIBILITY OF SUCH DAMAGE. | |
39 | ----------------------------------------------------------------------------- | |
40 | */ | |
41 | ||
42 | ||
43 | /* This module contains the external function pcre_compile(), along with | |
44 | supporting internal functions that are not used by other modules. */ | |
45 | ||
46 | ||
aa41d2de | 47 | #define NLBLOCK cd /* The block containing newline information */ |
8ac170f3 PH |
48 | #include "pcre_internal.h" |
49 | ||
50 | ||
aa41d2de PH |
51 | /* When DEBUG is defined, we need the pcre_printint() function, which is also |
52 | used by pcretest. DEBUG is not defined when building a production library. */ | |
53 | ||
54 | #ifdef DEBUG | |
55 | #include "pcre_printint.src" | |
56 | #endif | |
57 | ||
58 | ||
59 | ||
8ac170f3 PH |
60 | /************************************************* |
61 | * Code parameters and static tables * | |
62 | *************************************************/ | |
63 | ||
64 | /* Maximum number of items on the nested bracket stacks at compile time. This | |
65 | applies to the nesting of all kinds of parentheses. It does not limit | |
66 | un-nested, non-capturing parentheses. This number can be made bigger if | |
67 | necessary - it is used to dimension one int and one unsigned char vector at | |
68 | compile time. */ | |
69 | ||
70 | #define BRASTACK_SIZE 200 | |
71 | ||
72 | ||
73 | /* Table for handling escaped characters in the range '0'-'z'. Positive returns | |
74 | are simple data values; negative values are for special things like \d and so | |
75 | on. Zero means further processing is needed (for things like \x), or the escape | |
76 | is invalid. */ | |
77 | ||
78 | #if !EBCDIC /* This is the "normal" table for ASCII systems */ | |
79 | static const short int escapes[] = { | |
80 | 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ | |
81 | 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ | |
82 | '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ | |
83 | 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ | |
84 | -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ | |
85 | -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ | |
86 | '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ | |
87 | 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */ | |
88 | -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ | |
89 | 0, 0, -ESC_z /* x - z */ | |
90 | }; | |
91 | ||
92 | #else /* This is the "abnormal" table for EBCDIC systems */ | |
93 | static const short int escapes[] = { | |
94 | /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', | |
95 | /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, | |
96 | /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', | |
97 | /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, | |
98 | /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', | |
99 | /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
100 | /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', | |
101 | /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, | |
102 | /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, | |
103 | /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p, | |
104 | /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, | |
105 | /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, | |
106 | /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, | |
107 | /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
108 | /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', | |
109 | /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, | |
110 | /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
111 | /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, | |
112 | /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0, | |
113 | /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, | |
114 | /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, | |
115 | /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
116 | /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 | |
117 | }; | |
118 | #endif | |
119 | ||
120 | ||
121 | /* Tables of names of POSIX character classes and their lengths. The list is | |
aa41d2de | 122 | terminated by a zero length entry. The first three must be alpha, lower, upper, |
8ac170f3 PH |
123 | as this is assumed for handling case independence. */ |
124 | ||
125 | static const char *const posix_names[] = { | |
126 | "alpha", "lower", "upper", | |
127 | "alnum", "ascii", "blank", "cntrl", "digit", "graph", | |
128 | "print", "punct", "space", "word", "xdigit" }; | |
129 | ||
130 | static const uschar posix_name_lengths[] = { | |
131 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; | |
132 | ||
aa41d2de PH |
133 | /* Table of class bit maps for each POSIX class. Each class is formed from a |
134 | base map, with an optional addition or removal of another map. Then, for some | |
135 | classes, there is some additional tweaking: for [:blank:] the vertical space | |
136 | characters are removed, and for [:alpha:] and [:alnum:] the underscore | |
137 | character is removed. The triples in the table consist of the base map offset, | |
138 | second map offset or -1 if no second map, and a non-negative value for map | |
139 | addition or a negative value for map subtraction (if there are two maps). The | |
140 | absolute value of the third field has these meanings: 0 => no tweaking, 1 => | |
141 | remove vertical space characters, 2 => remove underscore. */ | |
8ac170f3 PH |
142 | |
143 | static const int posix_class_maps[] = { | |
aa41d2de PH |
144 | cbit_word, cbit_digit, -2, /* alpha */ |
145 | cbit_lower, -1, 0, /* lower */ | |
146 | cbit_upper, -1, 0, /* upper */ | |
147 | cbit_word, -1, 2, /* alnum - word without underscore */ | |
148 | cbit_print, cbit_cntrl, 0, /* ascii */ | |
149 | cbit_space, -1, 1, /* blank - a GNU extension */ | |
150 | cbit_cntrl, -1, 0, /* cntrl */ | |
151 | cbit_digit, -1, 0, /* digit */ | |
152 | cbit_graph, -1, 0, /* graph */ | |
153 | cbit_print, -1, 0, /* print */ | |
154 | cbit_punct, -1, 0, /* punct */ | |
155 | cbit_space, -1, 0, /* space */ | |
156 | cbit_word, -1, 0, /* word - a Perl extension */ | |
157 | cbit_xdigit,-1, 0 /* xdigit */ | |
8ac170f3 PH |
158 | }; |
159 | ||
160 | ||
161 | /* The texts of compile-time error messages. These are "char *" because they | |
162 | are passed to the outside world. */ | |
163 | ||
164 | static const char *error_texts[] = { | |
165 | "no error", | |
166 | "\\ at end of pattern", | |
167 | "\\c at end of pattern", | |
168 | "unrecognized character follows \\", | |
169 | "numbers out of order in {} quantifier", | |
170 | /* 5 */ | |
171 | "number too big in {} quantifier", | |
172 | "missing terminating ] for character class", | |
173 | "invalid escape sequence in character class", | |
174 | "range out of order in character class", | |
175 | "nothing to repeat", | |
176 | /* 10 */ | |
177 | "operand of unlimited repeat could match the empty string", | |
178 | "internal error: unexpected repeat", | |
179 | "unrecognized character after (?", | |
180 | "POSIX named classes are supported only within a class", | |
181 | "missing )", | |
182 | /* 15 */ | |
183 | "reference to non-existent subpattern", | |
184 | "erroffset passed as NULL", | |
185 | "unknown option bit(s) set", | |
186 | "missing ) after comment", | |
187 | "parentheses nested too deeply", | |
188 | /* 20 */ | |
189 | "regular expression too large", | |
190 | "failed to get memory", | |
191 | "unmatched parentheses", | |
192 | "internal error: code overflow", | |
193 | "unrecognized character after (?<", | |
194 | /* 25 */ | |
195 | "lookbehind assertion is not fixed length", | |
aa41d2de | 196 | "malformed number or name after (?(", |
8ac170f3 PH |
197 | "conditional group contains more than two branches", |
198 | "assertion expected after (?(", | |
199 | "(?R or (?digits must be followed by )", | |
200 | /* 30 */ | |
201 | "unknown POSIX class name", | |
202 | "POSIX collating elements are not supported", | |
203 | "this version of PCRE is not compiled with PCRE_UTF8 support", | |
204 | "spare error", | |
205 | "character value in \\x{...} sequence is too large", | |
206 | /* 35 */ | |
207 | "invalid condition (?(0)", | |
208 | "\\C not allowed in lookbehind assertion", | |
209 | "PCRE does not support \\L, \\l, \\N, \\U, or \\u", | |
210 | "number after (?C is > 255", | |
211 | "closing ) for (?C expected", | |
212 | /* 40 */ | |
213 | "recursive call could loop indefinitely", | |
214 | "unrecognized character after (?P", | |
215 | "syntax error after (?P", | |
aa41d2de | 216 | "two named subpatterns have the same name", |
8ac170f3 PH |
217 | "invalid UTF-8 string", |
218 | /* 45 */ | |
219 | "support for \\P, \\p, and \\X has not been compiled", | |
220 | "malformed \\P or \\p sequence", | |
aa41d2de PH |
221 | "unknown property name after \\P or \\p", |
222 | "subpattern name is too long (maximum 32 characters)", | |
223 | "too many named subpatterns (maximum 10,000)", | |
224 | /* 50 */ | |
225 | "repeated subpattern is too long", | |
226 | "octal value is greater than \\377 (not in UTF-8 mode)" | |
8ac170f3 PH |
227 | }; |
228 | ||
229 | ||
230 | /* Table to identify digits and hex digits. This is used when compiling | |
231 | patterns. Note that the tables in chartables are dependent on the locale, and | |
232 | may mark arbitrary characters as digits - but the PCRE compiling code expects | |
233 | to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have | |
234 | a private table here. It costs 256 bytes, but it is a lot faster than doing | |
235 | character value tests (at least in some simple cases I timed), and in some | |
236 | applications one wants PCRE to compile efficiently as well as match | |
237 | efficiently. | |
238 | ||
239 | For convenience, we use the same bit definitions as in chartables: | |
240 | ||
241 | 0x04 decimal digit | |
242 | 0x08 hexadecimal digit | |
243 | ||
244 | Then we can use ctype_digit and ctype_xdigit in the code. */ | |
245 | ||
246 | #if !EBCDIC /* This is the "normal" case, for ASCII systems */ | |
247 | static const unsigned char digitab[] = | |
248 | { | |
249 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ | |
250 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
251 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ | |
252 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
253 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ | |
254 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ | |
255 | 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ | |
256 | 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ | |
257 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ | |
258 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ | |
259 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ | |
260 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ | |
261 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ | |
262 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ | |
263 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ | |
264 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ | |
265 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ | |
266 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ | |
267 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ | |
268 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ | |
269 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ | |
270 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ | |
271 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ | |
272 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ | |
273 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ | |
274 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ | |
275 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ | |
276 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ | |
277 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ | |
278 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ | |
279 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ | |
280 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ | |
281 | ||
282 | #else /* This is the "abnormal" case, for EBCDIC systems */ | |
283 | static const unsigned char digitab[] = | |
284 | { | |
285 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ | |
286 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
287 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ | |
288 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
289 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ | |
290 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ | |
291 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ | |
292 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ | |
293 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ | |
294 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ | |
295 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ | |
296 |