Commit | Line | Data |
---|---|---|
92e772ff | 1 | /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */ |
8ac170f3 PH |
2 | |
3 | /************************************************* | |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | /* PCRE is a library of functions to support regular expressions whose syntax | |
8 | and semantics are as close as possible to those of the Perl 5 language. | |
9 | ||
10 | Written by Philip Hazel | |
11 | Copyright (c) 1997-2005 University of Cambridge | |
12 | ||
13 | ----------------------------------------------------------------------------- | |
14 | Redistribution and use in source and binary forms, with or without | |
15 | modification, are permitted provided that the following conditions are met: | |
16 | ||
17 | * Redistributions of source code must retain the above copyright notice, | |
18 | this list of conditions and the following disclaimer. | |
19 | ||
20 | * Redistributions in binary form must reproduce the above copyright | |
21 | notice, this list of conditions and the following disclaimer in the | |
22 | documentation and/or other materials provided with the distribution. | |
23 | ||
24 | * Neither the name of the University of Cambridge nor the names of its | |
25 | contributors may be used to endorse or promote products derived from | |
26 | this software without specific prior written permission. | |
27 | ||
28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
29 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
30 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
31 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
32 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
33 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
35 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
36 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
37 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
38 | POSSIBILITY OF SUCH DAMAGE. | |
39 | ----------------------------------------------------------------------------- | |
40 | */ | |
41 | ||
42 | ||
43 | /* This module contains the external function pcre_compile(), along with | |
44 | supporting internal functions that are not used by other modules. */ | |
45 | ||
46 | ||
47 | #include "pcre_internal.h" | |
48 | ||
49 | ||
50 | /************************************************* | |
51 | * Code parameters and static tables * | |
52 | *************************************************/ | |
53 | ||
54 | /* Maximum number of items on the nested bracket stacks at compile time. This | |
55 | applies to the nesting of all kinds of parentheses. It does not limit | |
56 | un-nested, non-capturing parentheses. This number can be made bigger if | |
57 | necessary - it is used to dimension one int and one unsigned char vector at | |
58 | compile time. */ | |
59 | ||
60 | #define BRASTACK_SIZE 200 | |
61 | ||
62 | ||
63 | /* Table for handling escaped characters in the range '0'-'z'. Positive returns | |
64 | are simple data values; negative values are for special things like \d and so | |
65 | on. Zero means further processing is needed (for things like \x), or the escape | |
66 | is invalid. */ | |
67 | ||
68 | #if !EBCDIC /* This is the "normal" table for ASCII systems */ | |
69 | static const short int escapes[] = { | |
70 | 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ | |
71 | 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ | |
72 | '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ | |
73 | 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ | |
74 | -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ | |
75 | -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ | |
76 | '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ | |
77 | 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */ | |
78 | -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ | |
79 | 0, 0, -ESC_z /* x - z */ | |
80 | }; | |
81 | ||
82 | #else /* This is the "abnormal" table for EBCDIC systems */ | |
83 | static const short int escapes[] = { | |
84 | /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', | |
85 | /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, | |
86 | /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', | |
87 | /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, | |
88 | /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', | |
89 | /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
90 | /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', | |
91 | /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, | |
92 | /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, | |
93 | /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p, | |
94 | /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, | |
95 | /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, | |
96 | /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, | |
97 | /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
98 | /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', | |
99 | /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, | |
100 | /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
101 | /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, | |
102 | /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0, | |
103 | /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, | |
104 | /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, | |
105 | /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
106 | /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 | |
107 | }; | |
108 | #endif | |
109 | ||
110 | ||
111 | /* Tables of names of POSIX character classes and their lengths. The list is | |
112 | terminated by a zero length entry. The first three must be alpha, upper, lower, | |
113 | as this is assumed for handling case independence. */ | |
114 | ||
115 | static const char *const posix_names[] = { | |
116 | "alpha", "lower", "upper", | |
117 | "alnum", "ascii", "blank", "cntrl", "digit", "graph", | |
118 | "print", "punct", "space", "word", "xdigit" }; | |
119 | ||
120 | static const uschar posix_name_lengths[] = { | |
121 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; | |
122 | ||
123 | /* Table of class bit maps for each POSIX class; up to three may be combined | |
124 | to form the class. The table for [:blank:] is dynamically modified to remove | |
125 | the vertical space characters. */ | |
126 | ||
127 | static const int posix_class_maps[] = { | |
128 | cbit_lower, cbit_upper, -1, /* alpha */ | |
129 | cbit_lower, -1, -1, /* lower */ | |
130 | cbit_upper, -1, -1, /* upper */ | |
131 | cbit_digit, cbit_lower, cbit_upper, /* alnum */ | |
132 | cbit_print, cbit_cntrl, -1, /* ascii */ | |
133 | cbit_space, -1, -1, /* blank - a GNU extension */ | |
134 | cbit_cntrl, -1, -1, /* cntrl */ | |
135 | cbit_digit, -1, -1, /* digit */ | |
136 | cbit_graph, -1, -1, /* graph */ | |
137 | cbit_print, -1, -1, /* print */ | |
138 | cbit_punct, -1, -1, /* punct */ | |
139 | cbit_space, -1, -1, /* space */ | |
140 | cbit_word, -1, -1, /* word - a Perl extension */ | |
141 | cbit_xdigit,-1, -1 /* xdigit */ | |
142 | }; | |
143 | ||
144 | ||
145 | /* The texts of compile-time error messages. These are "char *" because they | |
146 | are passed to the outside world. */ | |
147 | ||
148 | static const char *error_texts[] = { | |
149 | "no error", | |
150 | "\\ at end of pattern", | |
151 | "\\c at end of pattern", | |
152 | "unrecognized character follows \\", | |
153 | "numbers out of order in {} quantifier", | |
154 | /* 5 */ | |
155 | "number too big in {} quantifier", | |
156 | "missing terminating ] for character class", | |
157 | "invalid escape sequence in character class", | |
158 | "range out of order in character class", | |
159 | "nothing to repeat", | |
160 | /* 10 */ | |
161 | "operand of unlimited repeat could match the empty string", | |
162 | "internal error: unexpected repeat", | |
163 | "unrecognized character after (?", | |
164 | "POSIX named classes are supported only within a class", | |
165 | "missing )", | |
166 | /* 15 */ | |
167 | "reference to non-existent subpattern", | |
168 | "erroffset passed as NULL", | |
169 | "unknown option bit(s) set", | |
170 | "missing ) after comment", | |
171 | "parentheses nested too deeply", | |
172 | /* 20 */ | |
173 | "regular expression too large", | |
174 | "failed to get memory", | |
175 | "unmatched parentheses", | |
176 | "internal error: code overflow", | |
177 | "unrecognized character after (?<", | |
178 | /* 25 */ | |
179 | "lookbehind assertion is not fixed length", | |
180 | "malformed number after (?(", | |
181 | "conditional group contains more than two branches", | |
182 | "assertion expected after (?(", | |
183 | "(?R or (?digits must be followed by )", | |
184 | /* 30 */ | |
185 | "unknown POSIX class name", | |
186 | "POSIX collating elements are not supported", | |
187 | "this version of PCRE is not compiled with PCRE_UTF8 support", | |
188 | "spare error", | |
189 | "character value in \\x{...} sequence is too large", | |
190 | /* 35 */ | |
191 | "invalid condition (?(0)", | |
192 | "\\C not allowed in lookbehind assertion", | |
193 | "PCRE does not support \\L, \\l, \\N, \\U, or \\u", | |
194 | "number after (?C is > 255", | |
195 | "closing ) for (?C expected", | |
196 | /* 40 */ | |
197 | "recursive call could loop indefinitely", | |
198 | "unrecognized character after (?P", | |
199 | "syntax error after (?P", | |
200 | "two named groups have the same name", | |
201 | "invalid UTF-8 string", | |
202 | /* 45 */ | |
203 | "support for \\P, \\p, and \\X has not been compiled", | |
204 | "malformed \\P or \\p sequence", | |
205 | "unknown property name after \\P or \\p" | |
206 | }; | |
207 | ||
208 | ||
209 | /* Table to identify digits and hex digits. This is used when compiling | |
210 | patterns. Note that the tables in chartables are dependent on the locale, and | |
211 | may mark arbitrary characters as digits - but the PCRE compiling code expects | |
212 | to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have | |
213 | a private table here. It costs 256 bytes, but it is a lot faster than doing | |
214 | character value tests (at least in some simple cases I timed), and in some | |
215 | applications one wants PCRE to compile efficiently as well as match | |
216 | efficiently. | |
217 | ||
218 | For convenience, we use the same bit definitions as in chartables: | |
219 | ||
220 | 0x04 decimal digit | |
221 | 0x08 hexadecimal digit | |
222 | ||
223 | Then we can use ctype_digit and ctype_xdigit in the code. */ | |
224 | ||
225 | #if !EBCDIC /* This is the "normal" case, for ASCII systems */ | |
226 | static const unsigned char digitab[] = | |
227 | { | |
228 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ | |
229 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
230 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ | |
231 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
232 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ | |
233 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ | |
234 | 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ | |
235 | 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ | |
236 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ | |
237 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ | |
238 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ | |
239 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ | |
240 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ | |
241 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ | |
242 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ | |
243 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ | |
244 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ | |
245 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ | |
246 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ | |
247 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ | |
248 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ | |
249 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ | |
250 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ | |
251 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ | |
252 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ | |
253 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ | |
254 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ | |
255 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ | |
256 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ | |
257 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ | |
258 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ | |
259 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ | |
260 | ||
261 | #else /* This is the "abnormal" case, for EBCDIC systems */ | |
262 | static const unsigned char digitab[] = | |
263 | { | |
264 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ | |
265 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
266 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ | |
267 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
268 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ | |
269 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ | |
270 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ | |
271 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ | |
272 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ | |
273 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ | |
274 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ | |
275 |