MH's patch for allowing control characters as list separators.
[exim.git] / src / src / pcre / pcre_compile.c
CommitLineData
6bf342e1 1/* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */
8ac170f3
PH
2
3/*************************************************
4* Perl-Compatible Regular Expressions *
5*************************************************/
6
7/* PCRE is a library of functions to support regular expressions whose syntax
8and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
aa41d2de 11 Copyright (c) 1997-2006 University of Cambridge
8ac170f3
PH
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42
43/* This module contains the external function pcre_compile(), along with
44supporting internal functions that are not used by other modules. */
45
46
6bf342e1
PH
47#define NLBLOCK cd /* Block containing newline information */
48#define PSSTART start_pattern /* Field containing processed string start */
49#define PSEND end_pattern /* Field containing processed string end */
50
51
8ac170f3
PH
52#include "pcre_internal.h"
53
54
aa41d2de
PH
55/* When DEBUG is defined, we need the pcre_printint() function, which is also
56used by pcretest. DEBUG is not defined when building a production library. */
57
58#ifdef DEBUG
59#include "pcre_printint.src"
60#endif
61
62
8ac170f3
PH
63/*************************************************
64* Code parameters and static tables *
65*************************************************/
66
6bf342e1
PH
67/* This value specifies the size of stack workspace that is used during the
68first pre-compile phase that determines how much memory is required. The regex
69is partly compiled into this space, but the compiled parts are discarded as
70soon as they can be, so that hopefully there will never be an overrun. The code
71does, however, check for an overrun. The largest amount I've seen used is 218,
72so this number is very generous.
73
74The same workspace is used during the second, actual compile phase for
75remembering forward references to groups so that they can be filled in at the
76end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
77is 4 there is plenty of room. */
8ac170f3 78
6bf342e1 79#define COMPILE_WORK_SIZE (4096)
8ac170f3
PH
80
81
82/* Table for handling escaped characters in the range '0'-'z'. Positive returns
83are simple data values; negative values are for special things like \d and so
84on. Zero means further processing is needed (for things like \x), or the escape
85is invalid. */
86
87#if !EBCDIC /* This is the "normal" table for ASCII systems */
88static const short int escapes[] = {
89 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
90 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
91 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
92 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
6bf342e1 93-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
8ac170f3
PH
94-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
95 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
6bf342e1 96 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
8ac170f3
PH
97-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
98 0, 0, -ESC_z /* x - z */
99};
100
101#else /* This is the "abnormal" table for EBCDIC systems */
102static const short int escapes[] = {
103/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
104/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
105/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
106/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
107/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
108/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
109/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
110/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
111/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
6bf342e1 112/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
8ac170f3
PH
113/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
114/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
115/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
116/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
117/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
118/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
119/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
120/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
6bf342e1 121/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
8ac170f3
PH
122/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
123/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
124/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
125/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
126};
127#endif
128
129
130/* Tables of names of POSIX character classes and their lengths. The list is
aa41d2de 131terminated by a zero length entry. The first three must be alpha, lower, upper,
8ac170f3
PH
132as this is assumed for handling case independence. */
133
134static const char *const posix_names[] = {
135 "alpha", "lower", "upper",
136 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
137 "print", "punct", "space", "word", "xdigit" };
138
139static const uschar posix_name_lengths[] = {
140 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
141
aa41d2de
PH
142/* Table of class bit maps for each POSIX class. Each class is formed from a
143base map, with an optional addition or removal of another map. Then, for some
144classes, there is some additional tweaking: for [:blank:] the vertical space
145characters are removed, and for [:alpha:] and [:alnum:] the underscore
146character is removed. The triples in the table consist of the base map offset,
147second map offset or -1 if no second map, and a non-negative value for map
148addition or a negative value for map subtraction (if there are two maps). The
149absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
150remove vertical space characters, 2 => remove underscore. */
8ac170f3
PH
151
152static const int posix_class_maps[] = {
aa41d2de
PH
153 cbit_word, cbit_digit, -2, /* alpha */
154 cbit_lower, -1, 0, /* lower */
155 cbit_upper, -1, 0, /* upper */
156 cbit_word, -1, 2, /* alnum - word without underscore */
157 cbit_print, cbit_cntrl, 0, /* ascii */
158 cbit_space, -1, 1, /* blank - a GNU extension */
159 cbit_cntrl, -1, 0, /* cntrl */
160 cbit_digit, -1, 0, /* digit */
161 cbit_graph, -1, 0, /* graph */
162 cbit_print, -1, 0, /* print */
163 cbit_punct, -1, 0, /* punct */
164 cbit_space, -1, 0, /* space */
165 cbit_word, -1, 0, /* word - a Perl extension */
166 cbit_xdigit,-1, 0 /* xdigit */
8ac170f3
PH
167};
168
169
6bf342e1
PH
170#define STRING(a) # a
171#define XSTRING(s) STRING(s)
172
8ac170f3 173/* The texts of compile-time error messages. These are "char *" because they
6bf342e1
PH
174are passed to the outside world. Do not ever re-use any error number, because
175they are documented. Always add a new error instead. Messages marked DEAD below
176are no longer used. */
8ac170f3
PH
177
178static const char *error_texts[] = {
179 "no error",
180 "\\ at end of pattern",
181 "\\c at end of pattern",
182 "unrecognized character follows \\",
183 "numbers out of order in {} quantifier",
184 /* 5 */
185 "number too big in {} quantifier",
186 "missing terminating ] for character class",
187 "invalid escape sequence in character class",
188 "range out of order in character class",
189 "nothing to repeat",
190 /* 10 */
6bf342e1 191 "operand of unlimited repeat could match the empty string", /** DEAD **/
8ac170f3
PH
192 "internal error: unexpected repeat",
193 "unrecognized character after (?",
194 "POSIX named classes are supported only within a class",
195 "missing )",
196 /* 15 */
197 "reference to non-existent subpattern",
198 "erroffset passed as NULL",
199 "unknown option bit(s) set",
200 "missing ) after comment",
6bf342e1 201 "parentheses nested too deeply", /** DEAD **/
8ac170f3
PH
202 /* 20 */
203 "regular expression too large",
204 "failed to get memory",
205 "unmatched parentheses",
206 "internal error: code overflow",
207 "unrecognized character after (?<",
208 /* 25 */
209 "lookbehind assertion is not fixed length",
aa41d2de 210 "malformed number or name after (?(",
8ac170f3
PH
211 "conditional group contains more than two branches",
212 "assertion expected after (?(",
213 "(?R or (?digits must be followed by )",
214 /* 30 */
215 "unknown POSIX class name",
216 "POSIX collating elements are not supported",
217 "this version of PCRE is not compiled with PCRE_UTF8 support",
6bf342e1 218 "spare error", /** DEAD **/
8ac170f3
PH
219 "character value in \\x{...} sequence is too large",
220 /* 35 */
221 "invalid condition (?(0)",
222 "\\C not allowed in lookbehind assertion",
223 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
224 "number after (?C is > 255",
225 "closing ) for (?C expected",
226 /* 40 */
227 "recursive call could loop indefinitely",
228 "unrecognized character after (?P",
6bf342e1 229 "syntax error in subpattern name (missing terminator)",
aa41d2de 230 "two named subpatterns have the same name",
8ac170f3
PH
231 "invalid UTF-8 string",
232 /* 45 */
233 "support for \\P, \\p, and \\X has not been compiled",
234 "malformed \\P or \\p sequence",
aa41d2de 235 "unknown property name after \\P or \\p",
6bf342e1
PH
236 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
237 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
aa41d2de
PH
238 /* 50 */
239 "repeated subpattern is too long",
6bf342e1
PH
240 "octal value is greater than \\377 (not in UTF-8 mode)",
241 "internal error: overran compiling workspace",
242 "internal error: previously-checked referenced subpattern not found",
243 "DEFINE group contains more than one branch",
244 /* 55 */
245 "repeating a DEFINE group is not allowed",
246 "inconsistent NEWLINE options",
247 "\\g is not followed by an (optionally braced) non-zero number"
8ac170f3
PH
248};
249
250
251/* Table to identify digits and hex digits. This is used when compiling
252patterns. Note that the tables in chartables are dependent on the locale, and
253may mark arbitrary characters as digits - but the PCRE compiling code expects
254to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
255a private table here. It costs 256 bytes, but it is a lot faster than doing
256character value tests (at least in some simple cases I timed), and in some
257applications one wants PCRE to compile efficiently as well as match
258efficiently.
259
260For convenience, we use the same bit definitions as in chartables:
261
262 0x04 decimal digit
263 0x08 hexadecimal digit
264
265Then we can use ctype_digit and ctype_xdigit in the code. */
266
267#if !EBCDIC /* This is the "normal" case, for ASCII systems */
268static const unsigned char digitab[] =
269 {
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
276 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
277 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
278 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
282 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
302
303#else /* This is the "abnormal" case, for EBCDIC systems */
304static const unsigned char digitab[] =
305 {
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
317