Add some tolerance for QT= value in tests.
[exim.git] / src / src / pcre / pcre_compile.c
CommitLineData
aa41d2de 1/* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.3 2006/11/07 16:50:36 ph10 Exp $ */
8ac170f3
PH
2
3/*************************************************
4* Perl-Compatible Regular Expressions *
5*************************************************/
6
7/* PCRE is a library of functions to support regular expressions whose syntax
8and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
aa41d2de 11 Copyright (c) 1997-2006 University of Cambridge
8ac170f3
PH
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42
43/* This module contains the external function pcre_compile(), along with
44supporting internal functions that are not used by other modules. */
45
46
aa41d2de 47#define NLBLOCK cd /* The block containing newline information */
8ac170f3
PH
48#include "pcre_internal.h"
49
50
aa41d2de
PH
51/* When DEBUG is defined, we need the pcre_printint() function, which is also
52used by pcretest. DEBUG is not defined when building a production library. */
53
54#ifdef DEBUG
55#include "pcre_printint.src"
56#endif
57
58
59
8ac170f3
PH
60/*************************************************
61* Code parameters and static tables *
62*************************************************/
63
64/* Maximum number of items on the nested bracket stacks at compile time. This
65applies to the nesting of all kinds of parentheses. It does not limit
66un-nested, non-capturing parentheses. This number can be made bigger if
67necessary - it is used to dimension one int and one unsigned char vector at
68compile time. */
69
70#define BRASTACK_SIZE 200
71
72
73/* Table for handling escaped characters in the range '0'-'z'. Positive returns
74are simple data values; negative values are for special things like \d and so
75on. Zero means further processing is needed (for things like \x), or the escape
76is invalid. */
77
78#if !EBCDIC /* This is the "normal" table for ASCII systems */
79static const short int escapes[] = {
80 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
81 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
82 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
83 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
84-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
85-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
86 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
87 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
88-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
89 0, 0, -ESC_z /* x - z */
90};
91
92#else /* This is the "abnormal" table for EBCDIC systems */
93static const short int escapes[] = {
94/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
95/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
96/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
97/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
98/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
99/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
100/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
101/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
102/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
103/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
104/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
105/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
106/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
107/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
108/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
109/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
110/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
111/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
112/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
113/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
114/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
115/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
116/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
117};
118#endif
119
120
121/* Tables of names of POSIX character classes and their lengths. The list is
aa41d2de 122terminated by a zero length entry. The first three must be alpha, lower, upper,
8ac170f3
PH
123as this is assumed for handling case independence. */
124
125static const char *const posix_names[] = {
126 "alpha", "lower", "upper",
127 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
128 "print", "punct", "space", "word", "xdigit" };
129
130static const uschar posix_name_lengths[] = {
131 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
132
aa41d2de
PH
133/* Table of class bit maps for each POSIX class. Each class is formed from a
134base map, with an optional addition or removal of another map. Then, for some
135classes, there is some additional tweaking: for [:blank:] the vertical space
136characters are removed, and for [:alpha:] and [:alnum:] the underscore
137character is removed. The triples in the table consist of the base map offset,
138second map offset or -1 if no second map, and a non-negative value for map
139addition or a negative value for map subtraction (if there are two maps). The
140absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
141remove vertical space characters, 2 => remove underscore. */
8ac170f3
PH
142
143static const int posix_class_maps[] = {
aa41d2de
PH
144 cbit_word, cbit_digit, -2, /* alpha */
145 cbit_lower, -1, 0, /* lower */
146 cbit_upper, -1, 0, /* upper */
147 cbit_word, -1, 2, /* alnum - word without underscore */
148 cbit_print, cbit_cntrl, 0, /* ascii */
149 cbit_space, -1, 1, /* blank - a GNU extension */
150 cbit_cntrl, -1, 0, /* cntrl */
151 cbit_digit, -1, 0, /* digit */
152 cbit_graph, -1, 0, /* graph */
153 cbit_print, -1, 0, /* print */
154 cbit_punct, -1, 0, /* punct */
155 cbit_space, -1, 0, /* space */
156 cbit_word, -1, 0, /* word - a Perl extension */
157 cbit_xdigit,-1, 0 /* xdigit */
8ac170f3
PH
158};
159
160
161/* The texts of compile-time error messages. These are "char *" because they
162are passed to the outside world. */
163
164static const char *error_texts[] = {
165 "no error",
166 "\\ at end of pattern",
167 "\\c at end of pattern",
168 "unrecognized character follows \\",
169 "numbers out of order in {} quantifier",
170 /* 5 */
171 "number too big in {} quantifier",
172 "missing terminating ] for character class",
173 "invalid escape sequence in character class",
174 "range out of order in character class",
175 "nothing to repeat",
176 /* 10 */
177 "operand of unlimited repeat could match the empty string",
178 "internal error: unexpected repeat",
179 "unrecognized character after (?",
180 "POSIX named classes are supported only within a class",
181 "missing )",
182 /* 15 */
183 "reference to non-existent subpattern",
184 "erroffset passed as NULL",
185 "unknown option bit(s) set",
186 "missing ) after comment",
187 "parentheses nested too deeply",
188 /* 20 */
189 "regular expression too large",
190 "failed to get memory",
191 "unmatched parentheses",
192 "internal error: code overflow",
193 "unrecognized character after (?<",
194 /* 25 */
195 "lookbehind assertion is not fixed length",
aa41d2de 196 "malformed number or name after (?(",
8ac170f3
PH
197 "conditional group contains more than two branches",
198 "assertion expected after (?(",
199 "(?R or (?digits must be followed by )",
200 /* 30 */
201 "unknown POSIX class name",
202 "POSIX collating elements are not supported",
203 "this version of PCRE is not compiled with PCRE_UTF8 support",
204 "spare error",
205 "character value in \\x{...} sequence is too large",
206 /* 35 */
207 "invalid condition (?(0)",
208 "\\C not allowed in lookbehind assertion",
209 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
210 "number after (?C is > 255",
211 "closing ) for (?C expected",
212 /* 40 */
213 "recursive call could loop indefinitely",
214 "unrecognized character after (?P",
215 "syntax error after (?P",
aa41d2de 216 "two named subpatterns have the same name",
8ac170f3
PH
217 "invalid UTF-8 string",
218 /* 45 */
219 "support for \\P, \\p, and \\X has not been compiled",
220 "malformed \\P or \\p sequence",
aa41d2de
PH
221 "unknown property name after \\P or \\p",
222 "subpattern name is too long (maximum 32 characters)",
223 "too many named subpatterns (maximum 10,000)",
224 /* 50 */
225 "repeated subpattern is too long",
226 "octal value is greater than \\377 (not in UTF-8 mode)"
8ac170f3
PH
227};
228
229
230/* Table to identify digits and hex digits. This is used when compiling
231patterns. Note that the tables in chartables are dependent on the locale, and
232may mark arbitrary characters as digits - but the PCRE compiling code expects
233to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
234a private table here. It costs 256 bytes, but it is a lot faster than doing
235character value tests (at least in some simple cases I timed), and in some
236applications one wants PCRE to compile efficiently as well as match
237efficiently.
238
239For convenience, we use the same bit definitions as in chartables:
240
241 0x04 decimal digit
242 0x08 hexadecimal digit
243
244Then we can use ctype_digit and ctype_xdigit in the code. */
245
246#if !EBCDIC /* This is the "normal" case, for ASCII systems */
247static const unsigned char digitab[] =
248 {
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
255 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
256 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
257 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
261 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
262 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
281
282#else /* This is the "abnormal" case, for EBCDIC systems */
283static const unsigned char digitab[] =
284 {
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
296