Oops, I introduced a bug in ${prvstest changes, shown up by building (new)
[exim.git] / src / src / pcre / pcre_compile.c
CommitLineData
92e772ff 1/* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */
8ac170f3
PH
2
3/*************************************************
4* Perl-Compatible Regular Expressions *
5*************************************************/
6
7/* PCRE is a library of functions to support regular expressions whose syntax
8and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
11 Copyright (c) 1997-2005 University of Cambridge
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42
43/* This module contains the external function pcre_compile(), along with
44supporting internal functions that are not used by other modules. */
45
46
47#include "pcre_internal.h"
48
49
50/*************************************************
51* Code parameters and static tables *
52*************************************************/
53
54/* Maximum number of items on the nested bracket stacks at compile time. This
55applies to the nesting of all kinds of parentheses. It does not limit
56un-nested, non-capturing parentheses. This number can be made bigger if
57necessary - it is used to dimension one int and one unsigned char vector at
58compile time. */
59
60#define BRASTACK_SIZE 200
61
62
63/* Table for handling escaped characters in the range '0'-'z'. Positive returns
64are simple data values; negative values are for special things like \d and so
65on. Zero means further processing is needed (for things like \x), or the escape
66is invalid. */
67
68#if !EBCDIC /* This is the "normal" table for ASCII systems */
69static const short int escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
78-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
79 0, 0, -ESC_z /* x - z */
80};
81
82#else /* This is the "abnormal" table for EBCDIC systems */
83static const short int escapes[] = {
84/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
85/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
86/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
87/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
88/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
89/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
90/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
91/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
92/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
93/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
94/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
95/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
96/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
97/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
98/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
99/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
100/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
101/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
102/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
103/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
104/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
105/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
106/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
107};
108#endif
109
110
111/* Tables of names of POSIX character classes and their lengths. The list is
112terminated by a zero length entry. The first three must be alpha, upper, lower,
113as this is assumed for handling case independence. */
114
115static const char *const posix_names[] = {
116 "alpha", "lower", "upper",
117 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
118 "print", "punct", "space", "word", "xdigit" };
119
120static const uschar posix_name_lengths[] = {
121 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
122
123/* Table of class bit maps for each POSIX class; up to three may be combined
124to form the class. The table for [:blank:] is dynamically modified to remove
125the vertical space characters. */
126
127static const int posix_class_maps[] = {
128 cbit_lower, cbit_upper, -1, /* alpha */
129 cbit_lower, -1, -1, /* lower */
130 cbit_upper, -1, -1, /* upper */
131 cbit_digit, cbit_lower, cbit_upper, /* alnum */
132 cbit_print, cbit_cntrl, -1, /* ascii */
133 cbit_space, -1, -1, /* blank - a GNU extension */
134 cbit_cntrl, -1, -1, /* cntrl */
135 cbit_digit, -1, -1, /* digit */
136 cbit_graph, -1, -1, /* graph */
137 cbit_print, -1, -1, /* print */
138 cbit_punct, -1, -1, /* punct */
139 cbit_space, -1, -1, /* space */
140 cbit_word, -1, -1, /* word - a Perl extension */
141 cbit_xdigit,-1, -1 /* xdigit */
142};
143
144
145/* The texts of compile-time error messages. These are "char *" because they
146are passed to the outside world. */
147
148static const char *error_texts[] = {
149 "no error",
150 "\\ at end of pattern",
151 "\\c at end of pattern",
152 "unrecognized character follows \\",
153 "numbers out of order in {} quantifier",
154 /* 5 */
155 "number too big in {} quantifier",
156 "missing terminating ] for character class",
157 "invalid escape sequence in character class",
158 "range out of order in character class",
159 "nothing to repeat",
160 /* 10 */
161 "operand of unlimited repeat could match the empty string",
162 "internal error: unexpected repeat",
163 "unrecognized character after (?",
164 "POSIX named classes are supported only within a class",
165 "missing )",
166 /* 15 */
167 "reference to non-existent subpattern",
168 "erroffset passed as NULL",
169 "unknown option bit(s) set",
170 "missing ) after comment",
171 "parentheses nested too deeply",
172 /* 20 */
173 "regular expression too large",
174 "failed to get memory",
175 "unmatched parentheses",
176 "internal error: code overflow",
177 "unrecognized character after (?<",
178 /* 25 */
179 "lookbehind assertion is not fixed length",
180 "malformed number after (?(",
181 "conditional group contains more than two branches",
182 "assertion expected after (?(",
183 "(?R or (?digits must be followed by )",
184 /* 30 */
185 "unknown POSIX class name",
186 "POSIX collating elements are not supported",
187 "this version of PCRE is not compiled with PCRE_UTF8 support",
188 "spare error",
189 "character value in \\x{...} sequence is too large",
190 /* 35 */
191 "invalid condition (?(0)",
192 "\\C not allowed in lookbehind assertion",
193 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
194 "number after (?C is > 255",
195 "closing ) for (?C expected",
196 /* 40 */
197 "recursive call could loop indefinitely",
198 "unrecognized character after (?P",
199 "syntax error after (?P",
200 "two named groups have the same name",
201 "invalid UTF-8 string",
202 /* 45 */
203 "support for \\P, \\p, and \\X has not been compiled",
204 "malformed \\P or \\p sequence",
205 "unknown property name after \\P or \\p"
206};
207
208
209/* Table to identify digits and hex digits. This is used when compiling
210patterns. Note that the tables in chartables are dependent on the locale, and
211may mark arbitrary characters as digits - but the PCRE compiling code expects
212to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
213a private table here. It costs 256 bytes, but it is a lot faster than doing
214character value tests (at least in some simple cases I timed), and in some
215applications one wants PCRE to compile efficiently as well as match
216efficiently.
217
218For convenience, we use the same bit definitions as in chartables:
219
220 0x04 decimal digit
221 0x08 hexadecimal digit
222
223Then we can use ctype_digit and ctype_xdigit in the code. */
224
225#if !EBCDIC /* This is the "normal" case, for ASCII systems */
226static const unsigned char digitab[] =
227 {
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
234 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
235 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
236 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
237 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
240 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
260
261#else /* This is the "abnormal" case, for EBCDIC systems */
262static const unsigned char digitab[] =
263 {
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
275