Call initgroups() when dropping privilege, in order that Exim runs with
[exim.git] / src / src / pcre / pcre.c
CommitLineData
8ac170f3
PH
1/* $Cambridge: exim/src/src/pcre/pcre.c,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */
2
c86f6258
PH
3/*************************************************
4* Perl-Compatible Regular Expressions *
5*************************************************/
6
7/*
8This is a library of functions to support regular expressions whose syntax
9and semantics are as close as possible to those of the Perl 5 language. See
10the file Tech.Notes for some information on the internals.
11
12Written by: Philip Hazel <ph10@cam.ac.uk>
13
14 Copyright (c) 1997-2004 University of Cambridge
15
16-----------------------------------------------------------------------------
17Redistribution and use in source and binary forms, with or without
18modification, are permitted provided that the following conditions are met:
19
20 * Redistributions of source code must retain the above copyright notice,
21 this list of conditions and the following disclaimer.
22
23 * Redistributions in binary form must reproduce the above copyright
24 notice, this list of conditions and the following disclaimer in the
25 documentation and/or other materials provided with the distribution.
26
27 * Neither the name of the University of Cambridge nor the names of its
28 contributors may be used to endorse or promote products derived from
29 this software without specific prior written permission.
30
31THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
32AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
35LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
36CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
37SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
38INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
39CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
41POSSIBILITY OF SUCH DAMAGE.
42-----------------------------------------------------------------------------
43*/
44
45
46/* Define DEBUG to get debugging output on stdout. */
47/* #define DEBUG */
48
49/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
50inline, and there are *still* stupid compilers about that don't like indented
51pre-processor statements. I suppose it's only been 10 years... */
52
53#ifdef DEBUG
54#define DPRINTF(p) printf p
55#else
56#define DPRINTF(p) /*nothing*/
57#endif
58
59/* Include the internals header, which itself includes "config.h", the Standard
60C headers, and the external pcre header. */
61
62#include "internal.h"
63
64/* If Unicode Property support is wanted, include a private copy of the
65function that does it, and the table that translates names to numbers. */
66
67#ifdef SUPPORT_UCP
68#include "ucp.c"
69#include "ucptypetable.c"
70#endif
71
72/* Maximum number of items on the nested bracket stacks at compile time. This
73applies to the nesting of all kinds of parentheses. It does not limit
74un-nested, non-capturing parentheses. This number can be made bigger if
75necessary - it is used to dimension one int and one unsigned char vector at
76compile time. */
77
78#define BRASTACK_SIZE 200
79
80
81/* Maximum number of ints of offset to save on the stack for recursive calls.
82If the offset vector is bigger, malloc is used. This should be a multiple of 3,
83because the offset vector is always a multiple of 3 long. */
84
85#define REC_STACK_SAVE_MAX 30
86
87
88/* The maximum remaining length of subject we are prepared to search for a
89req_byte match. */
90
91#define REQ_BYTE_MAX 1000
92
93
94/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
95the definition is next to the definition of the opcodes in internal.h. */
96
97static const uschar OP_lengths[] = { OP_LENGTHS };
98
99/* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
101static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104/* Table for handling escaped characters in the range '0'-'z'. Positive returns
105are simple data values; negative values are for special things like \d and so
106on. Zero means further processing is needed (for things like \x), or the escape
107is invalid. */
108
109#if !EBCDIC /* This is the "normal" table for ASCII systems */
110static const short int escapes[] = {
111 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
112 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
113 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
114 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
115-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
116-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
117 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
118 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
119-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
120 0, 0, -ESC_z /* x - z */
121};
122
123#else /* This is the "abnormal" table for EBCDIC systems */
124static const short int escapes[] = {
125/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
126/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
127/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
128/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
129/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
130/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
131/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
132/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
133/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
134/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
135/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
136/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
137/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
138/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
139/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
140/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
141/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
142/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
143/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
144/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
145/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
146/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
147/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
148};
149#endif
150
151
152/* Tables of names of POSIX character classes and their lengths. The list is
153terminated by a zero length entry. The first three must be alpha, upper, lower,
154as this is assumed for handling case independence. */
155
156static const char *const posix_names[] = {
157 "alpha", "lower", "upper",
158 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
159 "print", "punct", "space", "word", "xdigit" };
160
161static const uschar posix_name_lengths[] = {
162 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
163
164/* Table of class bit maps for each POSIX class; up to three may be combined
165to form the class. The table for [:blank:] is dynamically modified to remove
166the vertical space characters. */
167
168static const int posix_class_maps[] = {
169 cbit_lower, cbit_upper, -1, /* alpha */
170 cbit_lower, -1, -1, /* lower */
171 cbit_upper, -1, -1, /* upper */
172 cbit_digit, cbit_lower, cbit_upper, /* alnum */
173 cbit_print, cbit_cntrl, -1, /* ascii */
174 cbit_space, -1, -1, /* blank - a GNU extension */
175 cbit_cntrl, -1, -1, /* cntrl */
176 cbit_digit, -1, -1, /* digit */
177 cbit_graph, -1, -1, /* graph */
178 cbit_print, -1, -1, /* print */
179 cbit_punct, -1, -1, /* punct */
180 cbit_space, -1, -1, /* space */
181 cbit_word, -1, -1, /* word - a Perl extension */
182 cbit_xdigit,-1, -1 /* xdigit */
183};
184
185/* Table to identify digits and hex digits. This is used when compiling
186patterns. Note that the tables in chartables are dependent on the locale, and
187may mark arbitrary characters as digits - but the PCRE compiling code expects
188to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
189a private table here. It costs 256 bytes, but it is a lot faster than doing
190character value tests (at least in some simple cases I timed), and in some
191applications one wants PCRE to compile efficiently as well as match
192efficiently.
193
194For convenience, we use the same bit definitions as in chartables:
195
196 0x04 decimal digit
197 0x08 hexadecimal digit
198
199Then we can use ctype_digit and ctype_xdigit in the code. */
200
201#if !EBCDIC /* This is the "normal" case, for ASCII systems */
202static const unsigned char digitab[] =
203 {
204 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
205 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
206 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
207 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
208 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
209 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
210 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
211 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
212 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
213 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
214 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
215 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
216 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
217 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
218 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
219 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
220 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
221 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
222 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
223 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
224 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
225 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
234 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
235 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
236
237#else /* This is the "abnormal" case, for EBCDIC systems */
238static const unsigned char digitab[] =
239 {
240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
251