Add sender to log line for DATA timeout.
[exim.git] / src / src / pcre / pcre.c
CommitLineData
c86f6258
PH
1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/*
6This is a library of functions to support regular expressions whose syntax
7and semantics are as close as possible to those of the Perl 5 language. See
8the file Tech.Notes for some information on the internals.
9
10Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2004 University of Cambridge
13
14-----------------------------------------------------------------------------
15Redistribution and use in source and binary forms, with or without
16modification, are permitted provided that the following conditions are met:
17
18 * Redistributions of source code must retain the above copyright notice,
19 this list of conditions and the following disclaimer.
20
21 * Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
24
25 * Neither the name of the University of Cambridge nor the names of its
26 contributors may be used to endorse or promote products derived from
27 this software without specific prior written permission.
28
29THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39POSSIBILITY OF SUCH DAMAGE.
40-----------------------------------------------------------------------------
41*/
42
43
44/* Define DEBUG to get debugging output on stdout. */
45/* #define DEBUG */
46
47/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
48inline, and there are *still* stupid compilers about that don't like indented
49pre-processor statements. I suppose it's only been 10 years... */
50
51#ifdef DEBUG
52#define DPRINTF(p) printf p
53#else
54#define DPRINTF(p) /*nothing*/
55#endif
56
57/* Include the internals header, which itself includes "config.h", the Standard
58C headers, and the external pcre header. */
59
60#include "internal.h"
61
62/* If Unicode Property support is wanted, include a private copy of the
63function that does it, and the table that translates names to numbers. */
64
65#ifdef SUPPORT_UCP
66#include "ucp.c"
67#include "ucptypetable.c"
68#endif
69
70/* Maximum number of items on the nested bracket stacks at compile time. This
71applies to the nesting of all kinds of parentheses. It does not limit
72un-nested, non-capturing parentheses. This number can be made bigger if
73necessary - it is used to dimension one int and one unsigned char vector at
74compile time. */
75
76#define BRASTACK_SIZE 200
77
78
79/* Maximum number of ints of offset to save on the stack for recursive calls.
80If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81because the offset vector is always a multiple of 3 long. */
82
83#define REC_STACK_SAVE_MAX 30
84
85
86/* The maximum remaining length of subject we are prepared to search for a
87req_byte match. */
88
89#define REQ_BYTE_MAX 1000
90
91
92/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93the definition is next to the definition of the opcodes in internal.h. */
94
95static const uschar OP_lengths[] = { OP_LENGTHS };
96
97/* Min and max values for the common repeats; for the maxima, 0 => infinity */
98
99static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101
102/* Table for handling escaped characters in the range '0'-'z'. Positive returns
103are simple data values; negative values are for special things like \d and so
104on. Zero means further processing is needed (for things like \x), or the escape
105is invalid. */
106
107#if !EBCDIC /* This is the "normal" table for ASCII systems */
108static const short int escapes[] = {
109 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
110 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
111 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
112 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
113-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
114-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
115 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
116 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
117-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
118 0, 0, -ESC_z /* x - z */
119};
120
121#else /* This is the "abnormal" table for EBCDIC systems */
122static const short int escapes[] = {
123/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
124/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
125/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
126/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
127/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
128/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
129/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
130/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
131/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
132/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
133/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
134/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
135/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
136/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
137/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
138/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
139/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
140/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
141/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
142/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
143/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
144/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
145/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
146};
147#endif
148
149
150/* Tables of names of POSIX character classes and their lengths. The list is
151terminated by a zero length entry. The first three must be alpha, upper, lower,
152as this is assumed for handling case independence. */
153
154static const char *const posix_names[] = {
155 "alpha", "lower", "upper",
156 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
157 "print", "punct", "space", "word", "xdigit" };
158
159static const uschar posix_name_lengths[] = {
160 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
161
162/* Table of class bit maps for each POSIX class; up to three may be combined
163to form the class. The table for [:blank:] is dynamically modified to remove
164the vertical space characters. */
165
166static const int posix_class_maps[] = {
167 cbit_lower, cbit_upper, -1, /* alpha */
168 cbit_lower, -1, -1, /* lower */
169 cbit_upper, -1, -1, /* upper */
170 cbit_digit, cbit_lower, cbit_upper, /* alnum */
171 cbit_print, cbit_cntrl, -1, /* ascii */
172 cbit_space, -1, -1, /* blank - a GNU extension */
173 cbit_cntrl, -1, -1, /* cntrl */
174 cbit_digit, -1, -1, /* digit */
175 cbit_graph, -1, -1, /* graph */
176 cbit_print, -1, -1, /* print */
177 cbit_punct, -1, -1, /* punct */
178 cbit_space, -1, -1, /* space */
179 cbit_word, -1, -1, /* word - a Perl extension */
180 cbit_xdigit,-1, -1 /* xdigit */
181};
182
183/* Table to identify digits and hex digits. This is used when compiling
184patterns. Note that the tables in chartables are dependent on the locale, and
185may mark arbitrary characters as digits - but the PCRE compiling code expects
186to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
187a private table here. It costs 256 bytes, but it is a lot faster than doing
188character value tests (at least in some simple cases I timed), and in some
189applications one wants PCRE to compile efficiently as well as match
190efficiently.
191
192For convenience, we use the same bit definitions as in chartables:
193
194 0x04 decimal digit
195 0x08 hexadecimal digit
196
197Then we can use ctype_digit and ctype_xdigit in the code. */
198
199#if !EBCDIC /* This is the "normal" case, for ASCII systems */
200static const unsigned char digitab[] =
201 {
202 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
203 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
204 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
205 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
206 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
207 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
208 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
209 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
210 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
211 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
212 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
213 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
214 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
215 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
216 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
217 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
218 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
219 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
220 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
221 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
222 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
223 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
224 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
225 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
234
235#else /* This is the "abnormal" case, for EBCDIC systems */
236static const unsigned char digitab[] =
237 {
238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
249