Commit | Line | Data |
---|---|---|
8ac170f3 PH |
1 | /* $Cambridge: exim/src/src/pcre/pcre.c,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */ |
2 | ||
c86f6258 PH |
3 | /************************************************* |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | /* | |
8 | This is a library of functions to support regular expressions whose syntax | |
9 | and semantics are as close as possible to those of the Perl 5 language. See | |
10 | the file Tech.Notes for some information on the internals. | |
11 | ||
12 | Written by: Philip Hazel <ph10@cam.ac.uk> | |
13 | ||
14 | Copyright (c) 1997-2004 University of Cambridge | |
15 | ||
16 | ----------------------------------------------------------------------------- | |
17 | Redistribution and use in source and binary forms, with or without | |
18 | modification, are permitted provided that the following conditions are met: | |
19 | ||
20 | * Redistributions of source code must retain the above copyright notice, | |
21 | this list of conditions and the following disclaimer. | |
22 | ||
23 | * Redistributions in binary form must reproduce the above copyright | |
24 | notice, this list of conditions and the following disclaimer in the | |
25 | documentation and/or other materials provided with the distribution. | |
26 | ||
27 | * Neither the name of the University of Cambridge nor the names of its | |
28 | contributors may be used to endorse or promote products derived from | |
29 | this software without specific prior written permission. | |
30 | ||
31 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
32 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
33 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
34 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
35 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
36 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
37 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
38 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
39 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
40 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
41 | POSSIBILITY OF SUCH DAMAGE. | |
42 | ----------------------------------------------------------------------------- | |
43 | */ | |
44 | ||
45 | ||
46 | /* Define DEBUG to get debugging output on stdout. */ | |
47 | /* #define DEBUG */ | |
48 | ||
49 | /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef | |
50 | inline, and there are *still* stupid compilers about that don't like indented | |
51 | pre-processor statements. I suppose it's only been 10 years... */ | |
52 | ||
53 | #ifdef DEBUG | |
54 | #define DPRINTF(p) printf p | |
55 | #else | |
56 | #define DPRINTF(p) /*nothing*/ | |
57 | #endif | |
58 | ||
59 | /* Include the internals header, which itself includes "config.h", the Standard | |
60 | C headers, and the external pcre header. */ | |
61 | ||
62 | #include "internal.h" | |
63 | ||
64 | /* If Unicode Property support is wanted, include a private copy of the | |
65 | function that does it, and the table that translates names to numbers. */ | |
66 | ||
67 | #ifdef SUPPORT_UCP | |
68 | #include "ucp.c" | |
69 | #include "ucptypetable.c" | |
70 | #endif | |
71 | ||
72 | /* Maximum number of items on the nested bracket stacks at compile time. This | |
73 | applies to the nesting of all kinds of parentheses. It does not limit | |
74 | un-nested, non-capturing parentheses. This number can be made bigger if | |
75 | necessary - it is used to dimension one int and one unsigned char vector at | |
76 | compile time. */ | |
77 | ||
78 | #define BRASTACK_SIZE 200 | |
79 | ||
80 | ||
81 | /* Maximum number of ints of offset to save on the stack for recursive calls. | |
82 | If the offset vector is bigger, malloc is used. This should be a multiple of 3, | |
83 | because the offset vector is always a multiple of 3 long. */ | |
84 | ||
85 | #define REC_STACK_SAVE_MAX 30 | |
86 | ||
87 | ||
88 | /* The maximum remaining length of subject we are prepared to search for a | |
89 | req_byte match. */ | |
90 | ||
91 | #define REQ_BYTE_MAX 1000 | |
92 | ||
93 | ||
94 | /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that | |
95 | the definition is next to the definition of the opcodes in internal.h. */ | |
96 | ||
97 | static const uschar OP_lengths[] = { OP_LENGTHS }; | |
98 | ||
99 | /* Min and max values for the common repeats; for the maxima, 0 => infinity */ | |
100 | ||
101 | static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; | |
102 | static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; | |
103 | ||
104 | /* Table for handling escaped characters in the range '0'-'z'. Positive returns | |
105 | are simple data values; negative values are for special things like \d and so | |
106 | on. Zero means further processing is needed (for things like \x), or the escape | |
107 | is invalid. */ | |
108 | ||
109 | #if !EBCDIC /* This is the "normal" table for ASCII systems */ | |
110 | static const short int escapes[] = { | |
111 | 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ | |
112 | 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ | |
113 | '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ | |
114 | 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ | |
115 | -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ | |
116 | -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ | |
117 | '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ | |
118 | 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */ | |
119 | -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ | |
120 | 0, 0, -ESC_z /* x - z */ | |
121 | }; | |
122 | ||
123 | #else /* This is the "abnormal" table for EBCDIC systems */ | |
124 | static const short int escapes[] = { | |
125 | /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', | |
126 | /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, | |
127 | /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', | |
128 | /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, | |
129 | /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', | |
130 | /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
131 | /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', | |
132 | /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, | |
133 | /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, | |
134 | /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p, | |
135 | /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, | |
136 | /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, | |
137 | /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, | |
138 | /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
139 | /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', | |
140 | /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, | |
141 | /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
142 | /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, | |
143 | /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0, | |
144 | /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, | |
145 | /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, | |
146 | /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, | |
147 | /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 | |
148 | }; | |
149 | #endif | |
150 | ||
151 | ||
152 | /* Tables of names of POSIX character classes and their lengths. The list is | |
153 | terminated by a zero length entry. The first three must be alpha, upper, lower, | |
154 | as this is assumed for handling case independence. */ | |
155 | ||
156 | static const char *const posix_names[] = { | |
157 | "alpha", "lower", "upper", | |
158 | "alnum", "ascii", "blank", "cntrl", "digit", "graph", | |
159 | "print", "punct", "space", "word", "xdigit" }; | |
160 | ||
161 | static const uschar posix_name_lengths[] = { | |
162 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; | |
163 | ||
164 | /* Table of class bit maps for each POSIX class; up to three may be combined | |
165 | to form the class. The table for [:blank:] is dynamically modified to remove | |
166 | the vertical space characters. */ | |
167 | ||
168 | static const int posix_class_maps[] = { | |
169 | cbit_lower, cbit_upper, -1, /* alpha */ | |
170 | cbit_lower, -1, -1, /* lower */ | |
171 | cbit_upper, -1, -1, /* upper */ | |
172 | cbit_digit, cbit_lower, cbit_upper, /* alnum */ | |
173 | cbit_print, cbit_cntrl, -1, /* ascii */ | |
174 | cbit_space, -1, -1, /* blank - a GNU extension */ | |
175 | cbit_cntrl, -1, -1, /* cntrl */ | |
176 | cbit_digit, -1, -1, /* digit */ | |
177 | cbit_graph, -1, -1, /* graph */ | |
178 | cbit_print, -1, -1, /* print */ | |
179 | cbit_punct, -1, -1, /* punct */ | |
180 | cbit_space, -1, -1, /* space */ | |
181 | cbit_word, -1, -1, /* word - a Perl extension */ | |
182 | cbit_xdigit,-1, -1 /* xdigit */ | |
183 | }; | |
184 | ||
185 | /* Table to identify digits and hex digits. This is used when compiling | |
186 | patterns. Note that the tables in chartables are dependent on the locale, and | |
187 | may mark arbitrary characters as digits - but the PCRE compiling code expects | |
188 | to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have | |
189 | a private table here. It costs 256 bytes, but it is a lot faster than doing | |
190 | character value tests (at least in some simple cases I timed), and in some | |
191 | applications one wants PCRE to compile efficiently as well as match | |
192 | efficiently. | |
193 | ||
194 | For convenience, we use the same bit definitions as in chartables: | |
195 | ||
196 | 0x04 decimal digit | |
197 | 0x08 hexadecimal digit | |
198 | ||
199 | Then we can use ctype_digit and ctype_xdigit in the code. */ | |
200 | ||
201 | #if !EBCDIC /* This is the "normal" case, for ASCII systems */ | |
202 | static const unsigned char digitab[] = | |
203 | { | |
204 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ | |
205 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
206 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ | |
207 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
208 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ | |
209 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ | |
210 | 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ | |
211 | 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ | |
212 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ | |
213 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ | |
214 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ | |
215 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ | |
216 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ | |
217 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ | |
218 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ | |
219 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ | |
220 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ | |
221 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ | |
222 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ | |
223 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ | |
224 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ | |
225 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ | |
226 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ | |
227 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ | |
228 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ | |
229 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ | |
230 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ | |
231 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ | |
232 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ | |
233 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ | |
234 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ | |
235 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ | |
236 | ||
237 | #else /* This is the "abnormal" case, for EBCDIC systems */ | |
238 | static const unsigned char digitab[] = | |
239 | { | |
240 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ | |
241 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ | |
242 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ | |
243 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ | |
244 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ | |
245 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ | |
246 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ | |
247 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ | |
248 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ | |
249 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ | |
250 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ | |
251 |