Commit | Line | Data |
---|---|---|
c86f6258 PH |
1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * | |
3 | *************************************************/ | |
4 | ||
5 | /* | |
6 | This is a library of functions to support regular expressions whose syntax | |
7 | and semantics are as close as possible to those of the Perl 5 language. See | |
8 | the file Tech.Notes for some information on the internals. | |
9 | ||
10 | Written by: Philip Hazel <ph10@cam.ac.uk> | |
11 | ||
12 | Copyright (c) 1997-2004 University of Cambridge | |
13 | ||
14 | ----------------------------------------------------------------------------- | |
15 | Redistribution and use in source and binary forms, with or without | |
16 | modification, are permitted provided that the following conditions are met: | |
17 | ||
18 | * Redistributions of source code must retain the above copyright notice, | |
19 | this list of conditions and the following disclaimer. | |
20 | ||
21 | * Redistributions in binary form must reproduce the above copyright | |
22 | notice, this list of conditions and the following disclaimer in the | |
23 | documentation and/or other materials provided with the distribution. | |
24 | ||
25 | * Neither the name of the University of Cambridge nor the names of its | |
26 | contributors may be used to endorse or promote products derived from | |
27 | this software without specific prior written permission. | |
28 | ||
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
39 | POSSIBILITY OF SUCH DAMAGE. | |
40 | ----------------------------------------------------------------------------- | |
41 | */ | |
42 | ||
43 | ||
44 | /* Include the internals header, which itself includes Standard C headers plus | |
45 | the external pcre header. */ | |
46 | ||
47 | #include "internal.h" | |
48 | ||
49 | ||
50 | ||
51 | /************************************************* | |
52 | * Set a bit and maybe its alternate case * | |
53 | *************************************************/ | |
54 | ||
55 | /* Given a character, set its bit in the table, and also the bit for the other | |
56 | version of a letter if we are caseless. | |
57 | ||
58 | Arguments: | |
59 | start_bits points to the bit map | |
60 | c is the character | |
61 | caseless the caseless flag | |
62 | cd the block with char table pointers | |
63 | ||
64 | Returns: nothing | |
65 | */ | |
66 | ||
67 | static void | |
68 | set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) | |
69 | { | |
70 | start_bits[c/8] |= (1 << (c&7)); | |
71 | if (caseless && (cd->ctypes[c] & ctype_letter) != 0) | |
72 | start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); | |
73 | } | |
74 | ||
75 | ||
76 | ||
77 | /************************************************* | |
78 | * Create bitmap of starting chars * | |
79 | *************************************************/ | |
80 | ||
81 | /* This function scans a compiled unanchored expression and attempts to build a | |
82 | bitmap of the set of initial characters. If it can't, it returns FALSE. As time | |
83 | goes by, we may be able to get more clever at doing this. | |
84 | ||
85 | Arguments: | |
86 | code points to an expression | |
87 | start_bits points to a 32-byte table, initialized to 0 | |
88 | caseless the current state of the caseless flag | |
89 | utf8 TRUE if in UTF-8 mode | |
90 | cd the block with char table pointers | |
91 | ||
92 | Returns: TRUE if table built, FALSE otherwise | |
93 | */ | |
94 | ||
95 | static BOOL | |
96 | set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, | |
97 | BOOL utf8, compile_data *cd) | |
98 | { | |
99 | register int c; | |
100 | ||
101 | /* This next statement and the later reference to dummy are here in order to | |
102 | trick the optimizer of the IBM C compiler for OS/2 into generating correct | |
103 | code. Apparently IBM isn't going to fix the problem, and we would rather not | |
104 | disable optimization (in this module it actually makes a big difference, and | |
105 | the pcre module can use all the optimization it can get). */ | |
106 | ||
107 | volatile int dummy; | |
108 | ||
109 | do | |
110 | { | |
111 | const uschar *tcode = code + 1 + LINK_SIZE; | |
112 | BOOL try_next = TRUE; | |
113 | ||
114 | while (try_next) | |
115 | { | |
116 | /* If a branch starts with a bracket or a positive lookahead assertion, | |
117 | recurse to set bits from within them. That's all for this branch. */ | |
118 | ||
119 | if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) | |
120 | { | |
121 | if (!set_start_bits(tcode, start_bits, caseless, utf8, cd)) | |
122 | return FALSE; | |
123 | try_next = FALSE; | |
124 | } | |
125 | ||
126 | else switch(*tcode) | |
127 | { | |
128 | default: | |
129 | return FALSE; | |
130 | ||
131 | /* Skip over callout */ | |
132 | ||
133 | case OP_CALLOUT: | |
134 | tcode += 2 + 2*LINK_SIZE; | |
135 | break; | |
136 | ||
137 | /* Skip over extended extraction bracket number */ | |
138 | ||
139 | case OP_BRANUMBER: | |
140 | tcode += 3; | |
141 | break; | |
142 | ||
143 | /* Skip over lookbehind and negative lookahead assertions */ | |
144 | ||
145 | case OP_ASSERT_NOT: | |
146 | case OP_ASSERTBACK: | |
147 | case OP_ASSERTBACK_NOT: | |
148 | do tcode += GET(tcode, 1); while (*tcode == OP_ALT); | |
149 | tcode += 1+LINK_SIZE; | |
150 | break; | |
151 | ||
152 | /* Skip over an option setting, changing the caseless flag */ | |
153 | ||
154 | case OP_OPT: | |
155 | caseless = (tcode[1] & PCRE_CASELESS) != 0; | |
156 | tcode += 2; | |
157 | break; | |
158 | ||
159 | /* BRAZERO does the bracket, but carries on. */ | |
160 | ||
161 | case OP_BRAZERO: | |
162 | case OP_BRAMINZERO: | |
163 | if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) | |
164 | return FALSE; | |
165 | dummy = 1; | |
166 | do tcode += GET(tcode,1); while (*tcode == OP_ALT); | |
167 | tcode += 1+LINK_SIZE; | |
168 | break; | |
169 | ||
170 | /* Single-char * or ? sets the bit and tries the next item */ | |
171 | ||
172 | case OP_STAR: | |
173 | case OP_MINSTAR: | |
174 | case OP_QUERY: | |
175 | case OP_MINQUERY: | |
176 | set_bit(start_bits, tcode[1], caseless, cd); | |
177 | tcode += 2; | |
178 | #ifdef SUPPORT_UTF8 | |
179 | if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; | |
180 | #endif | |
181 | break; | |
182 | ||
183 | /* Single-char upto sets the bit and tries the next */ | |
184 | ||
185 | case OP_UPTO: | |
186 | case OP_MINUPTO: | |
187 | set_bit(start_bits, tcode[3], caseless, cd); | |
188 | tcode += 4; | |
189 | #ifdef SUPPORT_UTF8 | |
190 | if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; | |
191 | #endif | |
192 | break; | |
193 | ||
194 | /* At least one single char sets the bit and stops */ | |
195 | ||
196 | case OP_EXACT: /* Fall through */ | |
197 | tcode += 2; | |
198 | ||
199 | case OP_CHAR: | |
200 | case OP_CHARNC: | |
201 | case OP_PLUS: | |
202 | case OP_MINPLUS: | |
203 | set_bit(start_bits, tcode[1], caseless, cd); | |
204 | try_next = FALSE; | |
205 | break; | |
206 | ||
207 | /* Single character type sets the bits and stops */ | |
208 | ||
209 | case OP_NOT_DIGIT: | |
210 | for (c = 0; c < 32; c++) | |
211 | start_bits[c] |= ~cd->cbits[c+cbit_digit]; | |
212 | try_next = FALSE; | |
213 | break; | |
214 | ||
215 | case OP_DIGIT: | |
216 | for (c = 0; c < 32; c++) | |
217 | start_bits[c] |= cd->cbits[c+cbit_digit]; | |
218 | try_next = FALSE; | |
219 | break; | |
220 | ||
221 | case OP_NOT_WHITESPACE: | |
222 | for (c = 0; c < 32; c++) | |
223 | start_bits[c] |= ~cd->cbits[c+cbit_space]; | |
224 | try_next = FALSE; | |
225 | break; | |
226 | ||
227 | case OP_WHITESPACE: | |
228 | for (c = 0; c < 32; c++) | |
229 | start_bits[c] |= cd->cbits[c+cbit_space]; | |
230 | try_next = FALSE; | |
231 | break; | |
232 | ||
233 | case OP_NOT_WORDCHAR: | |
234 | for (c = 0; c < 32; c++) | |
235 | start_bits[c] |= ~cd->cbits[c+cbit_word]; | |
236 | try_next = FALSE; | |
237 | break; | |
238 | ||
239 | case OP_WORDCHAR: | |
240 | for (c = 0; c < 32; c++) | |
241 | start_bits[c] |= cd->cbits[c+cbit_word]; | |
242 | try_next = FALSE; | |
243 | break; | |
244 | ||
245 | /* One or more character type fudges the pointer and restarts, knowing | |
246 | it will hit a single character type and stop there. */ | |
247 | ||
248 | case OP_TYPEPLUS: | |
249 | case OP_TYPEMINPLUS: | |
250 | tcode++; | |
251 | break; | |
252 | ||
253 | case OP_TYPEEXACT: | |
254 | tcode += 3; | |
255 | break; | |
256 | ||
257 | /* Zero or more repeats of character types set the bits and then | |
258 | try again. */ | |
259 | ||
260 | case OP_TYPEUPTO: | |
261 | case OP_TYPEMINUPTO: | |
262 | tcode += 2; /* Fall through */ | |
263 | ||
264 | case OP_TYPESTAR: | |
265 | case OP_TYPEMINSTAR: | |
266 | case OP_TYPEQUERY: | |
267 | case OP_TYPEMINQUERY: | |
268 | switch(tcode[1]) | |
269 | { | |
270 | case OP_ANY: | |
271 | return FALSE; | |
272 | ||
273 | case OP_NOT_DIGIT: | |
274 | for (c = 0; c < 32; c++) | |
275 | start_bits[c] |= ~cd->cbits[c+cbit_digit]; | |
276 | break; | |
277 | ||
278 | case OP_DIGIT: | |
279 | for (c = 0; c < 32; c++) | |
280 | start_bits[c] |= cd->cbits[c+cbit_digit]; | |
281 | break; | |
282 | ||
283 | case OP_NOT_WHITESPACE: | |
284 | for (c = 0; c < 32; c++) | |
285 | start_bits[c] |= ~cd->cbits[c+cbit_space]; | |
286 | break; | |
287 | ||
288 | case OP_WHITESPACE: | |
289 | for (c = 0; c < 32; c++) | |
290 | start_bits[c] |= cd->cbits[c+cbit_space]; | |
291 | break; | |
292 | ||
293 | case OP_NOT_WORDCHAR: | |
294 | for (c = 0; c < 32; c++) | |
295 | start_bits[c] |= ~cd->cbits[c+cbit_word]; | |
296 | break; | |
297 | ||
298 | case OP_WORDCHAR: | |
299 | for (c = 0; c < 32; c++) | |
300 | start_bits[c] |= cd->cbits[c+cbit_word]; | |
301 | break; | |
302 | } | |
303 | ||
304 | tcode += 2; | |
305 | break; | |
306 | ||
307 | /* Character class where all the information is in a bit map: set the | |
308 | bits and either carry on or not, according to the repeat count. If it was | |
309 | a negative class, and we are operating with UTF-8 characters, any byte | |
310 | with a value >= 0xc4 is a potentially valid starter because it starts a | |
311 | character with a value > 255. */ | |
312 | ||
313 | case OP_NCLASS: | |
314 | if (utf8) | |
315 | { | |
316 | start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ | |
317 | memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ | |
318 | } | |
319 | /* Fall through */ | |
320 | ||
321 | case OP_CLASS: | |
322 | { | |
323 | tcode++; | |
324 | ||
325 | /* In UTF-8 mode, the bits in a bit map correspond to character | |
326 | values, not to byte values. However, the bit map we are constructing is | |
327 | for byte values. So we have to do a conversion for characters whose | |
328 | value is > 127. In fact, there are only two possible starting bytes for | |
329 | characters in the range 128 - 255. */ | |
330 | ||
331 | if (utf8) | |
332 | { | |
333 | for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; | |
334 | for (c = 128; c < 256; c++) | |
335 | { | |
336 | if ((tcode[c/8] && (1 << (c&7))) != 0) | |
337 | { | |
338 | int d = (c >> 6) | 0xc0; /* Set bit for this starter */ | |
339 | start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ | |
340 | c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ | |
341 | } | |
342 | } | |
343 | } | |
344 | ||
345 | /* In non-UTF-8 mode, the two bit maps are completely compatible. */ | |
346 | ||
347 | else | |
348 | { | |
349 | for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; | |
350 | } | |
351 | ||
352 | /* Advance past the bit map, and act on what follows */ | |
353 | ||
354 | tcode += 32; | |
355 | switch (*tcode) | |
356 | { | |
357 | case OP_CRSTAR: | |
358 | case OP_CRMINSTAR: | |
359 | case OP_CRQUERY: | |
360 | case OP_CRMINQUERY: | |
361 | tcode++; | |
362 | break; | |
363 | ||
364 | case OP_CRRANGE: | |
365 | case OP_CRMINRANGE: | |
366 | if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; | |
367 | else try_next = FALSE; | |
368 | break; | |
369 | ||
370 | default: | |
371 | try_next = FALSE; | |
372 | break; | |
373 | } | |
374 | } | |
375 | break; /* End of bitmap class handling */ | |
376 | ||
377 | } /* End of switch */ | |
378 | } /* End of try_next loop */ | |
379 | ||
380 | code += GET(code, 1); /* Advance to next branch */ | |
381 | } | |
382 | while (*code == OP_ALT); | |
383 | return TRUE; | |
384 | } | |
385 | ||
386 | ||
387 | ||
388 | /************************************************* | |
389 | * Study a compiled expression * | |
390 | *************************************************/ | |
391 | ||
392 | /* This function is handed a compiled expression that it must study to produce | |
393 | information that will speed up the matching. It returns a pcre_extra block | |
394 | which then gets handed back to pcre_exec(). | |
395 | ||
396 | Arguments: | |
397 | re points to the compiled expression | |
398 | options contains option bits | |
399 | errorptr points to where to place error messages; | |
400 | set NULL unless error | |
401 | ||
402 | Returns: pointer to a pcre_extra block, with study_data filled in and the | |
403 | appropriate flag set; | |
404 | NULL on error or if no optimization possible | |
405 | */ | |
406 | ||
407 | EXPORT pcre_extra * | |
408 | pcre_study(const pcre *external_re, int options, const char **errorptr) | |
409 | { | |
410 | uschar start_bits[32]; | |
411 | pcre_extra *extra; | |
412 | pcre_study_data *study; | |
413 | const uschar *tables; | |
414 | const real_pcre *re = (const real_pcre *)external_re; | |
415 | uschar *code = (uschar *)re + re->name_table_offset + | |
416 | (re->name_count * re->name_entry_size); | |
417 | compile_data compile_block; | |
418 | ||
419 | *errorptr = NULL; | |
420 | ||
421 | if (re == NULL || re->magic_number != MAGIC_NUMBER) | |
422 | { | |
423 | *errorptr = "argument is not a compiled regular expression"; | |
424 | return NULL; | |
425 | } | |
426 | ||
427 | if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) | |
428 | { | |
429 | *errorptr = "unknown or incorrect option bit(s) set"; | |
430 | return NULL; | |
431 | } | |
432 | ||
433 | /* For an anchored pattern, or an unanchored pattern that has a first char, or | |
434 | a multiline pattern that matches only at "line starts", no further processing | |
435 | at present. */ | |
436 | ||
437 | if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) | |
438 | return NULL; | |
439 | ||
440 | /* Set the character tables in the block that is passed around */ | |
441 | ||
442 | tables = re->tables; | |
443 | if (tables == NULL) | |
444 | (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, &tables); | |
445 | ||
446 | compile_block.lcc = tables + lcc_offset; | |
447 | compile_block.fcc = tables + fcc_offset; | |
448 | compile_block.cbits = tables + cbits_offset; | |
449 | compile_block.ctypes = tables + ctypes_offset; | |
450 | ||
451 | /* See if we can find a fixed set of initial characters for the pattern. */ | |
452 | ||
453 | memset(start_bits, 0, 32 * sizeof(uschar)); | |
454 | if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, | |
455 | (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; | |
456 | ||
457 | /* Get a pcre_extra block and a pcre_study_data block. The study data is put in | |
458 | the latter, which is pointed to by the former, which may also get additional | |
459 | data set later by the calling program. At the moment, the size of | |
460 | pcre_study_data is fixed. We nevertheless save it in a field for returning via | |
461 | the pcre_fullinfo() function so that if it becomes variable in the future, we | |
462 | don't have to change that code. */ | |
463 | ||
464 | extra = (pcre_extra *)(pcre_malloc) | |
465 | (sizeof(pcre_extra) + sizeof(pcre_study_data)); | |
466 | ||
467 | if (extra == NULL) | |
468 | { | |
469 | *errorptr = "failed to get memory"; | |
470 | return NULL; | |
471 | } | |
472 | ||
473 | study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra)); | |
474 | extra->flags = PCRE_EXTRA_STUDY_DATA; | |
475 | extra->study_data = study; | |
476 | ||
477 | study->size = sizeof(pcre_study_data); | |
478 | study->options = PCRE_STUDY_MAPPED; | |
479 | memcpy(study->start_bits, start_bits, sizeof(start_bits)); | |
480 | ||
481 | return extra; | |
482 | } | |
483 | ||
484 | /* End of study.c */ |