Commit | Line | Data |
---|---|---|
8ac170f3 PH |
1 | /* $Cambridge: exim/src/src/pcre/study.c,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */ |
2 | ||
c86f6258 PH |
3 | /************************************************* |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | /* | |
8 | This is a library of functions to support regular expressions whose syntax | |
9 | and semantics are as close as possible to those of the Perl 5 language. See | |
10 | the file Tech.Notes for some information on the internals. | |
11 | ||
12 | Written by: Philip Hazel <ph10@cam.ac.uk> | |
13 | ||
14 | Copyright (c) 1997-2004 University of Cambridge | |
15 | ||
16 | ----------------------------------------------------------------------------- | |
17 | Redistribution and use in source and binary forms, with or without | |
18 | modification, are permitted provided that the following conditions are met: | |
19 | ||
20 | * Redistributions of source code must retain the above copyright notice, | |
21 | this list of conditions and the following disclaimer. | |
22 | ||
23 | * Redistributions in binary form must reproduce the above copyright | |
24 | notice, this list of conditions and the following disclaimer in the | |
25 | documentation and/or other materials provided with the distribution. | |
26 | ||
27 | * Neither the name of the University of Cambridge nor the names of its | |
28 | contributors may be used to endorse or promote products derived from | |
29 | this software without specific prior written permission. | |
30 | ||
31 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
32 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
33 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
34 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
35 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
36 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
37 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
38 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
39 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
40 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
41 | POSSIBILITY OF SUCH DAMAGE. | |
42 | ----------------------------------------------------------------------------- | |
43 | */ | |
44 | ||
45 | ||
46 | /* Include the internals header, which itself includes Standard C headers plus | |
47 | the external pcre header. */ | |
48 | ||
49 | #include "internal.h" | |
50 | ||
51 | ||
52 | ||
53 | /************************************************* | |
54 | * Set a bit and maybe its alternate case * | |
55 | *************************************************/ | |
56 | ||
57 | /* Given a character, set its bit in the table, and also the bit for the other | |
58 | version of a letter if we are caseless. | |
59 | ||
60 | Arguments: | |
61 | start_bits points to the bit map | |
62 | c is the character | |
63 | caseless the caseless flag | |
64 | cd the block with char table pointers | |
65 | ||
66 | Returns: nothing | |
67 | */ | |
68 | ||
69 | static void | |
70 | set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) | |
71 | { | |
72 | start_bits[c/8] |= (1 << (c&7)); | |
73 | if (caseless && (cd->ctypes[c] & ctype_letter) != 0) | |
74 | start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); | |
75 | } | |
76 | ||
77 | ||
78 | ||
79 | /************************************************* | |
80 | * Create bitmap of starting chars * | |
81 | *************************************************/ | |
82 | ||
83 | /* This function scans a compiled unanchored expression and attempts to build a | |
84 | bitmap of the set of initial characters. If it can't, it returns FALSE. As time | |
85 | goes by, we may be able to get more clever at doing this. | |
86 | ||
87 | Arguments: | |
88 | code points to an expression | |
89 | start_bits points to a 32-byte table, initialized to 0 | |
90 | caseless the current state of the caseless flag | |
91 | utf8 TRUE if in UTF-8 mode | |
92 | cd the block with char table pointers | |
93 | ||
94 | Returns: TRUE if table built, FALSE otherwise | |
95 | */ | |
96 | ||
97 | static BOOL | |
98 | set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, | |
99 | BOOL utf8, compile_data *cd) | |
100 | { | |
101 | register int c; | |
102 | ||
103 | /* This next statement and the later reference to dummy are here in order to | |
104 | trick the optimizer of the IBM C compiler for OS/2 into generating correct | |
105 | code. Apparently IBM isn't going to fix the problem, and we would rather not | |
106 | disable optimization (in this module it actually makes a big difference, and | |
107 | the pcre module can use all the optimization it can get). */ | |
108 | ||
109 | volatile int dummy; | |
110 | ||
111 | do | |
112 | { | |
113 | const uschar *tcode = code + 1 + LINK_SIZE; | |
114 | BOOL try_next = TRUE; | |
115 | ||
116 | while (try_next) | |
117 | { | |
118 | /* If a branch starts with a bracket or a positive lookahead assertion, | |
119 | recurse to set bits from within them. That's all for this branch. */ | |
120 | ||
121 | if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) | |
122 | { | |
123 | if (!set_start_bits(tcode, start_bits, caseless, utf8, cd)) | |
124 | return FALSE; | |
125 | try_next = FALSE; | |
126 | } | |
127 | ||
128 | else switch(*tcode) | |
129 | { | |
130 | default: | |
131 | return FALSE; | |
132 | ||
133 | /* Skip over callout */ | |
134 | ||
135 | case OP_CALLOUT: | |
136 | tcode += 2 + 2*LINK_SIZE; | |
137 | break; | |
138 | ||
139 | /* Skip over extended extraction bracket number */ | |
140 | ||
141 | case OP_BRANUMBER: | |
142 | tcode += 3; | |
143 | break; | |
144 | ||
145 | /* Skip over lookbehind and negative lookahead assertions */ | |
146 | ||
147 | case OP_ASSERT_NOT: | |
148 | case OP_ASSERTBACK: | |
149 | case OP_ASSERTBACK_NOT: | |
150 | do tcode += GET(tcode, 1); while (*tcode == OP_ALT); | |
151 | tcode += 1+LINK_SIZE; | |
152 | break; | |
153 | ||
154 | /* Skip over an option setting, changing the caseless flag */ | |
155 | ||
156 | case OP_OPT: | |
157 | caseless = (tcode[1] & PCRE_CASELESS) != 0; | |
158 | tcode += 2; | |
159 | break; | |
160 | ||
161 | /* BRAZERO does the bracket, but carries on. */ | |
162 | ||
163 | case OP_BRAZERO: | |
164 | case OP_BRAMINZERO: | |
165 | if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) | |
166 | return FALSE; | |
167 | dummy = 1; | |
168 | do tcode += GET(tcode,1); while (*tcode == OP_ALT); | |
169 | tcode += 1+LINK_SIZE; | |
170 | break; | |
171 | ||
172 | /* Single-char * or ? sets the bit and tries the next item */ | |
173 | ||
174 | case OP_STAR: | |
175 | case OP_MINSTAR: | |
176 | case OP_QUERY: | |
177 | case OP_MINQUERY: | |
178 | set_bit(start_bits, tcode[1], caseless, cd); | |
179 | tcode += 2; | |
180 | #ifdef SUPPORT_UTF8 | |
181 | if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; | |
182 | #endif | |
183 | break; | |
184 | ||
185 | /* Single-char upto sets the bit and tries the next */ | |
186 | ||
187 | case OP_UPTO: | |
188 | case OP_MINUPTO: | |
189 | set_bit(start_bits, tcode[3], caseless, cd); | |
190 | tcode += 4; | |
191 | #ifdef SUPPORT_UTF8 | |
192 | if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; | |
193 | #endif | |
194 | break; | |
195 | ||
196 | /* At least one single char sets the bit and stops */ | |
197 | ||
198 | case OP_EXACT: /* Fall through */ | |
199 | tcode += 2; | |
200 | ||
201 | case OP_CHAR: | |
202 | case OP_CHARNC: | |
203 | case OP_PLUS: | |
204 | case OP_MINPLUS: | |
205 | set_bit(start_bits, tcode[1], caseless, cd); | |
206 | try_next = FALSE; | |
207 | break; | |
208 | ||
209 | /* Single character type sets the bits and stops */ | |
210 | ||
211 | case OP_NOT_DIGIT: | |
212 | for (c = 0; c < 32; c++) | |
213 | start_bits[c] |= ~cd->cbits[c+cbit_digit]; | |
214 | try_next = FALSE; | |
215 | break; | |
216 | ||
217 | case OP_DIGIT: | |
218 | for (c = 0; c < 32; c++) | |
219 | start_bits[c] |= cd->cbits[c+cbit_digit]; | |
220 | try_next = FALSE; | |
221 | break; | |
222 | ||
223 | case OP_NOT_WHITESPACE: | |
224 | for (c = 0; c < 32; c++) | |
225 | start_bits[c] |= ~cd->cbits[c+cbit_space]; | |
226 | try_next = FALSE; | |
227 | break; | |
228 | ||
229 | case OP_WHITESPACE: | |
230 | for (c = 0; c < 32; c++) | |
231 | start_bits[c] |= cd->cbits[c+cbit_space]; | |
232 | try_next = FALSE; | |
233 | break; | |
234 | ||
235 | case OP_NOT_WORDCHAR: | |
236 | for (c = 0; c < 32; c++) | |
237 | start_bits[c] |= ~cd->cbits[c+cbit_word]; | |
238 | try_next = FALSE; | |
239 | break; | |
240 | ||
241 | case OP_WORDCHAR: | |
242 | for (c = 0; c < 32; c++) | |
243 | start_bits[c] |= cd->cbits[c+cbit_word]; | |
244 | try_next = FALSE; | |
245 | break; | |
246 | ||
247 | /* One or more character type fudges the pointer and restarts, knowing | |
248 | it will hit a single character type and stop there. */ | |
249 | ||
250 | case OP_TYPEPLUS: | |
251 | case OP_TYPEMINPLUS: | |
252 | tcode++; | |
253 | break; | |
254 | ||
255 | case OP_TYPEEXACT: | |
256 | tcode += 3; | |
257 | break; | |
258 | ||
259 | /* Zero or more repeats of character types set the bits and then | |
260 | try again. */ | |
261 | ||
262 | case OP_TYPEUPTO: | |
263 | case OP_TYPEMINUPTO: | |
264 | tcode += 2; /* Fall through */ | |
265 | ||
266 | case OP_TYPESTAR: | |
267 | case OP_TYPEMINSTAR: | |
268 | case OP_TYPEQUERY: | |
269 | case OP_TYPEMINQUERY: | |
270 | switch(tcode[1]) | |
271 | { | |
272 | case OP_ANY: | |
273 | return FALSE; | |
274 | ||
275 | case OP_NOT_DIGIT: | |
276 | for (c = 0; c < 32; c++) | |
277 | start_bits[c] |= ~cd->cbits[c+cbit_digit]; | |
278 | break; | |
279 | ||
280 | case OP_DIGIT: | |
281 | for (c = 0; c < 32; c++) | |
282 | start_bits[c] |= cd->cbits[c+cbit_digit]; | |
283 | break; | |
284 | ||
285 | case OP_NOT_WHITESPACE: | |
286 | for (c = 0; c < 32; c++) | |
287 | start_bits[c] |= ~cd->cbits[c+cbit_space]; | |
288 | break; | |
289 | ||
290 | case OP_WHITESPACE: | |
291 | for (c = 0; c < 32; c++) | |
292 | start_bits[c] |= cd->cbits[c+cbit_space]; | |
293 | break; | |
294 | ||
295 | case OP_NOT_WORDCHAR: | |
296 | for (c = 0; c < 32; c++) | |
297 | start_bits[c] |= ~cd->cbits[c+cbit_word]; | |
298 | break; | |
299 | ||
300 | case OP_WORDCHAR: | |
301 | for (c = 0; c < 32; c++) | |
302 | start_bits[c] |= cd->cbits[c+cbit_word]; | |
303 | break; | |
304 | } | |
305 | ||
306 | tcode += 2; | |
307 | break; | |
308 | ||
309 | /* Character class where all the information is in a bit map: set the | |
310 | bits and either carry on or not, according to the repeat count. If it was | |
311 | a negative class, and we are operating with UTF-8 characters, any byte | |
312 | with a value >= 0xc4 is a potentially valid starter because it starts a | |
313 | character with a value > 255. */ | |
314 | ||
315 | case OP_NCLASS: | |
316 | if (utf8) | |
317 | { | |
318 | start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ | |
319 | memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ | |
320 | } | |
321 | /* Fall through */ | |
322 | ||
323 | case OP_CLASS: | |
324 | { | |
325 | tcode++; | |
326 | ||
327 | /* In UTF-8 mode, the bits in a bit map correspond to character | |
328 | values, not to byte values. However, the bit map we are constructing is | |
329 | for byte values. So we have to do a conversion for characters whose | |
330 | value is > 127. In fact, there are only two possible starting bytes for | |
331 | characters in the range 128 - 255. */ | |
332 | ||
333 | if (utf8) | |
334 | { | |
335 | for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; | |
336 | for (c = 128; c < 256; c++) | |
337 | { | |
338 | if ((tcode[c/8] && (1 << (c&7))) != 0) | |
339 | { | |
340 | int d = (c >> 6) | 0xc0; /* Set bit for this starter */ | |
341 | start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ | |
342 | c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ | |
343 | } | |
344 | } | |
345 | } | |
346 | ||
347 | /* In non-UTF-8 mode, the two bit maps are completely compatible. */ | |
348 | ||
349 | else | |
350 | { | |
351 | for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; | |
352 | } | |
353 | ||
354 | /* Advance past the bit map, and act on what follows */ | |
355 | ||
356 | tcode += 32; | |
357 | switch (*tcode) | |
358 | { | |
359 | case OP_CRSTAR: | |
360 | case OP_CRMINSTAR: | |
361 | case OP_CRQUERY: | |
362 | case OP_CRMINQUERY: | |
363 | tcode++; | |
364 | break; | |
365 | ||
366 | case OP_CRRANGE: | |
367 | case OP_CRMINRANGE: | |
368 | if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; | |
369 | else try_next = FALSE; | |
370 | break; | |
371 | ||
372 | default: | |
373 | try_next = FALSE; | |
374 | break; | |
375 | } | |
376 | } | |
377 | break; /* End of bitmap class handling */ | |
378 | ||
379 | } /* End of switch */ | |
380 | } /* End of try_next loop */ | |
381 | ||
382 | code += GET(code, 1); /* Advance to next branch */ | |
383 | } | |
384 | while (*code == OP_ALT); | |
385 | return TRUE; | |
386 | } | |
387 | ||
388 | ||
389 | ||
390 | /************************************************* | |
391 | * Study a compiled expression * | |
392 | *************************************************/ | |
393 | ||
394 | /* This function is handed a compiled expression that it must study to produce | |
395 | information that will speed up the matching. It returns a pcre_extra block | |
396 | which then gets handed back to pcre_exec(). | |
397 | ||
398 | Arguments: | |
399 | re points to the compiled expression | |
400 | options contains option bits | |
401 | errorptr points to where to place error messages; | |
402 | set NULL unless error | |
403 | ||
404 | Returns: pointer to a pcre_extra block, with study_data filled in and the | |
405 | appropriate flag set; | |
406 | NULL on error or if no optimization possible | |
407 | */ | |
408 | ||
409 | EXPORT pcre_extra * | |
410 | pcre_study(const pcre *external_re, int options, const char **errorptr) | |
411 | { | |
412 | uschar start_bits[32]; | |
413 | pcre_extra *extra; | |
414 | pcre_study_data *study; | |
415 | const uschar *tables; | |
416 | const real_pcre *re = (const real_pcre *)external_re; | |
417 | uschar *code = (uschar *)re + re->name_table_offset + | |
418 | (re->name_count * re->name_entry_size); | |
419 | compile_data compile_block; | |
420 | ||
421 | *errorptr = NULL; | |
422 | ||
423 | if (re == NULL || re->magic_number != MAGIC_NUMBER) | |
424 | { | |
425 | *errorptr = "argument is not a compiled regular expression"; | |
426 | return NULL; | |
427 | } | |
428 | ||
429 | if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) | |
430 | { | |
431 | *errorptr = "unknown or incorrect option bit(s) set"; | |
432 | return NULL; | |
433 | } | |
434 | ||
435 | /* For an anchored pattern, or an unanchored pattern that has a first char, or | |
436 | a multiline pattern that matches only at "line starts", no further processing | |
437 | at present. */ | |
438 | ||
439 | if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) | |
440 | return NULL; | |
441 | ||
442 | /* Set the character tables in the block that is passed around */ | |
443 | ||
444 | tables = re->tables; | |
445 | if (tables == NULL) | |
446 | (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, &tables); | |
447 | ||
448 | compile_block.lcc = tables + lcc_offset; | |
449 | compile_block.fcc = tables + fcc_offset; | |
450 | compile_block.cbits = tables + cbits_offset; | |
451 | compile_block.ctypes = tables + ctypes_offset; | |
452 | ||
453 | /* See if we can find a fixed set of initial characters for the pattern. */ | |
454 | ||
455 | memset(start_bits, 0, 32 * sizeof(uschar)); | |
456 | if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, | |
457 | (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; | |
458 | ||
459 | /* Get a pcre_extra block and a pcre_study_data block. The study data is put in | |
460 | the latter, which is pointed to by the former, which may also get additional | |
461 | data set later by the calling program. At the moment, the size of | |
462 | pcre_study_data is fixed. We nevertheless save it in a field for returning via | |
463 | the pcre_fullinfo() function so that if it becomes variable in the future, we | |
464 | don't have to change that code. */ | |
465 | ||
466 | extra = (pcre_extra *)(pcre_malloc) | |
467 | (sizeof(pcre_extra) + sizeof(pcre_study_data)); | |
468 | ||
469 | if (extra == NULL) | |
470 | { | |
471 | *errorptr = "failed to get memory"; | |
472 | return NULL; | |
473 | } | |
474 | ||
475 | study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra)); | |
476 | extra->flags = PCRE_EXTRA_STUDY_DATA; | |
477 | extra->study_data = study; | |
478 | ||
479 | study->size = sizeof(pcre_study_data); | |
480 | study->options = PCRE_STUDY_MAPPED; | |
481 | memcpy(study->start_bits, start_bits, sizeof(start_bits)); | |
482 | ||
483 | return extra; | |
484 | } | |
485 | ||
486 | /* End of study.c */ |