Commit | Line | Data |
---|---|---|
8ac170f3 PH |
1 | /* $Cambridge: exim/src/src/pcre/pcre_study.c,v 1.1 2005/06/15 08:57:10 ph10 Exp $ */ |
2 | ||
3 | /************************************************* | |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | /* PCRE is a library of functions to support regular expressions whose syntax | |
8 | and semantics are as close as possible to those of the Perl 5 language. | |
9 | ||
10 | Written by Philip Hazel | |
11 | Copyright (c) 1997-2005 University of Cambridge | |
12 | ||
13 | ----------------------------------------------------------------------------- | |
14 | Redistribution and use in source and binary forms, with or without | |
15 | modification, are permitted provided that the following conditions are met: | |
16 | ||
17 | * Redistributions of source code must retain the above copyright notice, | |
18 | this list of conditions and the following disclaimer. | |
19 | ||
20 | * Redistributions in binary form must reproduce the above copyright | |
21 | notice, this list of conditions and the following disclaimer in the | |
22 | documentation and/or other materials provided with the distribution. | |
23 | ||
24 | * Neither the name of the University of Cambridge nor the names of its | |
25 | contributors may be used to endorse or promote products derived from | |
26 | this software without specific prior written permission. | |
27 | ||
28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
29 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
30 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
31 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
32 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
33 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
35 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
36 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
37 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
38 | POSSIBILITY OF SUCH DAMAGE. | |
39 | ----------------------------------------------------------------------------- | |
40 | */ | |
41 | ||
42 | ||
43 | /* This module contains the external function pcre_study(), along with local | |
44 | supporting functions. */ | |
45 | ||
46 | ||
47 | #include "pcre_internal.h" | |
48 | ||
49 | ||
50 | /************************************************* | |
51 | * Set a bit and maybe its alternate case * | |
52 | *************************************************/ | |
53 | ||
54 | /* Given a character, set its bit in the table, and also the bit for the other | |
55 | version of a letter if we are caseless. | |
56 | ||
57 | Arguments: | |
58 | start_bits points to the bit map | |
59 | c is the character | |
60 | caseless the caseless flag | |
61 | cd the block with char table pointers | |
62 | ||
63 | Returns: nothing | |
64 | */ | |
65 | ||
66 | static void | |
67 | set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) | |
68 | { | |
69 | start_bits[c/8] |= (1 << (c&7)); | |
70 | if (caseless && (cd->ctypes[c] & ctype_letter) != 0) | |
71 | start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); | |
72 | } | |
73 | ||
74 | ||
75 | ||
76 | /************************************************* | |
77 | * Create bitmap of starting chars * | |
78 | *************************************************/ | |
79 | ||
80 | /* This function scans a compiled unanchored expression and attempts to build a | |
81 | bitmap of the set of initial characters. If it can't, it returns FALSE. As time | |
82 | goes by, we may be able to get more clever at doing this. | |
83 | ||
84 | Arguments: | |
85 | code points to an expression | |
86 | start_bits points to a 32-byte table, initialized to 0 | |
87 | caseless the current state of the caseless flag | |
88 | utf8 TRUE if in UTF-8 mode | |
89 | cd the block with char table pointers | |
90 | ||
91 | Returns: TRUE if table built, FALSE otherwise | |
92 | */ | |
93 | ||
94 | static BOOL | |
95 | set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, | |
96 | BOOL utf8, compile_data *cd) | |
97 | { | |
98 | register int c; | |
99 | ||
100 | /* This next statement and the later reference to dummy are here in order to | |
101 | trick the optimizer of the IBM C compiler for OS/2 into generating correct | |
102 | code. Apparently IBM isn't going to fix the problem, and we would rather not | |
103 | disable optimization (in this module it actually makes a big difference, and | |
104 | the pcre module can use all the optimization it can get). */ | |
105 | ||
106 | volatile int dummy; | |
107 | ||
108 | do | |
109 | { | |
110 | const uschar *tcode = code + 1 + LINK_SIZE; | |
111 | BOOL try_next = TRUE; | |
112 | ||
113 | while (try_next) | |
114 | { | |
115 | /* If a branch starts with a bracket or a positive lookahead assertion, | |
116 | recurse to set bits from within them. That's all for this branch. */ | |
117 | ||
118 | if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) | |
119 | { | |
120 | if (!set_start_bits(tcode, start_bits, caseless, utf8, cd)) | |
121 | return FALSE; | |
122 | try_next = FALSE; | |
123 | } | |
124 | ||
125 | else switch(*tcode) | |
126 | { | |
127 | default: | |
128 | return FALSE; | |
129 | ||
130 | /* Skip over callout */ | |
131 | ||
132 | case OP_CALLOUT: | |
133 | tcode += 2 + 2*LINK_SIZE; | |
134 | break; | |
135 | ||
136 | /* Skip over extended extraction bracket number */ | |
137 | ||
138 | case OP_BRANUMBER: | |
139 | tcode += 3; | |
140 | break; | |
141 | ||
142 | /* Skip over lookbehind and negative lookahead assertions */ | |
143 | ||
144 | case OP_ASSERT_NOT: | |
145 | case OP_ASSERTBACK: | |
146 | case OP_ASSERTBACK_NOT: | |
147 | do tcode += GET(tcode, 1); while (*tcode == OP_ALT); | |
148 | tcode += 1+LINK_SIZE; | |
149 | break; | |
150 | ||
151 | /* Skip over an option setting, changing the caseless flag */ | |
152 | ||
153 | case OP_OPT: | |
154 | caseless = (tcode[1] & PCRE_CASELESS) != 0; | |
155 | tcode += 2; | |
156 | break; | |
157 | ||
158 | /* BRAZERO does the bracket, but carries on. */ | |
159 | ||
160 | case OP_BRAZERO: | |
161 | case OP_BRAMINZERO: | |
162 | if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) | |
163 | return FALSE; | |
164 | dummy = 1; | |
165 | do tcode += GET(tcode,1); while (*tcode == OP_ALT); | |
166 | tcode += 1+LINK_SIZE; | |
167 | break; | |
168 | ||
169 | /* Single-char * or ? sets the bit and tries the next item */ | |
170 | ||
171 | case OP_STAR: | |
172 | case OP_MINSTAR: | |
173 | case OP_QUERY: | |
174 | case OP_MINQUERY: | |
175 | set_bit(start_bits, tcode[1], caseless, cd); | |
176 | tcode += 2; | |
177 | #ifdef SUPPORT_UTF8 | |
178 | if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; | |
179 | #endif | |
180 | break; | |
181 | ||
182 | /* Single-char upto sets the bit and tries the next */ | |
183 | ||
184 | case OP_UPTO: | |
185 | case OP_MINUPTO: | |
186 | set_bit(start_bits, tcode[3], caseless, cd); | |
187 | tcode += 4; | |
188 | #ifdef SUPPORT_UTF8 | |
189 | if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; | |
190 | #endif | |
191 | break; | |
192 | ||
193 | /* At least one single char sets the bit and stops */ | |
194 | ||
195 | case OP_EXACT: /* Fall through */ | |
196 | tcode += 2; | |
197 | ||
198 | case OP_CHAR: | |
199 | case OP_CHARNC: | |
200 | case OP_PLUS: | |
201 | case OP_MINPLUS: | |
202 | set_bit(start_bits, tcode[1], caseless, cd); | |
203 | try_next = FALSE; | |
204 | break; | |
205 | ||
206 | /* Single character type sets the bits and stops */ | |
207 | ||
208 | case OP_NOT_DIGIT: | |
209 | for (c = 0; c < 32; c++) | |
210 | start_bits[c] |= ~cd->cbits[c+cbit_digit]; | |
211 | try_next = FALSE; | |
212 | break; | |
213 | ||
214 | case OP_DIGIT: | |
215 | for (c = 0; c < 32; c++) | |
216 | start_bits[c] |= cd->cbits[c+cbit_digit]; | |
217 | try_next = FALSE; | |
218 | break; | |
219 | ||
220 | case OP_NOT_WHITESPACE: | |
221 | for (c = 0; c < 32; c++) | |
222 | start_bits[c] |= ~cd->cbits[c+cbit_space]; | |
223 | try_next = FALSE; | |
224 | break; | |
225 | ||
226 | case OP_WHITESPACE: | |
227 | for (c = 0; c < 32; c++) | |
228 | start_bits[c] |= cd->cbits[c+cbit_space]; | |
229 | try_next = FALSE; | |
230 | break; | |
231 | ||
232 | case OP_NOT_WORDCHAR: | |
233 | for (c = 0; c < 32; c++) | |
234 | start_bits[c] |= ~cd->cbits[c+cbit_word]; | |
235 | try_next = FALSE; | |
236 | break; | |
237 | ||
238 | case OP_WORDCHAR: | |
239 | for (c = 0; c < 32; c++) | |
240 | start_bits[c] |= cd->cbits[c+cbit_word]; | |
241 | try_next = FALSE; | |
242 | break; | |
243 | ||
244 | /* One or more character type fudges the pointer and restarts, knowing | |
245 | it will hit a single character type and stop there. */ | |
246 | ||
247 | case OP_TYPEPLUS: | |
248 | case OP_TYPEMINPLUS: | |
249 | tcode++; | |
250 | break; | |
251 | ||
252 | case OP_TYPEEXACT: | |
253 | tcode += 3; | |
254 | break; | |
255 | ||
256 | /* Zero or more repeats of character types set the bits and then | |
257 | try again. */ | |
258 | ||
259 | case OP_TYPEUPTO: | |
260 | case OP_TYPEMINUPTO: | |
261 | tcode += 2; /* Fall through */ | |
262 | ||
263 | case OP_TYPESTAR: | |
264 | case OP_TYPEMINSTAR: | |
265 | case OP_TYPEQUERY: | |
266 | case OP_TYPEMINQUERY: | |
267 | switch(tcode[1]) | |
268 | { | |
269 | case OP_ANY: | |
270 | return FALSE; | |
271 | ||
272 | case OP_NOT_DIGIT: | |
273 | for (c = 0; c < 32; c++) | |
274 | start_bits[c] |= ~cd->cbits[c+cbit_digit]; | |
275 | break; | |
276 | ||
277 | case OP_DIGIT: | |
278 | for (c = 0; c < 32; c++) | |
279 | start_bits[c] |= cd->cbits[c+cbit_digit]; | |
280 | break; | |
281 | ||
282 | case OP_NOT_WHITESPACE: | |
283 | for (c = 0; c < 32; c++) | |
284 | start_bits[c] |= ~cd->cbits[c+cbit_space]; | |
285 | break; | |
286 | ||
287 | case OP_WHITESPACE: | |
288 | for (c = 0; c < 32; c++) | |
289 | start_bits[c] |= cd->cbits[c+cbit_space]; | |
290 | break; | |
291 | ||
292 | case OP_NOT_WORDCHAR: | |
293 | for (c = 0; c < 32; c++) | |
294 | start_bits[c] |= ~cd->cbits[c+cbit_word]; | |
295 | break; | |
296 | ||
297 | case OP_WORDCHAR: | |
298 | for (c = 0; c < 32; c++) | |
299 | start_bits[c] |= cd->cbits[c+cbit_word]; | |
300 | break; | |
301 | } | |
302 | ||
303 | tcode += 2; | |
304 | break; | |
305 | ||
306 | /* Character class where all the information is in a bit map: set the | |
307 | bits and either carry on or not, according to the repeat count. If it was | |
308 | a negative class, and we are operating with UTF-8 characters, any byte | |
309 | with a value >= 0xc4 is a potentially valid starter because it starts a | |
310 | character with a value > 255. */ | |
311 | ||
312 | case OP_NCLASS: | |
313 | if (utf8) | |
314 | { | |
315 | start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ | |
316 | memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ | |
317 | } | |
318 | /* Fall through */ | |
319 | ||
320 | case OP_CLASS: | |
321 | { | |
322 | tcode++; | |
323 | ||
324 | /* In UTF-8 mode, the bits in a bit map correspond to character | |
325 | values, not to byte values. However, the bit map we are constructing is | |
326 | for byte values. So we have to do a conversion for characters whose | |
327 | value is > 127. In fact, there are only two possible starting bytes for | |
328 | characters in the range 128 - 255. */ | |
329 | ||
330 | if (utf8) | |
331 | { | |
332 | for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; | |
333 | for (c = 128; c < 256; c++) | |
334 | { | |
335 | if ((tcode[c/8] && (1 << (c&7))) != 0) | |
336 | { | |
337 | int d = (c >> 6) | 0xc0; /* Set bit for this starter */ | |
338 | start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ | |
339 | c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ | |
340 | } | |
341 | } | |
342 | } | |
343 | ||
344 | /* In non-UTF-8 mode, the two bit maps are completely compatible. */ | |
345 | ||
346 | else | |
347 | { | |
348 | for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; | |
349 | } | |
350 | ||
351 | /* Advance past the bit map, and act on what follows */ | |
352 | ||
353 | tcode += 32; | |
354 | switch (*tcode) | |
355 | { | |
356 | case OP_CRSTAR: | |
357 | case OP_CRMINSTAR: | |
358 | case OP_CRQUERY: | |
359 | case OP_CRMINQUERY: | |
360 | tcode++; | |
361 | break; | |
362 | ||
363 | case OP_CRRANGE: | |
364 | case OP_CRMINRANGE: | |
365 | if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; | |
366 | else try_next = FALSE; | |
367 | break; | |
368 | ||
369 | default: | |
370 | try_next = FALSE; | |
371 | break; | |
372 | } | |
373 | } | |
374 | break; /* End of bitmap class handling */ | |
375 | ||
376 | } /* End of switch */ | |
377 | } /* End of try_next loop */ | |
378 | ||
379 | code += GET(code, 1); /* Advance to next branch */ | |
380 | } | |
381 | while (*code == OP_ALT); | |
382 | return TRUE; | |
383 | } | |
384 | ||
385 | ||
386 | ||
387 | /************************************************* | |
388 | * Study a compiled expression * | |
389 | *************************************************/ | |
390 | ||
391 | /* This function is handed a compiled expression that it must study to produce | |
392 | information that will speed up the matching. It returns a pcre_extra block | |
393 | which then gets handed back to pcre_exec(). | |
394 | ||
395 | Arguments: | |
396 | re points to the compiled expression | |
397 | options contains option bits | |
398 | errorptr points to where to place error messages; | |
399 | set NULL unless error | |
400 | ||
401 | Returns: pointer to a pcre_extra block, with study_data filled in and the | |
402 | appropriate flag set; | |
403 | NULL on error or if no optimization possible | |
404 | */ | |
405 | ||
406 | EXPORT pcre_extra * | |
407 | pcre_study(const pcre *external_re, int options, const char **errorptr) | |
408 | { | |
409 | uschar start_bits[32]; | |
410 | pcre_extra *extra; | |
411 | pcre_study_data *study; | |
412 | const uschar *tables; | |
413 | const real_pcre *re = (const real_pcre *)external_re; | |
414 | uschar *code = (uschar *)re + re->name_table_offset + | |
415 | (re->name_count * re->name_entry_size); | |
416 | compile_data compile_block; | |
417 | ||
418 | *errorptr = NULL; | |
419 | ||
420 | if (re == NULL || re->magic_number != MAGIC_NUMBER) | |
421 | { | |
422 | *errorptr = "argument is not a compiled regular expression"; | |
423 | return NULL; | |
424 | } | |
425 | ||
426 | if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) | |
427 | { | |
428 | *errorptr = "unknown or incorrect option bit(s) set"; | |
429 | return NULL; | |
430 | } | |
431 | ||
432 | /* For an anchored pattern, or an unanchored pattern that has a first char, or | |
433 | a multiline pattern that matches only at "line starts", no further processing | |
434 | at present. */ | |
435 | ||
436 | if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) | |
437 | return NULL; | |
438 | ||
439 | /* Set the character tables in the block that is passed around */ | |
440 | ||
441 | tables = re->tables; | |
442 | if (tables == NULL) | |
443 | (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, | |
444 | (void *)(&tables)); | |
445 | ||
446 | compile_block.lcc = tables + lcc_offset; | |
447 | compile_block.fcc = tables + fcc_offset; | |
448 | compile_block.cbits = tables + cbits_offset; | |
449 | compile_block.ctypes = tables + ctypes_offset; | |
450 | ||
451 | /* See if we can find a fixed set of initial characters for the pattern. */ | |
452 | ||
453 | memset(start_bits, 0, 32 * sizeof(uschar)); | |
454 | if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, | |
455 | (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; | |
456 | ||
457 | /* Get a pcre_extra block and a pcre_study_data block. The study data is put in | |
458 | the latter, which is pointed to by the former, which may also get additional | |
459 | data set later by the calling program. At the moment, the size of | |
460 | pcre_study_data is fixed. We nevertheless save it in a field for returning via | |
461 | the pcre_fullinfo() function so that if it becomes variable in the future, we | |
462 | don't have to change that code. */ | |
463 | ||
464 | extra = (pcre_extra *)(pcre_malloc) | |
465 | (sizeof(pcre_extra) + sizeof(pcre_study_data)); | |
466 | ||
467 | if (extra == NULL) | |
468 | { | |
469 | *errorptr = "failed to get memory"; | |
470 | return NULL; | |
471 | } | |
472 | ||
473 | study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra)); | |
474 | extra->flags = PCRE_EXTRA_STUDY_DATA; | |
475 | extra->study_data = study; | |
476 | ||
477 | study->size = sizeof(pcre_study_data); | |
478 | study->options = PCRE_STUDY_MAPPED; | |
479 | memcpy(study->start_bits, start_bits, sizeof(start_bits)); | |
480 | ||
481 | return extra; | |
482 | } | |
483 | ||
484 | /* End of pcre_study.c */ |