Implemented hosts_avoid_pipelining in the smtp transport.
[exim.git] / src / src / pcre / pcre_study.c
CommitLineData
6bf342e1 1/* $Cambridge: exim/src/src/pcre/pcre_study.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */
8ac170f3
PH
2
3/*************************************************
4* Perl-Compatible Regular Expressions *
5*************************************************/
6
7/* PCRE is a library of functions to support regular expressions whose syntax
8and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
aa41d2de 11 Copyright (c) 1997-2006 University of Cambridge
8ac170f3
PH
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42
43/* This module contains the external function pcre_study(), along with local
44supporting functions. */
45
46
47#include "pcre_internal.h"
48
49
6bf342e1
PH
50/* Returns from set_start_bits() */
51
52enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
53
54
8ac170f3
PH
55/*************************************************
56* Set a bit and maybe its alternate case *
57*************************************************/
58
59/* Given a character, set its bit in the table, and also the bit for the other
60version of a letter if we are caseless.
61
62Arguments:
63 start_bits points to the bit map
64 c is the character
65 caseless the caseless flag
66 cd the block with char table pointers
67
68Returns: nothing
69*/
70
71static void
72set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
73{
74start_bits[c/8] |= (1 << (c&7));
75if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
76 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
77}
78
79
80
81/*************************************************
6bf342e1 82* Create bitmap of starting bytes *
8ac170f3
PH
83*************************************************/
84
6bf342e1
PH
85/* This function scans a compiled unanchored expression recursively and
86attempts to build a bitmap of the set of possible starting bytes. As time goes
87by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
88useful for parenthesized groups in patterns such as (a*)b where the group
89provides some optional starting bytes but scanning must continue at the outer
90level to find at least one mandatory byte. At the outermost level, this
91function fails unless the result is SSB_DONE.
8ac170f3
PH
92
93Arguments:
94 code points to an expression
95 start_bits points to a 32-byte table, initialized to 0
96 caseless the current state of the caseless flag
97 utf8 TRUE if in UTF-8 mode
98 cd the block with char table pointers
99
6bf342e1
PH
100Returns: SSB_FAIL => Failed to find any starting bytes
101 SSB_DONE => Found mandatory starting bytes
102 SSB_CONTINUE => Found optional starting bytes
8ac170f3
PH
103*/
104
6bf342e1 105static int
8ac170f3
PH
106set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
107 BOOL utf8, compile_data *cd)
108{
109register int c;
6bf342e1 110int yield = SSB_DONE;
8ac170f3 111
aa41d2de
PH
112#if 0
113/* ========================================================================= */
114/* The following comment and code was inserted in January 1999. In May 2006,
115when it was observed to cause compiler warnings about unused values, I took it
116out again. If anybody is still using OS/2, they will have to put it back
117manually. */
118
8ac170f3
PH
119/* This next statement and the later reference to dummy are here in order to
120trick the optimizer of the IBM C compiler for OS/2 into generating correct
121code. Apparently IBM isn't going to fix the problem, and we would rather not
122disable optimization (in this module it actually makes a big difference, and
123the pcre module can use all the optimization it can get). */
124
125volatile int dummy;
aa41d2de
PH
126/* ========================================================================= */
127#endif
8ac170f3
PH
128
129do
130 {
6bf342e1 131 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
8ac170f3
PH
132 BOOL try_next = TRUE;
133
6bf342e1 134 while (try_next) /* Loop for items in this branch */
8ac170f3 135 {
6bf342e1
PH
136 int rc;
137 switch(*tcode)
8ac170f3 138 {
6bf342e1 139 /* Fail if we reach something we don't understand */
8ac170f3 140
8ac170f3 141 default:
6bf342e1
PH
142 return SSB_FAIL;
143
144 /* If we hit a bracket or a positive lookahead assertion, recurse to set
145 bits from within the subpattern. If it can't find anything, we have to
146 give up. If it finds some mandatory character(s), we are done for this
147 branch. Otherwise, carry on scanning after the subpattern. */
148
149 case OP_BRA:
150 case OP_SBRA:
151 case OP_CBRA:
152 case OP_SCBRA:
153 case OP_ONCE:
154 case OP_ASSERT:
155 rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
156 if (rc == SSB_FAIL) return SSB_FAIL;
157 if (rc == SSB_DONE) try_next = FALSE; else
158 {
159 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
160 tcode += 1 + LINK_SIZE;
161 }
162 break;
8ac170f3 163
6bf342e1
PH
164 /* If we hit ALT or KET, it means we haven't found anything mandatory in
165 this branch, though we might have found something optional. For ALT, we
166 continue with the next alternative, but we have to arrange that the final
167 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
168 return SSB_CONTINUE: if this is the top level, that indicates failure,
169 but after a nested subpattern, it causes scanning to continue. */
8ac170f3 170
6bf342e1
PH
171 case OP_ALT:
172 yield = SSB_CONTINUE;
173 try_next = FALSE;
8ac170f3
PH
174 break;
175
6bf342e1
PH
176 case OP_KET:
177 case OP_KETRMAX:
178 case OP_KETRMIN:
179 return SSB_CONTINUE;
8ac170f3 180
6bf342e1
PH
181 /* Skip over callout */
182
183 case OP_CALLOUT:
184 tcode += 2 + 2*LINK_SIZE;
8ac170f3
PH
185 break;
186
187 /* Skip over lookbehind and negative lookahead assertions */
188
189 case OP_ASSERT_NOT:
190 case OP_ASSERTBACK:
191 case OP_ASSERTBACK_NOT:
192 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
6bf342e1 193 tcode += 1 + LINK_SIZE;
8ac170f3
PH
194 break;
195
196 /* Skip over an option setting, changing the caseless flag */
197
198 case OP_OPT:
199 caseless = (tcode[1] & PCRE_CASELESS) != 0;
200 tcode += 2;
201 break;
202
203 /* BRAZERO does the bracket, but carries on. */
204
205 case OP_BRAZERO:
206 case OP_BRAMINZERO:
6bf342e1
PH
207 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
208 return SSB_FAIL;
aa41d2de
PH
209/* =========================================================================
210 See the comment at the head of this function concerning the next line,
211 which was an old fudge for the benefit of OS/2.
8ac170f3 212 dummy = 1;
aa41d2de 213 ========================================================================= */
8ac170f3 214 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
6bf342e1 215 tcode += 1 + LINK_SIZE;
8ac170f3
PH
216 break;
217
218 /* Single-char * or ? sets the bit and tries the next item */
219
220 case OP_STAR:
221 case OP_MINSTAR:
6bf342e1 222 case OP_POSSTAR:
8ac170f3
PH
223 case OP_QUERY:
224 case OP_MINQUERY:
6bf342e1 225 case OP_POSQUERY:
8ac170f3
PH
226 set_bit(start_bits, tcode[1], caseless, cd);
227 tcode += 2;
228#ifdef SUPPORT_UTF8
6bf342e1
PH
229 if (utf8 && tcode[-1] >= 0xc0)
230 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
8ac170f3
PH
231#endif
232 break;
233
234 /* Single-char upto sets the bit and tries the next */
235
236 case OP_UPTO:
237 case OP_MINUPTO:
6bf342e1 238 case OP_POSUPTO:
8ac170f3
PH
239 set_bit(start_bits, tcode[3], caseless, cd);
240 tcode += 4;
241#ifdef SUPPORT_UTF8
6bf342e1
PH
242 if (utf8 && tcode[-1] >= 0xc0)
243 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
8ac170f3
PH
244#endif
245 break;
246
247 /* At least one single char sets the bit and stops */
248
249 case OP_EXACT: /* Fall through */
250 tcode += 2;
251
252 case OP_CHAR:
253 case OP_CHARNC:
254 case OP_PLUS:
255 case OP_MINPLUS:
6bf342e1 256 case OP_POSPLUS:
8ac170f3
PH
257 set_bit(start_bits, tcode[1], caseless, cd);
258 try_next = FALSE;
259 break;
260
261 /* Single character type sets the bits and stops */
262
263 case OP_NOT_DIGIT:
264 for (c = 0; c < 32; c++)
265 start_bits[c] |= ~cd->cbits[c+cbit_digit];
266 try_next = FALSE;
267 break;
268
269 case OP_DIGIT:
270 for (c = 0; c < 32; c++)
271 start_bits[c] |= cd->cbits[c+cbit_digit];
272 try_next = FALSE;
273 break;
274
aa41d2de
PH
275 /* The cbit_space table has vertical tab as whitespace; we have to
276 discard it. */
277
8ac170f3
PH
278 case OP_NOT_WHITESPACE:
279 for (c = 0; c < 32; c++)
aa41d2de
PH
280 {
281 int d = cd->cbits[c+cbit_space];
282 if (c == 1) d &= ~0x08;
283 start_bits[c] |= ~d;
284 }
8ac170f3
PH
285 try_next = FALSE;
286 break;
287
aa41d2de
PH
288 /* The cbit_space table has vertical tab as whitespace; we have to
289 discard it. */
290
8ac170f3
PH
291 case OP_WHITESPACE:
292 for (c = 0; c < 32; c++)
aa41d2de
PH
293 {
294 int d = cd->cbits[c+cbit_space];
295 if (c == 1) d &= ~0x08;
296 start_bits[c] |= d;
297 }
8ac170f3
PH
298 try_next = FALSE;
299 break;
300
301 case OP_NOT_WORDCHAR:
302 for (c = 0; c < 32; c++)
303 start_bits[c] |= ~cd->cbits[c+cbit_word];
304 try_next = FALSE;
305 break;
306
307 case OP_WORDCHAR:
308 for (c = 0; c < 32; c++)
309 start_bits[c] |= cd->cbits[c+cbit_word];
310 try_next = FALSE;
311 break;
312
313 /* One or more character type fudges the pointer and restarts, knowing
314 it will hit a single character type and stop there. */
315
316 case OP_TYPEPLUS:
317 case OP_TYPEMINPLUS:
318 tcode++;
319 break;
320
321 case OP_TYPEEXACT:
322 tcode += 3;
323 break;
324
325 /* Zero or more repeats of character types set the bits and then
326 try again. */
327
328 case OP_TYPEUPTO:
329 case OP_TYPEMINUPTO:
6bf342e1 330 case OP_TYPEPOSUPTO:
8ac170f3
PH
331 tcode += 2; /* Fall through */
332
333 case OP_TYPESTAR:
334 case OP_TYPEMINSTAR:
6bf342e1 335 case OP_TYPEPOSSTAR:
8ac170f3
PH
336 case OP_TYPEQUERY:
337 case OP_TYPEMINQUERY:
6bf342e1 338 case OP_TYPEPOSQUERY:
8ac170f3
PH
339 switch(tcode[1])
340 {
341 case OP_ANY:
6bf342e1 342 return SSB_FAIL;
8ac170f3
PH
343
344 case OP_NOT_DIGIT:
345 for (c = 0; c < 32; c++)
346 start_bits[c] |= ~cd->cbits[c+cbit_digit];
347 break;
348
349 case OP_DIGIT:
350 for (c = 0; c < 32; c++)
351 start_bits[c] |= cd->cbits[c+cbit_digit];
352 break;
353
aa41d2de
PH
354 /* The cbit_space table has vertical tab as whitespace; we have to
355 discard it. */
356
8ac170f3
PH
357 case OP_NOT_WHITESPACE:
358 for (c = 0; c < 32; c++)
aa41d2de
PH
359 {
360 int d = cd->cbits[c+cbit_space];
361 if (c == 1) d &= ~0x08;
362 start_bits[c] |= ~d;
363 }
8ac170f3
PH
364 break;
365
aa41d2de
PH
366 /* The cbit_space table has vertical tab as whitespace; we have to
367 discard it. */
368
8ac170f3
PH
369 case OP_WHITESPACE:
370 for (c = 0; c < 32; c++)
aa41d2de
PH
371 {
372 int d = cd->cbits[c+cbit_space];
373 if (c == 1) d &= ~0x08;
374 start_bits[c] |= d;
375 }
8ac170f3
PH
376 break;
377
378 case OP_NOT_WORDCHAR:
379 for (c = 0; c < 32; c++)
380 start_bits[c] |= ~cd->cbits[c+cbit_word];
381 break;
382
383 case OP_WORDCHAR:
384 for (c = 0; c < 32; c++)
385 start_bits[c] |= cd->cbits[c+cbit_word];
386 break;
387 }
388
389 tcode += 2;
390 break;
391
392 /* Character class where all the information is in a bit map: set the
393 bits and either carry on or not, according to the repeat count. If it was
394 a negative class, and we are operating with UTF-8 characters, any byte
395 with a value >= 0xc4 is a potentially valid starter because it starts a
396 character with a value > 255. */
397
398 case OP_NCLASS:
399 if (utf8)
400 {
401 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
402 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
403 }
404 /* Fall through */
405
406 case OP_CLASS:
407 {
408 tcode++;
409
410 /* In UTF-8 mode, the bits in a bit map correspond to character
411 values, not to byte values. However, the bit map we are constructing is
412 for byte values. So we have to do a conversion for characters whose
413 value is > 127. In fact, there are only two possible starting bytes for
414 characters in the range 128 - 255. */
415
416 if (utf8)
417 {
418 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
419 for (c = 128; c < 256; c++)
420 {
421 if ((tcode[c/8] && (1 << (c&7))) != 0)
422 {
423 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
424 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
425 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
426 }
427 }
428 }
429
430 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
431
432 else
433 {
434 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
435 }
436
437 /* Advance past the bit map, and act on what follows */
438
439 tcode += 32;
440 switch (*tcode)
441 {
442 case OP_CRSTAR:
443 case OP_CRMINSTAR:
444 case OP_CRQUERY:
445 case OP_CRMINQUERY:
446 tcode++;
447 break;
448
449 case OP_CRRANGE:
450 case OP_CRMINRANGE:
451 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
452 else try_next = FALSE;
453 break;
454
455 default:
456 try_next = FALSE;
457 break;
458 }
459 }
460 break; /* End of bitmap class handling */
461
462 } /* End of switch */
463 } /* End of try_next loop */
464
465 code += GET(code, 1); /* Advance to next branch */
466 }
467while (*code == OP_ALT);
6bf342e1 468return yield;
8ac170f3
PH
469}
470
471
472
473/*************************************************
474* Study a compiled expression *
475*************************************************/
476
477/* This function is handed a compiled expression that it must study to produce
478information that will speed up the matching. It returns a pcre_extra block
479which then gets handed back to pcre_exec().
480
481Arguments:
482 re points to the compiled expression
483 options contains option bits
484 errorptr points to where to place error messages;
485 set NULL unless error
486
487Returns: pointer to a pcre_extra block, with study_data filled in and the
488 appropriate flag set;
489 NULL on error or if no optimization possible
490*/
491
aa41d2de 492PCRE_DATA_SCOPE pcre_extra *
8ac170f3
PH
493pcre_study(const pcre *external_re, int options, const char **errorptr)
494{
495uschar start_bits[32];
496pcre_extra *extra;
497pcre_study_data *study;
498const uschar *tables;
aa41d2de 499uschar *code;
8ac170f3 500compile_data compile_block;
aa41d2de 501const real_pcre *re = (const real_pcre *)external_re;
8ac170f3
PH
502
503*errorptr = NULL;
504
505if (re == NULL || re->magic_number != MAGIC_NUMBER)
506 {
507 *errorptr = "argument is not a compiled regular expression";
508 return NULL;
509 }
510
511if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
512 {
513 *errorptr = "unknown or incorrect option bit(s) set";
514 return NULL;
515 }
516
aa41d2de
PH
517code = (uschar *)re + re->name_table_offset +
518 (re->name_count * re->name_entry_size);
519
8ac170f3
PH
520/* For an anchored pattern, or an unanchored pattern that has a first char, or
521a multiline pattern that matches only at "line starts", no further processing
522at present. */
523
524if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
525 return NULL;
526
527/* Set the character tables in the block that is passed around */
528
529tables = re->tables;
530if (tables == NULL)
531 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
532 (void *)(&tables));
533
534compile_block.lcc = tables + lcc_offset;
535compile_block.fcc = tables + fcc_offset;
536compile_block.cbits = tables + cbits_offset;
537compile_block.ctypes = tables + ctypes_offset;
538
539/* See if we can find a fixed set of initial characters for the pattern. */
540
541memset(start_bits, 0, 32 * sizeof(uschar));
6bf342e1
PH
542if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
543 (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
8ac170f3
PH
544
545/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
546the latter, which is pointed to by the former, which may also get additional
547data set later by the calling program. At the moment, the size of
548pcre_study_data is fixed. We nevertheless save it in a field for returning via
549the pcre_fullinfo() function so that if it becomes variable in the future, we
550don't have to change that code. */
551
552extra = (pcre_extra *)(pcre_malloc)
553 (sizeof(pcre_extra) + sizeof(pcre_study_data));
554
555if (extra == NULL)
556 {
557 *errorptr = "failed to get memory";
558 return NULL;
559 }
560
561study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
562extra->flags = PCRE_EXTRA_STUDY_DATA;
563extra->study_data = study;
564
565study->size = sizeof(pcre_study_data);
566study->options = PCRE_STUDY_MAPPED;
567memcpy(study->start_bits, start_bits, sizeof(start_bits));
568
569return extra;
570}
571
572/* End of pcre_study.c */