bugzilla 612 - write recipients list in X-Envelope-To header of MBOX spool file
[exim.git] / src / src / pcre / pcre_study.c
CommitLineData
47db1125 1/* $Cambridge: exim/src/src/pcre/pcre_study.c,v 1.6 2007/11/12 13:02:20 nm4 Exp $ */
8ac170f3
PH
2
3/*************************************************
4* Perl-Compatible Regular Expressions *
5*************************************************/
6
7/* PCRE is a library of functions to support regular expressions whose syntax
8and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
64f2600a 11 Copyright (c) 1997-2007 University of Cambridge
8ac170f3
PH
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42
43/* This module contains the external function pcre_study(), along with local
44supporting functions. */
45
46
47db1125
NM
47#ifdef HAVE_CONFIG_H
48#include "config.h"
49#endif
50
8ac170f3
PH
51#include "pcre_internal.h"
52
53
6bf342e1
PH
54/* Returns from set_start_bits() */
55
56enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
57
58
8ac170f3
PH
59/*************************************************
60* Set a bit and maybe its alternate case *
61*************************************************/
62
63/* Given a character, set its bit in the table, and also the bit for the other
64version of a letter if we are caseless.
65
66Arguments:
67 start_bits points to the bit map
68 c is the character
69 caseless the caseless flag
70 cd the block with char table pointers
71
72Returns: nothing
73*/
74
75static void
76set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
77{
78start_bits[c/8] |= (1 << (c&7));
79if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
80 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
81}
82
83
84
85/*************************************************
6bf342e1 86* Create bitmap of starting bytes *
8ac170f3
PH
87*************************************************/
88
6bf342e1
PH
89/* This function scans a compiled unanchored expression recursively and
90attempts to build a bitmap of the set of possible starting bytes. As time goes
91by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
92useful for parenthesized groups in patterns such as (a*)b where the group
93provides some optional starting bytes but scanning must continue at the outer
94level to find at least one mandatory byte. At the outermost level, this
95function fails unless the result is SSB_DONE.
8ac170f3
PH
96
97Arguments:
98 code points to an expression
99 start_bits points to a 32-byte table, initialized to 0
100 caseless the current state of the caseless flag
101 utf8 TRUE if in UTF-8 mode
102 cd the block with char table pointers
103
6bf342e1
PH
104Returns: SSB_FAIL => Failed to find any starting bytes
105 SSB_DONE => Found mandatory starting bytes
106 SSB_CONTINUE => Found optional starting bytes
8ac170f3
PH
107*/
108
6bf342e1 109static int
8ac170f3
PH
110set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
111 BOOL utf8, compile_data *cd)
112{
113register int c;
6bf342e1 114int yield = SSB_DONE;
8ac170f3 115
aa41d2de
PH
116#if 0
117/* ========================================================================= */
118/* The following comment and code was inserted in January 1999. In May 2006,
119when it was observed to cause compiler warnings about unused values, I took it
120out again. If anybody is still using OS/2, they will have to put it back
121manually. */
122
8ac170f3
PH
123/* This next statement and the later reference to dummy are here in order to
124trick the optimizer of the IBM C compiler for OS/2 into generating correct
125code. Apparently IBM isn't going to fix the problem, and we would rather not
126disable optimization (in this module it actually makes a big difference, and
127the pcre module can use all the optimization it can get). */
128
129volatile int dummy;
aa41d2de
PH
130/* ========================================================================= */
131#endif
8ac170f3
PH
132
133do
134 {
6bf342e1 135 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
8ac170f3
PH
136 BOOL try_next = TRUE;
137
6bf342e1 138 while (try_next) /* Loop for items in this branch */
8ac170f3 139 {
6bf342e1
PH
140 int rc;
141 switch(*tcode)
8ac170f3 142 {
6bf342e1 143 /* Fail if we reach something we don't understand */
8ac170f3 144
8ac170f3 145 default:
6bf342e1
PH
146 return SSB_FAIL;
147
148 /* If we hit a bracket or a positive lookahead assertion, recurse to set
149 bits from within the subpattern. If it can't find anything, we have to
150 give up. If it finds some mandatory character(s), we are done for this
151 branch. Otherwise, carry on scanning after the subpattern. */
152
153 case OP_BRA:
154 case OP_SBRA:
155 case OP_CBRA:
156 case OP_SCBRA:
157 case OP_ONCE:
158 case OP_ASSERT:
159 rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
160 if (rc == SSB_FAIL) return SSB_FAIL;
161 if (rc == SSB_DONE) try_next = FALSE; else
162 {
163 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
164 tcode += 1 + LINK_SIZE;
165 }
166 break;
8ac170f3 167
6bf342e1
PH
168 /* If we hit ALT or KET, it means we haven't found anything mandatory in
169 this branch, though we might have found something optional. For ALT, we
170 continue with the next alternative, but we have to arrange that the final
171 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
172 return SSB_CONTINUE: if this is the top level, that indicates failure,
173 but after a nested subpattern, it causes scanning to continue. */
8ac170f3 174
6bf342e1
PH
175 case OP_ALT:
176 yield = SSB_CONTINUE;
177 try_next = FALSE;
8ac170f3
PH
178 break;
179
6bf342e1
PH
180 case OP_KET:
181 case OP_KETRMAX:
182 case OP_KETRMIN:
183 return SSB_CONTINUE;
8ac170f3 184
6bf342e1
PH
185 /* Skip over callout */
186
187 case OP_CALLOUT:
188 tcode += 2 + 2*LINK_SIZE;
8ac170f3
PH
189 break;
190
191 /* Skip over lookbehind and negative lookahead assertions */
192
193 case OP_ASSERT_NOT:
194 case OP_ASSERTBACK:
195 case OP_ASSERTBACK_NOT:
196 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
6bf342e1 197 tcode += 1 + LINK_SIZE;
8ac170f3
PH
198 break;
199
200 /* Skip over an option setting, changing the caseless flag */
201
202 case OP_OPT:
203 caseless = (tcode[1] & PCRE_CASELESS) != 0;
204 tcode += 2;
205 break;
206
207 /* BRAZERO does the bracket, but carries on. */
208
209 case OP_BRAZERO:
210 case OP_BRAMINZERO:
6bf342e1
PH
211 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
212 return SSB_FAIL;
aa41d2de
PH
213/* =========================================================================
214 See the comment at the head of this function concerning the next line,
215 which was an old fudge for the benefit of OS/2.
8ac170f3 216 dummy = 1;
aa41d2de 217 ========================================================================= */
8ac170f3 218 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
6bf342e1 219 tcode += 1 + LINK_SIZE;
8ac170f3
PH
220 break;
221
222 /* Single-char * or ? sets the bit and tries the next item */
223
224 case OP_STAR:
225 case OP_MINSTAR:
6bf342e1 226 case OP_POSSTAR:
8ac170f3
PH
227 case OP_QUERY:
228 case OP_MINQUERY:
6bf342e1 229 case OP_POSQUERY:
8ac170f3
PH
230 set_bit(start_bits, tcode[1], caseless, cd);
231 tcode += 2;
232#ifdef SUPPORT_UTF8
6bf342e1
PH
233 if (utf8 && tcode[-1] >= 0xc0)
234 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
8ac170f3
PH
235#endif
236 break;
237
238 /* Single-char upto sets the bit and tries the next */
239
240 case OP_UPTO:
241 case OP_MINUPTO:
6bf342e1 242 case OP_POSUPTO:
8ac170f3
PH
243 set_bit(start_bits, tcode[3], caseless, cd);
244 tcode += 4;
245#ifdef SUPPORT_UTF8
6bf342e1
PH
246 if (utf8 && tcode[-1] >= 0xc0)
247 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
8ac170f3
PH
248#endif
249 break;
250
251 /* At least one single char sets the bit and stops */
252
253 case OP_EXACT: /* Fall through */
254 tcode += 2;
255
256 case OP_CHAR:
257 case OP_CHARNC:
258 case OP_PLUS:
259 case OP_MINPLUS:
6bf342e1 260 case OP_POSPLUS:
8ac170f3
PH
261 set_bit(start_bits, tcode[1], caseless, cd);
262 try_next = FALSE;
263 break;
264
265 /* Single character type sets the bits and stops */
266
267 case OP_NOT_DIGIT:
268 for (c = 0; c < 32; c++)
269 start_bits[c] |= ~cd->cbits[c+cbit_digit];
270 try_next = FALSE;
271 break;
272
273 case OP_DIGIT:
274 for (c = 0; c < 32; c++)
275 start_bits[c] |= cd->cbits[c+cbit_digit];
276 try_next = FALSE;
277 break;
278
aa41d2de
PH
279 /* The cbit_space table has vertical tab as whitespace; we have to
280 discard it. */
281
8ac170f3
PH
282 case OP_NOT_WHITESPACE:
283 for (c = 0; c < 32; c++)
aa41d2de
PH
284 {
285 int d = cd->cbits[c+cbit_space];
286 if (c == 1) d &= ~0x08;
287 start_bits[c] |= ~d;
288 }
8ac170f3
PH
289 try_next = FALSE;
290 break;
291
aa41d2de
PH
292 /* The cbit_space table has vertical tab as whitespace; we have to
293 discard it. */
294
8ac170f3
PH
295 case OP_WHITESPACE:
296 for (c = 0; c < 32; c++)
aa41d2de
PH
297 {
298 int d = cd->cbits[c+cbit_space];
299 if (c == 1) d &= ~0x08;
300 start_bits[c] |= d;
301 }
8ac170f3
PH
302 try_next = FALSE;
303 break;
304
305 case OP_NOT_WORDCHAR:
306 for (c = 0; c < 32; c++)
307 start_bits[c] |= ~cd->cbits[c+cbit_word];
308 try_next = FALSE;
309 break;
310
311 case OP_WORDCHAR:
312 for (c = 0; c < 32; c++)
313 start_bits[c] |= cd->cbits[c+cbit_word];
314 try_next = FALSE;
315 break;
316
317 /* One or more character type fudges the pointer and restarts, knowing
318 it will hit a single character type and stop there. */
319
320 case OP_TYPEPLUS:
321 case OP_TYPEMINPLUS:
322 tcode++;
323 break;
324
325 case OP_TYPEEXACT:
326 tcode += 3;
327 break;
328
329 /* Zero or more repeats of character types set the bits and then
330 try again. */
331
332 case OP_TYPEUPTO:
333 case OP_TYPEMINUPTO:
6bf342e1 334 case OP_TYPEPOSUPTO:
8ac170f3
PH
335 tcode += 2; /* Fall through */
336
337 case OP_TYPESTAR:
338 case OP_TYPEMINSTAR:
6bf342e1 339 case OP_TYPEPOSSTAR:
8ac170f3
PH
340 case OP_TYPEQUERY:
341 case OP_TYPEMINQUERY:
6bf342e1 342 case OP_TYPEPOSQUERY:
8ac170f3
PH
343 switch(tcode[1])
344 {
345 case OP_ANY:
6bf342e1 346 return SSB_FAIL;
8ac170f3
PH
347
348 case OP_NOT_DIGIT:
349 for (c = 0; c < 32; c++)
350 start_bits[c] |= ~cd->cbits[c+cbit_digit];
351 break;
352
353 case OP_DIGIT:
354 for (c = 0; c < 32; c++)
355 start_bits[c] |= cd->cbits[c+cbit_digit];
356 break;
357
aa41d2de
PH
358 /* The cbit_space table has vertical tab as whitespace; we have to
359 discard it. */
360
8ac170f3
PH
361 case OP_NOT_WHITESPACE:
362 for (c = 0; c < 32; c++)
aa41d2de
PH
363 {
364 int d = cd->cbits[c+cbit_space];
365 if (c == 1) d &= ~0x08;
366 start_bits[c] |= ~d;
367 }
8ac170f3
PH
368 break;
369
aa41d2de
PH
370 /* The cbit_space table has vertical tab as whitespace; we have to
371 discard it. */
372
8ac170f3
PH
373 case OP_WHITESPACE:
374 for (c = 0; c < 32; c++)
aa41d2de
PH
375 {
376 int d = cd->cbits[c+cbit_space];
377 if (c == 1) d &= ~0x08;
378 start_bits[c] |= d;
379 }
8ac170f3
PH
380 break;
381
382 case OP_NOT_WORDCHAR:
383 for (c = 0; c < 32; c++)
384 start_bits[c] |= ~cd->cbits[c+cbit_word];
385 break;
386
387 case OP_WORDCHAR:
388 for (c = 0; c < 32; c++)
389 start_bits[c] |= cd->cbits[c+cbit_word];
390 break;
391 }
392
393 tcode += 2;
394 break;
395
396 /* Character class where all the information is in a bit map: set the
397 bits and either carry on or not, according to the repeat count. If it was
398 a negative class, and we are operating with UTF-8 characters, any byte
399 with a value >= 0xc4 is a potentially valid starter because it starts a
400 character with a value > 255. */
401
402 case OP_NCLASS:
64f2600a 403#ifdef SUPPORT_UTF8
8ac170f3
PH
404 if (utf8)
405 {
406 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
407 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
408 }
64f2600a 409#endif
8ac170f3
PH
410 /* Fall through */
411
412 case OP_CLASS:
413 {
414 tcode++;
415
416 /* In UTF-8 mode, the bits in a bit map correspond to character
417 values, not to byte values. However, the bit map we are constructing is
418 for byte values. So we have to do a conversion for characters whose
419 value is > 127. In fact, there are only two possible starting bytes for
420 characters in the range 128 - 255. */
421
64f2600a 422#ifdef SUPPORT_UTF8
8ac170f3
PH
423 if (utf8)
424 {
425 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
426 for (c = 128; c < 256; c++)
427 {
428 if ((tcode[c/8] && (1 << (c&7))) != 0)
429 {
430 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
431 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
432 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
433 }
434 }
435 }
436
437 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
438
439 else
64f2600a 440#endif
8ac170f3
PH
441 {
442 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
443 }
444
445 /* Advance past the bit map, and act on what follows */
446
447 tcode += 32;
448 switch (*tcode)
449 {
450 case OP_CRSTAR:
451 case OP_CRMINSTAR:
452 case OP_CRQUERY:
453 case OP_CRMINQUERY:
454 tcode++;
455 break;
456
457 case OP_CRRANGE:
458 case OP_CRMINRANGE:
459 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
460 else try_next = FALSE;
461 break;
462
463 default:
464 try_next = FALSE;
465 break;
466 }
467 }
468 break; /* End of bitmap class handling */
469
470 } /* End of switch */
471 } /* End of try_next loop */
472
473 code += GET(code, 1); /* Advance to next branch */
474 }
475while (*code == OP_ALT);
6bf342e1 476return yield;
8ac170f3
PH
477}
478
479
480
481/*************************************************
482* Study a compiled expression *
483*************************************************/
484
485/* This function is handed a compiled expression that it must study to produce
486information that will speed up the matching. It returns a pcre_extra block
487which then gets handed back to pcre_exec().
488
489Arguments:
490 re points to the compiled expression
491 options contains option bits
492 errorptr points to where to place error messages;
493 set NULL unless error
494
495Returns: pointer to a pcre_extra block, with study_data filled in and the
496 appropriate flag set;
497 NULL on error or if no optimization possible
498*/
499
64f2600a 500PCRE_EXP_DEFN pcre_extra *
8ac170f3
PH
501pcre_study(const pcre *external_re, int options, const char **errorptr)
502{
503uschar start_bits[32];
504pcre_extra *extra;
505pcre_study_data *study;
506const uschar *tables;
aa41d2de 507uschar *code;
8ac170f3 508compile_data compile_block;
aa41d2de 509const real_pcre *re = (const real_pcre *)external_re;
8ac170f3
PH
510
511*errorptr = NULL;
512
513if (re == NULL || re->magic_number != MAGIC_NUMBER)
514 {
515 *errorptr = "argument is not a compiled regular expression";
516 return NULL;
517 }
518
519if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
520 {
521 *errorptr = "unknown or incorrect option bit(s) set";
522 return NULL;
523 }
524
aa41d2de
PH
525code = (uschar *)re + re->name_table_offset +
526 (re->name_count * re->name_entry_size);
527
8ac170f3
PH
528/* For an anchored pattern, or an unanchored pattern that has a first char, or
529a multiline pattern that matches only at "line starts", no further processing
530at present. */
531
47db1125
NM
532if ((re->options & PCRE_ANCHORED) != 0 ||
533 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
8ac170f3
PH
534 return NULL;
535
536/* Set the character tables in the block that is passed around */
537
538tables = re->tables;
539if (tables == NULL)
540 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
541 (void *)(&tables));
542
543compile_block.lcc = tables + lcc_offset;
544compile_block.fcc = tables + fcc_offset;
545compile_block.cbits = tables + cbits_offset;
546compile_block.ctypes = tables + ctypes_offset;
547
548/* See if we can find a fixed set of initial characters for the pattern. */
549
550memset(start_bits, 0, 32 * sizeof(uschar));
6bf342e1
PH
551if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
552 (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
8ac170f3
PH
553
554/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
555the latter, which is pointed to by the former, which may also get additional
556data set later by the calling program. At the moment, the size of
557pcre_study_data is fixed. We nevertheless save it in a field for returning via
558the pcre_fullinfo() function so that if it becomes variable in the future, we
559don't have to change that code. */
560
561extra = (pcre_extra *)(pcre_malloc)
562 (sizeof(pcre_extra) + sizeof(pcre_study_data));
563
564if (extra == NULL)
565 {
566 *errorptr = "failed to get memory";
567 return NULL;
568 }
569
570study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
571extra->flags = PCRE_EXTRA_STUDY_DATA;
572extra->study_data = study;
573
574study->size = sizeof(pcre_study_data);
575study->options = PCRE_STUDY_MAPPED;
576memcpy(study->start_bits, start_bits, sizeof(start_bits));
577
578return extra;
579}
580
581/* End of pcre_study.c */