Install PCRE 6.2.
[exim.git] / src / src / pcre / pcre_exec.c
1 /* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
11 Copyright (c) 1997-2005 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42
43 /* This module contains pcre_exec(), the externally visible function that does
44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45 possible. There are also some static supporting functions. */
46
47
48 #include "pcre_internal.h"
49
50
51 /* Structure for building a chain of data that actually lives on the
52 stack, for holding the values of the subject pointer at the start of each
53 subpattern, so as to detect when an empty string has been matched by a
54 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
55 are on the heap, not on the stack. */
56
57 typedef struct eptrblock {
58 struct eptrblock *epb_prev;
59 const uschar *epb_saved_eptr;
60 } eptrblock;
61
/* Bits that may be set in the flags argument of match() */

#define match_condassert   (1 << 0)  /* Called to check a condition assertion */
#define match_isgroup      (1 << 1)  /* Set if start of bracketed group */

/* Results from match() that are not errors. Errors are reported with the
externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_NOMATCH      0
#define MATCH_MATCH        1

/* Largest number of ints of offset data that a recursive call saves on the
stack; when the offset vector is bigger than this, malloc is used instead. Keep
the value a multiple of 3, because the length of the offset vector is always a
multiple of 3. */

#define REC_STACK_SAVE_MAX 30
78
/* Minimum and maximum counts for the common repeats. In rep_max, a value of 0
means that there is no upper limit. NOTE(review): the two tables are presumably
indexed by the same repeat-kind number - confirm at the point of use. */

static const char rep_min[] = {
  0, 0,     /* entries 0 and 1 */
  1, 1,     /* entries 2 and 3 */
  0, 0      /* entries 4 and 5 */
};

static const char rep_max[] = {
  0, 0,     /* entries 0 and 1: unlimited */
  0, 0,     /* entries 2 and 3: unlimited */
  1, 1      /* entries 4 and 5 */
};
83
84
85
86 #ifdef DEBUG
87 /*************************************************
88 * Debugging function to print chars *
89 *************************************************/
90
91 /* Print a sequence of chars in printable format, stopping at the end of the
92 subject if the requested.
93
94 Arguments:
95 p points to characters
96 length number to print
97 is_subject TRUE if printing from within md->start_subject
98 md pointer to matching data block, if is_subject is TRUE
99
100 Returns: nothing
101 */
102
103 static void
104 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105 {
106 int c;
107 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
108 while (length-- > 0)
109 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
110 }
111 #endif
112
113
114
115 /*************************************************
116 * Match a back-reference *
117 *************************************************/
118
119 /* If a back reference hasn't been set, the length that is passed is greater
120 than the number of characters left in the string, so the match fails.
121
122 Arguments:
123 offset index into the offset vector
124 eptr points into the subject
125 length length to be matched
126 md points to match data block
127 ims the ims flags
128
129 Returns: TRUE if matched
130 */
131
132 static BOOL
133 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
134 unsigned long int ims)
135 {
136 const uschar *p = md->start_subject + md->offset_vector[offset];
137
138 #ifdef DEBUG
139 if (eptr >= md->end_subject)
140 printf("matching subject <null>");
141 else
142 {
143 printf("matching subject ");
144 pchars(eptr, length, TRUE, md);
145 }
146 printf(" against backref ");
147 pchars(p, length, FALSE, md);
148 printf("\n");
149 #endif
150
151 /* Always fail if not enough characters left */
152
153 if (length > md->end_subject - eptr) return FALSE;
154
155 /* Separate the caselesss case for speed */
156
157 if ((ims & PCRE_CASELESS) != 0)
158 {
159 while (length-- > 0)
160 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
161 }
162 else
163 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
164
165 return TRUE;
166 }
167
168
169
170 /***************************************************************************
171 ****************************************************************************
172 RECURSION IN THE match() FUNCTION
173
174 The match() function is highly recursive. Some regular expressions can cause
175 it to recurse thousands of times. I was writing for Unix, so I just let it
176 call itself recursively. This uses the stack for saving everything that has
177 to be saved for a recursive call. On Unix, the stack can be large, and this
178 works fine.
179
180 It turns out that on non-Unix systems there are problems with programs that
181 use a lot of stack. (This despite the fact that every last chip has oodles
182 of memory these days, and techniques for extending the stack have been known
183 for decades.) So....
184
185 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
186 calls by keeping local variables that need to be preserved in blocks of memory
187 obtained from malloc instead of on the stack. Macros are used to
188 achieve this so that the actual code doesn't look very different to what it
189 always used to.
190 ****************************************************************************
191 ***************************************************************************/
192
193
194 /* These versions of the macros use the stack, as normal */
195
196 #ifndef NO_RECURSE
/* Arguments of match() may be held in registers in this build */
#define REGISTER register

/* A "recursion" is a genuine call of match(), whose result lands in rx */
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
  (rx) = match((ra),(rb),(rc),(rd),(re),(rf),(rg))

/* Leaving a level is an ordinary return */
#define RRETURN(ra) return (ra)
200 #else
201
202
203 /* These versions of the macros manage a private stack on the heap. Note
204 that the rd argument of RMATCH isn't actually used. It's the md argument of
205 match(), which never changes. */
206
/* Arguments live in the heap frame in this build, so REGISTER is empty */

#define REGISTER

/* RMATCH stands in for a recursive call of match(). It obtains a new frame,
records in the current frame (with setjmp) where to resume, copies the
"arguments" into the new frame, makes it the current frame, and jumps to
HEAP_RECURSE. When RRETURN later longjmps back here, the current frame is
reloaded from md->thisframe and the result is copied into rx.

If the new frame cannot be obtained, the current level gives up with
PCRE_ERROR_NOMEMORY by way of RRETURN, instead of writing through a NULL
pointer. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }
232
/* RRETURN stands in for a return from match(). The current frame is released
and the one before it becomes current. If there is such a frame, the result is
left in it, md->thisframe is pointed at it, and control longjmps to the place
that its RMATCH recorded. If there is none, this was the top level, and the
function really returns. */

#define RRETURN(ra)\
  {\
  heapframe *doneframe = frame;\
  frame = doneframe->Xprevframe;\
  (pcre_stack_free)(doneframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }
246
247
248 /* Structure for remembering the local variables in a private frame */
249
250 typedef struct heapframe {
251 struct heapframe *Xprevframe;
252
253 /* Function arguments that may change */
254
255 const uschar *Xeptr;
256 const uschar *Xecode;
257 int Xoffset_top;
258 long int Xims;
259 eptrblock *Xeptrb;
260 int Xflags;
261
262 /* Function local variables */
263
264 const uschar *Xcallpat;
265 const uschar *Xcharptr;
266 const uschar *Xdata;
267 const uschar *Xnext;
268 const uschar *Xpp;
269 const uschar *Xprev;
270 const uschar *Xsaved_eptr;
271
272 recursion_info Xnew_recursive;
273
274 BOOL Xcur_is_word;
275 BOOL Xcondition;
276 BOOL Xminimize;
277 BOOL Xprev_is_word;
278
279 unsigned long int Xoriginal_ims;
280
281 #ifdef SUPPORT_UCP
282 int Xprop_type;
283 int Xprop_fail_result;
284 int Xprop_category;
285 int Xprop_chartype;
286 int Xprop_othercase;
287 int Xprop_test_against;
288 int *Xprop_test_variable;
289 #endif
290
291 int Xctype;
292 int Xfc;
293 int Xfi;
294 int Xlength;
295 int Xmax;
296 int Xmin;
297 int Xnumber;
298 int Xoffset;
299 int Xop;
300 int Xsave_capture_last;
301 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
302 int Xstacksave[REC_STACK_SAVE_MAX];
303
304 eptrblock Xnewptrb;
305
306 /* Place to pass back result, and where to jump back to */
307
308 int Xresult;
309 jmp_buf Xwhere;
310
311 } heapframe;
312
313 #endif
314
315
316 /***************************************************************************
317 ***************************************************************************/
318
319
320
321 /*************************************************
322 * Match from current position *
323 *************************************************/
324
325 /* On entry ecode points to the first opcode, and eptr to the first character
326 in the subject string, while eptrb holds the value of eptr at the start of the
327 last bracketed group - used for breaking infinite loops matching zero-length
328 strings. This function is called recursively in many circumstances. Whenever it
329 returns a negative (error) response, the outer incarnation must also return the
330 same response.
331
332 Performance note: It might be tempting to extract commonly used fields from the
333 md structure (e.g. utf8, end_subject) into individual variables to improve
334 performance. Tests using gcc on a SPARC disproved this; in the first case, it
335 made performance worse.
336
337 Arguments:
338 eptr pointer in subject
339 ecode position in code
340 offset_top current top pointer
341 md pointer to "static" info for the match
342 ims current /i, /m, and /s options
343 eptrb pointer to chain of blocks containing eptr at start of
344 brackets - for testing for empty matches
345 flags can contain
346 match_condassert - this is an assertion condition
347 match_isgroup - this is the start of a bracketed group
348
349 Returns: MATCH_MATCH if matched ) these values are >= 0
350 MATCH_NOMATCH if failed to match )
351 a negative PCRE_ERROR_xxx value if aborted by an error condition
352 (e.g. stopped by recursion limit)
353 */
354
355 static int
356 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
357 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
358 int flags)
359 {
360 /* These variables do not need to be preserved over recursion in this function,
361 so they can be ordinary variables in all cases. Mark them with "register"
362 because they are used a lot in loops. */
363
364 register int rrc; /* Returns from recursive calls */
365 register int i; /* Used for loops not involving calls to RMATCH() */
366 register int c; /* Character values not kept over RMATCH() calls */
367 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
368
369 /* When recursion is not being used, all "local" variables that have to be
370 preserved over calls to RMATCH() are part of a "frame" which is obtained from
371 heap storage. Set up the top-level frame here; others are obtained from the
372 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
373
374 #ifdef NO_RECURSE
375 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
376 frame->Xprevframe = NULL; /* Marks the top level */
377
378 /* Copy in the original argument variables */
379
380 frame->Xeptr = eptr;
381 frame->Xecode = ecode;
382 frame->Xoffset_top = offset_top;
383 frame->Xims = ims;
384 frame->Xeptrb = eptrb;
385 frame->Xflags = flags;
386
387 /* This is the point that control jumps back to in order to effect "recursion" */
388
389 HEAP_RECURSE:
390
391 /* Macros make the argument variables come from the current frame */
392
393 #define eptr frame->Xeptr
394 #define ecode frame->Xecode
395 #define offset_top frame->Xoffset_top
396 #define ims frame->Xims
397 #define eptrb frame->Xeptrb
398 #define flags frame->Xflags
399
400 /* Ditto for the local variables */
401
402 #ifdef SUPPORT_UTF8
403 #define charptr frame->Xcharptr
404 #endif
405 #define callpat frame->Xcallpat
406 #define data frame->Xdata
407 #define next frame->Xnext
408 #define pp frame->Xpp
409 #define prev frame->Xprev
410 #define saved_eptr frame->Xsaved_eptr
411
412 #define new_recursive frame->Xnew_recursive
413
414 #define cur_is_word frame->Xcur_is_word
415 #define condition frame->Xcondition
416 #define minimize frame->Xminimize
417 #define prev_is_word frame->Xprev_is_word
418
419 #define original_ims frame->Xoriginal_ims
420
421 #ifdef SUPPORT_UCP
422 #define prop_type frame->Xprop_type
423 #define prop_fail_result frame->Xprop_fail_result
424 #define prop_category frame->Xprop_category
425 #define prop_chartype frame->Xprop_chartype
426 #define prop_othercase frame->Xprop_othercase
427 #define prop_test_against frame->Xprop_test_against
428 #define prop_test_variable frame->Xprop_test_variable
429 #endif
430
431 #define ctype frame->Xctype
432 #define fc frame->Xfc
433 #define fi frame->Xfi
434 #define length frame->Xlength
435 #define max frame->Xmax
436 #define min frame->Xmin
437 #define number frame->Xnumber
438 #define offset frame->Xoffset
439 #define op frame->Xop
440 #define save_capture_last frame->Xsave_capture_last
441 #define save_offset1 frame->Xsave_offset1
442 #define save_offset2 frame->Xsave_offset2
443 #define save_offset3 frame->Xsave_offset3
444 #define stacksave frame->Xstacksave
445
446 #define newptrb frame->Xnewptrb
447
448 /* When recursion is being used, local variables are allocated on the stack and
449 get preserved during recursion in the normal way. In this environment, fi and
450 i, and fc and c, can be the same variables. */
451
452 #else
453 #define fi i
454 #define fc c
455
456
457 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
458 const uschar *charptr; /* small blocks of the code. My normal */
459 #endif /* style of coding would have declared */
460 const uschar *callpat; /* them within each of those blocks. */
461 const uschar *data; /* However, in order to accommodate the */
462 const uschar *next; /* version of this code that uses an */
463 const uschar *pp; /* external "stack" implemented on the */
464 const uschar *prev; /* heap, it is easier to declare them */
465 const uschar *saved_eptr; /* all here, so the declarations can */
466 /* be cut out in a block. The only */
467 recursion_info new_recursive; /* declarations within blocks below are */
468 /* for variables that do not have to */
469 BOOL cur_is_word; /* be preserved over a recursive call */
470 BOOL condition; /* to RMATCH(). */
471 BOOL minimize;
472 BOOL prev_is_word;
473
474 unsigned long int original_ims;
475
476 #ifdef SUPPORT_UCP
477 int prop_type;
478 int prop_fail_result;
479 int prop_category;
480 int prop_chartype;
481 int prop_othercase;
482 int prop_test_against;
483 int *prop_test_variable;
484 #endif
485
486 int ctype;
487 int length;
488 int max;
489 int min;
490 int number;
491 int offset;
492 int op;
493 int save_capture_last;
494 int save_offset1, save_offset2, save_offset3;
495 int stacksave[REC_STACK_SAVE_MAX];
496
497 eptrblock newptrb;
498 #endif
499
500 /* These statements are here to stop the compiler complaining about uninitialized
501 variables. */
502
503 #ifdef SUPPORT_UCP
504 prop_fail_result = 0;
505 prop_test_against = 0;
506 prop_test_variable = NULL;
507 #endif
508
509 /* OK, now we can get on with the real code of the function. Recursion is
510 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
511 these just turn into a recursive call to match() and a "return", respectively.
512 However, RMATCH isn't like a function call because it's quite a complicated
513 macro. It has to be used in one particular way. This shouldn't, however, impact
514 performance when true recursion is being used. */
515
516 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
517
518 original_ims = ims; /* Save for resetting on ')' */
519 utf8 = md->utf8; /* Local copy of the flag */
520
521 /* At the start of a bracketed group, add the current subject pointer to the
522 stack of such pointers, to be re-instated at the end of the group when we hit
523 the closing ket. When match() is called in other circumstances, we don't add to
524 this stack. */
525
526 if ((flags & match_isgroup) != 0)
527 {
528 newptrb.epb_prev = eptrb;
529 newptrb.epb_saved_eptr = eptr;
530 eptrb = &newptrb;
531 }
532
533 /* Now start processing the operations. */
534
535 for (;;)
536 {
537 op = *ecode;
538 minimize = FALSE;
539
540 /* For partial matching, remember if we ever hit the end of the subject after
541 matching at least one subject character. */
542
543 if (md->partial &&
544 eptr >= md->end_subject &&
545 eptr > md->start_match)
546 md->hitend = TRUE;
547
548 /* Opening capturing bracket. If there is space in the offset vector, save
549 the current subject position in the working slot at the top of the vector. We
550 mustn't change the current values of the data slot, because they may be set
551 from a previous iteration of this group, and be referred to by a reference
552 inside the group.
553
554 If the bracket fails to match, we need to restore this value and also the
555 values of the final offsets, in case they were set by a previous iteration of
556 the same bracket.
557
558 If there isn't enough space in the offset vector, treat this as if it were a
559 non-capturing bracket. Don't worry about setting the flag for the error case
560 here; that is handled in the code for KET. */
561
562 if (op > OP_BRA)
563 {
564 number = op - OP_BRA;
565
566 /* For extended extraction brackets (large number), we have to fish out the
567 number from a dummy opcode at the start. */
568
569 if (number > EXTRACT_BASIC_MAX)
570 number = GET2(ecode, 2+LINK_SIZE);
571 offset = number << 1;
572
573 #ifdef DEBUG
574 printf("start bracket %d subject=", number);
575 pchars(eptr, 16, TRUE, md);
576 printf("\n");
577 #endif
578
579 if (offset < md->offset_max)
580 {
581 save_offset1 = md->offset_vector[offset];
582 save_offset2 = md->offset_vector[offset+1];
583 save_offset3 = md->offset_vector[md->offset_end - number];
584 save_capture_last = md->capture_last;
585
586 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
587 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
588
589 do
590 {
591 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
592 match_isgroup);
593 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
594 md->capture_last = save_capture_last;
595 ecode += GET(ecode, 1);
596 }
597 while (*ecode == OP_ALT);
598
599 DPRINTF(("bracket %d failed\n", number));
600
601 md->offset_vector[offset] = save_offset1;
602 md->offset_vector[offset+1] = save_offset2;
603 md->offset_vector[md->offset_end - number] = save_offset3;
604
605 RRETURN(MATCH_NOMATCH);
606 }
607
608 /* Insufficient room for saving captured contents */
609
610 else op = OP_BRA;
611 }
612
613 /* Other types of node can be handled by a switch */
614
615 switch(op)
616 {
617 case OP_BRA: /* Non-capturing bracket: optimized */
618 DPRINTF(("start bracket 0\n"));
619 do
620 {
621 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
622 match_isgroup);
623 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
624 ecode += GET(ecode, 1);
625 }
626 while (*ecode == OP_ALT);
627 DPRINTF(("bracket 0 failed\n"));
628 RRETURN(MATCH_NOMATCH);
629
630 /* Conditional group: compilation checked that there are no more than
631 two branches. If the condition is false, skipping the first branch takes us
632 past the end if there is only one branch, but that's OK because that is
633 exactly what going to the ket would do. */
634
635 case OP_COND:
636 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
637 {
638 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
639 condition = (offset == CREF_RECURSE * 2)?
640 (md->recursive != NULL) :
641 (offset < offset_top && md->offset_vector[offset] >= 0);
642 RMATCH(rrc, eptr, ecode + (condition?
643 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
644 offset_top, md, ims, eptrb, match_isgroup);
645 RRETURN(rrc);
646 }
647
648 /* The condition is an assertion. Call match() to evaluate it - setting
649 the final argument TRUE causes it to stop at the end of an assertion. */
650
651 else
652 {
653 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
654 match_condassert | match_isgroup);
655 if (rrc == MATCH_MATCH)
656 {
657 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
658 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
659 }
660 else if (rrc != MATCH_NOMATCH)
661 {
662 RRETURN(rrc); /* Need braces because of following else */
663 }
664 else ecode += GET(ecode, 1);
665 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
666 match_isgroup);
667 RRETURN(rrc);
668 }
669 /* Control never reaches here */
670
671 /* Skip over conditional reference or large extraction number data if
672 encountered. */
673
674 case OP_CREF:
675 case OP_BRANUMBER:
676 ecode += 3;
677 break;
678
679 /* End of the pattern. If we are in a recursion, we should restore the
680 offsets appropriately and continue from after the call. */
681
682 case OP_END:
683 if (md->recursive != NULL && md->recursive->group_num == 0)
684 {
685 recursion_info *rec = md->recursive;
686 DPRINTF(("Hit the end in a (?0) recursion\n"));
687 md->recursive = rec->prevrec;
688 memmove(md->offset_vector, rec->offset_save,
689 rec->saved_max * sizeof(int));
690 md->start_match = rec->save_start;
691 ims = original_ims;
692 ecode = rec->after_call;
693 break;
694 }
695
696 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
697 string - backtracking will then try other alternatives, if any. */
698
699 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
700 md->end_match_ptr = eptr; /* Record where we ended */
701 md->end_offset_top = offset_top; /* and how many extracts were taken */
702 RRETURN(MATCH_MATCH);
703
704 /* Change option settings */
705
706 case OP_OPT:
707 ims = ecode[1];
708 ecode += 2;
709 DPRINTF(("ims set to %02lx\n", ims));
710 break;
711
712 /* Assertion brackets. Check the alternative branches in turn - the
713 matching won't pass the KET for an assertion. If any one branch matches,
714 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
715 start of each branch to move the current point backwards, so the code at
716 this level is identical to the lookahead case. */
717
718 case OP_ASSERT:
719 case OP_ASSERTBACK:
720 do
721 {
722 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
723 match_isgroup);
724 if (rrc == MATCH_MATCH) break;
725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
726 ecode += GET(ecode, 1);
727 }
728 while (*ecode == OP_ALT);
729 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
730
731 /* If checking an assertion for a condition, return MATCH_MATCH. */
732
733 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
734
735 /* Continue from after the assertion, updating the offsets high water
736 mark, since extracts may have been taken during the assertion. */
737
738 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
739 ecode += 1 + LINK_SIZE;
740 offset_top = md->end_offset_top;
741 continue;
742
743 /* Negative assertion: all branches must fail to match */
744
745 case OP_ASSERT_NOT:
746 case OP_ASSERTBACK_NOT:
747 do
748 {
749 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
750 match_isgroup);
751 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
753 ecode += GET(ecode,1);
754 }
755 while (*ecode == OP_ALT);
756
757 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
758
759 ecode += 1 + LINK_SIZE;
760 continue;
761
762 /* Move the subject pointer back. This occurs only at the start of
763 each branch of a lookbehind assertion. If we are too close to the start to
764 move back, this match function fails. When working with UTF-8 we move
765 back a number of characters, not bytes. */
766
767 case OP_REVERSE:
768 #ifdef SUPPORT_UTF8
769 if (utf8)
770 {
771 c = GET(ecode,1);
772 for (i = 0; i < c; i++)
773 {
774 eptr--;
775 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
776 BACKCHAR(eptr)
777 }
778 }
779 else
780 #endif
781
782 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
783
784 {
785 eptr -= GET(ecode,1);
786 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
787 }
788
789 /* Skip to next op code */
790
791 ecode += 1 + LINK_SIZE;
792 break;
793
794 /* The callout item calls an external function, if one is provided, passing
795 details of the match so far. This is mainly for debugging, though the
796 function is able to force a failure. */
797
798 case OP_CALLOUT:
799 if (pcre_callout != NULL)
800 {
801 pcre_callout_block cb;
802 cb.version = 1; /* Version 1 of the callout block */
803 cb.callout_number = ecode[1];
804 cb.offset_vector = md->offset_vector;
805 cb.subject = (const char *)md->start_subject;
806 cb.subject_length = md->end_subject - md->start_subject;
807 cb.start_match = md->start_match - md->start_subject;
808 cb.current_position = eptr - md->start_subject;
809 cb.pattern_position = GET(ecode, 2);
810 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
811 cb.capture_top = offset_top/2;
812 cb.capture_last = md->capture_last;
813 cb.callout_data = md->callout_data;
814 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
815 if (rrc < 0) RRETURN(rrc);
816 }
817 ecode += 2 + 2*LINK_SIZE;
818 break;
819
820 /* Recursion either matches the current regex, or some subexpression. The
821 offset data is the offset to the starting bracket from the start of the
822 whole pattern. (This is so that it works from duplicated subpatterns.)
823
824 If there are any capturing brackets started but not finished, we have to
825 save their starting points and reinstate them after the recursion. However,
826 we don't know how many such there are (offset_top records the completed
827 total) so we just have to save all the potential data. There may be up to
828 65535 such values, which is too large to put on the stack, but using malloc
829 for small numbers seems expensive. As a compromise, the stack is used when
830 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
831 is used. A problem is what to do if the malloc fails ... there is no way of
832 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
833 values on the stack, and accept that the rest may be wrong.
834
835 There are also other values that have to be saved. We use a chained
836 sequence of blocks that actually live on the stack. Thanks to Robin Houston
837 for the original version of this logic. */
838
839 case OP_RECURSE:
840 {
841 callpat = md->start_code + GET(ecode, 1);
842 new_recursive.group_num = *callpat - OP_BRA;
843
844 /* For extended extraction brackets (large number), we have to fish out
845 the number from a dummy opcode at the start. */
846
847 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
848 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
849
850 /* Add to "recursing stack" */
851
852 new_recursive.prevrec = md->recursive;
853 md->recursive = &new_recursive;
854
855 /* Find where to continue from afterwards */
856
857 ecode += 1 + LINK_SIZE;
858 new_recursive.after_call = ecode;
859
860 /* Now save the offset data. */
861
862 new_recursive.saved_max = md->offset_end;
863 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
864 new_recursive.offset_save = stacksave;
865 else
866 {
867 new_recursive.offset_save =
868 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
869 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
870 }
871
872 memcpy(new_recursive.offset_save, md->offset_vector,
873 new_recursive.saved_max * sizeof(int));
874 new_recursive.save_start = md->start_match;
875 md->start_match = eptr;
876
877 /* OK, now we can do the recursion. For each top-level alternative we
878 restore the offset and recursion data. */
879
880 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
881 do
882 {
883 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
884 eptrb, match_isgroup);
885 if (rrc == MATCH_MATCH)
886 {
887 md->recursive = new_recursive.prevrec;
888 if (new_recursive.offset_save != stacksave)
889 (pcre_free)(new_recursive.offset_save);
890 RRETURN(MATCH_MATCH);
891 }
892 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
893
894 md->recursive = &new_recursive;
895 memcpy(md->offset_vector, new_recursive.offset_save,
896 new_recursive.saved_max * sizeof(int));
897 callpat += GET(callpat, 1);
898 }
899 while (*callpat == OP_ALT);
900
901 DPRINTF(("Recursion didn't match\n"));
902 md->recursive = new_recursive.prevrec;
903 if (new_recursive.offset_save != stacksave)
904 (pcre_free)(new_recursive.offset_save);
905 RRETURN(MATCH_NOMATCH);
906 }
907 /* Control never reaches here */
908
909 /* "Once" brackets are like assertion brackets except that after a match,
910 the point in the subject string is not moved back. Thus there can never be
911 a move back into the brackets. Friedl calls these "atomic" subpatterns.
912 Check the alternative branches in turn - the matching won't pass the KET
913 for this kind of subpattern. If any one branch matches, we carry on as at
914 the end of a normal bracket, leaving the subject pointer. */
915
916 case OP_ONCE:
917 {
918 prev = ecode;
919 saved_eptr = eptr;
920
921 do
922 {
923 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
924 eptrb, match_isgroup);
925 if (rrc == MATCH_MATCH) break;
926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
927 ecode += GET(ecode,1);
928 }
929 while (*ecode == OP_ALT);
930
931 /* If hit the end of the group (which could be repeated), fail */
932
933 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
934
935 /* Continue as from after the assertion, updating the offsets high water
936 mark, since extracts may have been taken. */
937
938 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
939
940 offset_top = md->end_offset_top;
941 eptr = md->end_match_ptr;
942
943 /* For a non-repeating ket, just continue at this level. This also
944 happens for a repeating ket if no characters were matched in the group.
945 This is the forcible breaking of infinite loops as implemented in Perl
946 5.005. If there is an options reset, it will get obeyed in the normal
947 course of events. */
948
949 if (*ecode == OP_KET || eptr == saved_eptr)
950 {
951 ecode += 1+LINK_SIZE;
952 break;
953 }
954
955 /* The repeating kets try the rest of the pattern or restart from the
956 preceding bracket, in the appropriate order. We need to reset any options
957 that changed within the bracket before re-running it, so check the next
958 opcode. */
959
960 if (ecode[1+LINK_SIZE] == OP_OPT)
961 {
962 ims = (ims & ~PCRE_IMS) | ecode[4];
963 DPRINTF(("ims set to %02lx at group repeat\n", ims));
964 }
965
966 if (*ecode == OP_KETRMIN)
967 {
968 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
970 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
972 }
973 else /* OP_KETRMAX */
974 {
975 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
976 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
977 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
979 }
980 }
981 RRETURN(MATCH_NOMATCH);
982
983 /* An alternation is the end of a branch; scan along to find the end of the
984 bracketed group and go to there. */
985
986 case OP_ALT:
987 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
988 break;
989
990 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
991 that it may occur zero times. It may repeat infinitely, or not at all -
992 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
993 repeat limits are compiled as a number of copies, with the optional ones
994 preceded by BRAZERO or BRAMINZERO. */
995
996 case OP_BRAZERO:
997 {
998 next = ecode+1;
999 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
1000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1001 do next += GET(next,1); while (*next == OP_ALT);
1002 ecode = next + 1+LINK_SIZE;
1003 }
1004 break;
1005
1006 case OP_BRAMINZERO:
1007 {
1008 next = ecode+1;
1009 do next += GET(next,1); while (*next == OP_ALT);
1010 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1011 match_isgroup);
1012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1013 ecode++;
1014 }
1015 break;
1016
1017 /* End of a group, repeated or non-repeating. If we are at the end of
1018 an assertion "group", stop matching and return MATCH_MATCH, but record the
1019 current high water mark for use by positive assertions. Do this also
1020 for the "once" (non-backing-up, i.e. atomic) groups. */
1021
1022 case OP_KET:
1023 case OP_KETRMIN:
1024 case OP_KETRMAX:
1025 {
1026 prev = ecode - GET(ecode, 1);
1027 saved_eptr = eptrb->epb_saved_eptr;
1028
1029 /* Back up the stack of bracket start pointers. */
1030
1031 eptrb = eptrb->epb_prev;
1032
1033 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1034 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1035 *prev == OP_ONCE)
1036 {
1037 md->end_match_ptr = eptr; /* For ONCE */
1038 md->end_offset_top = offset_top;
1039 RRETURN(MATCH_MATCH);
1040 }
1041
1042 /* In all other cases except a conditional group we have to check the
1043 group number back at the start and if necessary complete handling an
1044 extraction by setting the offsets and bumping the high water mark. */
1045
1046 if (*prev != OP_COND)
1047 {
1048 number = *prev - OP_BRA;
1049
1050 /* For extended extraction brackets (large number), we have to fish out
1051 the number from a dummy opcode at the start. */
1052
1053 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1054 offset = number << 1;
1055
1056 #ifdef DEBUG
1057 printf("end bracket %d", number);
1058 printf("\n");
1059 #endif
1060
1061 /* Test for a numbered group. This includes groups called as a result
1062 of recursion. Note that whole-pattern recursion is coded as a recurse
1063 into group 0, so it won't be picked up here. Instead, we catch it when
1064 the OP_END is reached. */
1065
1066 if (number > 0)
1067 {
1068 md->capture_last = number;
1069 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1070 {
1071 md->offset_vector[offset] =
1072 md->offset_vector[md->offset_end - number];
1073 md->offset_vector[offset+1] = eptr - md->start_subject;
1074 if (offset_top <= offset) offset_top = offset + 2;
1075 }
1076
1077 /* Handle a recursively called group. Restore the offsets
1078 appropriately and continue from after the call. */
1079
1080 if (md->recursive != NULL && md->recursive->group_num == number)
1081 {
1082 recursion_info *rec = md->recursive;
1083 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1084 md->recursive = rec->prevrec;
1085 md->start_match = rec->save_start;
1086 memcpy(md->offset_vector, rec->offset_save,
1087 rec->saved_max * sizeof(int));
1088 ecode = rec->after_call;
1089 ims = original_ims;
1090 break;
1091 }
1092 }
1093 }
1094
1095 /* Reset the value of the ims flags, in case they got changed during
1096 the group. */
1097
1098 ims = original_ims;
1099 DPRINTF(("ims reset to %02lx\n", ims));
1100
1101 /* For a non-repeating ket, just continue at this level. This also
1102 happens for a repeating ket if no characters were matched in the group.
1103 This is the forcible breaking of infinite loops as implemented in Perl
1104 5.005. If there is an options reset, it will get obeyed in the normal
1105 course of events. */
1106
1107 if (*ecode == OP_KET || eptr == saved_eptr)
1108 {
1109 ecode += 1 + LINK_SIZE;
1110 break;
1111 }
1112
1113 /* The repeating kets try the rest of the pattern or restart from the
1114 preceding bracket, in the appropriate order. */
1115
1116 if (*ecode == OP_KETRMIN)
1117 {
1118 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1119 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1122 }
1123 else /* OP_KETRMAX */
1124 {
1125 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1127 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1128 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1129 }
1130 }
1131
1132 RRETURN(MATCH_NOMATCH);
1133
1134 /* Start of subject unless notbol, or after internal newline if multiline */
1135
1136 case OP_CIRC:
1137 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1138 if ((ims & PCRE_MULTILINE) != 0)
1139 {
1140 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
1141 RRETURN(MATCH_NOMATCH);
1142 ecode++;
1143 break;
1144 }
1145 /* ... else fall through */
1146
1147 /* Start of subject assertion */
1148
1149 case OP_SOD:
1150 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1151 ecode++;
1152 break;
1153
1154 /* Start of match assertion */
1155
1156 case OP_SOM:
1157 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1158 ecode++;
1159 break;
1160
1161 /* Assert before internal newline if multiline, or before a terminating
1162 newline unless endonly is set, else end of subject unless noteol is set. */
1163
1164 case OP_DOLL:
1165 if ((ims & PCRE_MULTILINE) != 0)
1166 {
1167 if (eptr < md->end_subject)
1168 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
1169 else
1170 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1171 ecode++;
1172 break;
1173 }
1174 else
1175 {
1176 if (md->noteol) RRETURN(MATCH_NOMATCH);
1177 if (!md->endonly)
1178 {
1179 if (eptr < md->end_subject - 1 ||
1180 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
1181 RRETURN(MATCH_NOMATCH);
1182 ecode++;
1183 break;
1184 }
1185 }
1186 /* ... else fall through */
1187
1188 /* End of subject assertion (\z) */
1189
1190 case OP_EOD:
1191 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1192 ecode++;
1193 break;
1194
1195 /* End of subject or ending \n assertion (\Z) */
1196
1197 case OP_EODN:
1198 if (eptr < md->end_subject - 1 ||
1199 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
1200 ecode++;
1201 break;
1202
1203 /* Word boundary assertions */
1204
1205 case OP_NOT_WORD_BOUNDARY:
1206 case OP_WORD_BOUNDARY:
1207 {
1208
1209 /* Find out if the previous and current characters are "word" characters.
1210 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1211 be "non-word" characters. */
1212
1213 #ifdef SUPPORT_UTF8
1214 if (utf8)
1215 {
1216 if (eptr == md->start_subject) prev_is_word = FALSE; else
1217 {
1218 const uschar *lastptr = eptr - 1;
1219 while((*lastptr & 0xc0) == 0x80) lastptr--;
1220 GETCHAR(c, lastptr);
1221 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1222 }
1223 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1224 {
1225 GETCHAR(c, eptr);
1226 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1227 }
1228 }
1229 else
1230 #endif
1231
1232 /* More streamlined when not in UTF-8 mode */
1233
1234 {
1235 prev_is_word = (eptr != md->start_subject) &&
1236 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1237 cur_is_word = (eptr < md->end_subject) &&
1238 ((md->ctypes[*eptr] & ctype_word) != 0);
1239 }
1240
1241 /* Now see if the situation is what we want */
1242
1243 if ((*ecode++ == OP_WORD_BOUNDARY)?
1244 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1245 RRETURN(MATCH_NOMATCH);
1246 }
1247 break;
1248
1249 /* Match a single character type; inline for speed */
1250
1251 case OP_ANY:
1252 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
1253 RRETURN(MATCH_NOMATCH);
1254 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1255 #ifdef SUPPORT_UTF8
1256 if (utf8)
1257 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1258 #endif
1259 ecode++;
1260 break;
1261
1262 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1263 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1264
1265 case OP_ANYBYTE:
1266 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1267 ecode++;
1268 break;
1269
1270 case OP_NOT_DIGIT:
1271 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1272 GETCHARINCTEST(c, eptr);
1273 if (
1274 #ifdef SUPPORT_UTF8
1275 c < 256 &&
1276 #endif
1277 (md->ctypes[c] & ctype_digit) != 0
1278 )
1279 RRETURN(MATCH_NOMATCH);
1280 ecode++;
1281 break;
1282
1283 case OP_DIGIT:
1284 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1285 GETCHARINCTEST(c, eptr);
1286 if (
1287 #ifdef SUPPORT_UTF8
1288 c >= 256 ||
1289 #endif
1290 (md->ctypes[c] & ctype_digit) == 0
1291 )
1292 RRETURN(MATCH_NOMATCH);
1293 ecode++;
1294 break;
1295
1296 case OP_NOT_WHITESPACE:
1297 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1298 GETCHARINCTEST(c, eptr);
1299 if (
1300 #ifdef SUPPORT_UTF8
1301 c < 256 &&
1302 #endif
1303 (md->ctypes[c] & ctype_space) != 0
1304 )
1305 RRETURN(MATCH_NOMATCH);
1306 ecode++;
1307 break;
1308
1309 case OP_WHITESPACE:
1310 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1311 GETCHARINCTEST(c, eptr);
1312 if (
1313 #ifdef SUPPORT_UTF8
1314 c >= 256 ||
1315 #endif
1316 (md->ctypes[c] & ctype_space) == 0
1317 )
1318 RRETURN(MATCH_NOMATCH);
1319 ecode++;
1320 break;
1321
1322 case OP_NOT_WORDCHAR:
1323 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1324 GETCHARINCTEST(c, eptr);
1325 if (
1326 #ifdef SUPPORT_UTF8
1327 c < 256 &&
1328 #endif
1329 (md->ctypes[c] & ctype_word) != 0
1330 )
1331 RRETURN(MATCH_NOMATCH);
1332 ecode++;
1333 break;
1334
1335 case OP_WORDCHAR:
1336 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1337 GETCHARINCTEST(c, eptr);
1338 if (
1339 #ifdef SUPPORT_UTF8
1340 c >= 256 ||
1341 #endif
1342 (md->ctypes[c] & ctype_word) == 0
1343 )
1344 RRETURN(MATCH_NOMATCH);
1345 ecode++;
1346 break;
1347
1348 #ifdef SUPPORT_UCP
1349 /* Check the next character by Unicode property. We will get here only
1350 if the support is in the binary; otherwise a compile-time error occurs. */
1351
1352 case OP_PROP:
1353 case OP_NOTPROP:
1354 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1355 GETCHARINCTEST(c, eptr);
1356 {
1357 int chartype, rqdtype;
1358 int othercase;
1359 int category = ucp_findchar(c, &chartype, &othercase);
1360
1361 rqdtype = *(++ecode);
1362 ecode++;
1363
1364 if (rqdtype >= 128)
1365 {
1366 if ((rqdtype - 128 != category) == (op == OP_PROP))
1367 RRETURN(MATCH_NOMATCH);
1368 }
1369 else
1370 {
1371 if ((rqdtype != chartype) == (op == OP_PROP))
1372 RRETURN(MATCH_NOMATCH);
1373 }
1374 }
1375 break;
1376
1377 /* Match an extended Unicode sequence. We will get here only if the support
1378 is in the binary; otherwise a compile-time error occurs. */
1379
1380 case OP_EXTUNI:
1381 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1382 GETCHARINCTEST(c, eptr);
1383 {
1384 int chartype;
1385 int othercase;
1386 int category = ucp_findchar(c, &chartype, &othercase);
1387 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1388 while (eptr < md->end_subject)
1389 {
1390 int len = 1;
1391 if (!utf8) c = *eptr; else
1392 {
1393 GETCHARLEN(c, eptr, len);
1394 }
1395 category = ucp_findchar(c, &chartype, &othercase);
1396 if (category != ucp_M) break;
1397 eptr += len;
1398 }
1399 }
1400 ecode++;
1401 break;
1402 #endif
1403
1404
1405 /* Match a back reference, possibly repeatedly. Look past the end of the
1406 item to see if there is repeat information following. The code is similar
1407 to that for character classes, but repeated for efficiency. Then obey
1408 similar code to character type repeats - written out again for speed.
1409 However, if the referenced string is the empty string, always treat
1410 it as matched, any number of times (otherwise there could be infinite
1411 loops). */
1412
1413 case OP_REF:
1414 {
1415 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1416 ecode += 3; /* Advance past item */
1417
1418 /* If the reference is unset, set the length to be longer than the amount
1419 of subject left; this ensures that every attempt at a match fails. We
1420 can't just fail here, because of the possibility of quantifiers with zero
1421 minima. */
1422
1423 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1424 md->end_subject - eptr + 1 :
1425 md->offset_vector[offset+1] - md->offset_vector[offset];
1426
1427 /* Set up for repetition, or handle the non-repeated case */
1428
1429 switch (*ecode)
1430 {
1431 case OP_CRSTAR:
1432 case OP_CRMINSTAR:
1433 case OP_CRPLUS:
1434 case OP_CRMINPLUS:
1435 case OP_CRQUERY:
1436 case OP_CRMINQUERY:
1437 c = *ecode++ - OP_CRSTAR;
1438 minimize = (c & 1) != 0;
1439 min = rep_min[c]; /* Pick up values from tables; */
1440 max = rep_max[c]; /* zero for max => infinity */
1441 if (max == 0) max = INT_MAX;
1442 break;
1443
1444 case OP_CRRANGE:
1445 case OP_CRMINRANGE:
1446 minimize = (*ecode == OP_CRMINRANGE);
1447 min = GET2(ecode, 1);
1448 max = GET2(ecode, 3);
1449 if (max == 0) max = INT_MAX;
1450 ecode += 5;
1451 break;
1452
1453 default: /* No repeat follows */
1454 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1455 eptr += length;
1456 continue; /* With the main loop */
1457 }
1458
1459 /* If the length of the reference is zero, just continue with the
1460 main loop. */
1461
1462 if (length == 0) continue;
1463
1464 /* First, ensure the minimum number of matches are present. We get back
1465 the length of the reference string explicitly rather than passing the
1466 address of eptr, so that eptr can be a register variable. */
1467
1468 for (i = 1; i <= min; i++)
1469 {
1470 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1471 eptr += length;
1472 }
1473
1474 /* If min = max, continue at the same level without recursion.
1475 They are not both allowed to be zero. */
1476
1477 if (min == max) continue;
1478
1479 /* If minimizing, keep trying and advancing the pointer */
1480
1481 if (minimize)
1482 {
1483 for (fi = min;; fi++)
1484 {
1485 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1486 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1487 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1488 RRETURN(MATCH_NOMATCH);
1489 eptr += length;
1490 }
1491 /* Control never gets here */
1492 }
1493
1494 /* If maximizing, find the longest string and work backwards */
1495
1496 else
1497 {
1498 pp = eptr;
1499 for (i = min; i < max; i++)
1500 {
1501 if (!match_ref(offset, eptr, length, md, ims)) break;
1502 eptr += length;
1503 }
1504 while (eptr >= pp)
1505 {
1506 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1507 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1508 eptr -= length;
1509 }
1510 RRETURN(MATCH_NOMATCH);
1511 }
1512 }
1513 /* Control never gets here */
1514
1515
1516
1517 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1518 used when all the characters in the class have values in the range 0-255,
1519 and either the matching is caseful, or the characters are in the range
1520 0-127 when UTF-8 processing is enabled. The only difference between
1521 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1522 encountered.
1523
1524 First, look past the end of the item to see if there is repeat information
1525 following. Then obey similar code to character type repeats - written out
1526 again for speed. */
1527
1528 case OP_NCLASS:
1529 case OP_CLASS:
1530 {
1531 data = ecode + 1; /* Save for matching */
1532 ecode += 33; /* Advance past the item */
1533
1534 switch (*ecode)
1535 {
1536 case OP_CRSTAR:
1537 case OP_CRMINSTAR:
1538 case OP_CRPLUS:
1539 case OP_CRMINPLUS:
1540 case OP_CRQUERY:
1541 case OP_CRMINQUERY:
1542 c = *ecode++ - OP_CRSTAR;
1543 minimize = (c & 1) != 0;
1544 min = rep_min[c]; /* Pick up values from tables; */
1545 max = rep_max[c]; /* zero for max => infinity */
1546 if (max == 0) max = INT_MAX;
1547 break;
1548
1549 case OP_CRRANGE:
1550 case OP_CRMINRANGE:
1551 minimize = (*ecode == OP_CRMINRANGE);
1552 min = GET2(ecode, 1);
1553 max = GET2(ecode, 3);
1554 if (max == 0) max = INT_MAX;
1555 ecode += 5;
1556 break;
1557
1558 default: /* No repeat follows */
1559 min = max = 1;
1560 break;
1561 }
1562
1563 /* First, ensure the minimum number of matches are present. */
1564
1565 #ifdef SUPPORT_UTF8
1566 /* UTF-8 mode */
1567 if (utf8)
1568 {
1569 for (i = 1; i <= min; i++)
1570 {
1571 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1572 GETCHARINC(c, eptr);
1573 if (c > 255)
1574 {
1575 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1576 }
1577 else
1578 {
1579 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1580 }
1581 }
1582 }
1583 else
1584 #endif
1585 /* Not UTF-8 mode */
1586 {
1587 for (i = 1; i <= min; i++)
1588 {
1589 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1590 c = *eptr++;
1591 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1592 }
1593 }
1594
1595 /* If max == min we can continue with the main loop without the
1596 need to recurse. */
1597
1598 if (min == max) continue;
1599
1600 /* If minimizing, keep testing the rest of the expression and advancing
1601 the pointer while it matches the class. */
1602
1603 if (minimize)
1604 {
1605 #ifdef SUPPORT_UTF8
1606 /* UTF-8 mode */
1607 if (utf8)
1608 {
1609 for (fi = min;; fi++)
1610 {
1611 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1613 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1614 GETCHARINC(c, eptr);
1615 if (c > 255)
1616 {
1617 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1618 }
1619 else
1620 {
1621 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1622 }
1623 }
1624 }
1625 else
1626 #endif
1627 /* Not UTF-8 mode */
1628 {
1629 for (fi = min;; fi++)
1630 {
1631 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1632 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1633 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1634 c = *eptr++;
1635 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1636 }
1637 }
1638 /* Control never gets here */
1639 }
1640
1641 /* If maximizing, find the longest possible run, then work backwards. */
1642
1643 else
1644 {
1645 pp = eptr;
1646
1647 #ifdef SUPPORT_UTF8
1648 /* UTF-8 mode */
1649 if (utf8)
1650 {
1651 for (i = min; i < max; i++)
1652 {
1653 int len = 1;
1654 if (eptr >= md->end_subject) break;
1655 GETCHARLEN(c, eptr, len);
1656 if (c > 255)
1657 {
1658 if (op == OP_CLASS) break;
1659 }
1660 else
1661 {
1662 if ((data[c/8] & (1 << (c&7))) == 0) break;
1663 }
1664 eptr += len;
1665 }
1666 for (;;)
1667 {
1668 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1670 if (eptr-- == pp) break; /* Stop if tried at original pos */
1671 BACKCHAR(eptr);
1672 }
1673 }
1674 else
1675 #endif
1676 /* Not UTF-8 mode */
1677 {
1678 for (i = min; i < max; i++)
1679 {
1680 if (eptr >= md->end_subject) break;
1681 c = *eptr;
1682 if ((data[c/8] & (1 << (c&7))) == 0) break;
1683 eptr++;
1684 }
1685 while (eptr >= pp)
1686 {
1687 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1688 eptr--;
1689 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1690 }
1691 }
1692
1693 RRETURN(MATCH_NOMATCH);
1694 }
1695 }
1696 /* Control never gets here */
1697
1698
1699 /* Match an extended character class. This opcode is encountered only
1700 in UTF-8 mode, because that's the only time it is compiled. */
1701
1702 #ifdef SUPPORT_UTF8
1703 case OP_XCLASS:
1704 {
1705 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1706 ecode += GET(ecode, 1); /* Advance past the item */
1707
1708 switch (*ecode)
1709 {
1710 case OP_CRSTAR:
1711 case OP_CRMINSTAR:
1712 case OP_CRPLUS:
1713 case OP_CRMINPLUS:
1714 case OP_CRQUERY:
1715 case OP_CRMINQUERY:
1716 c = *ecode++ - OP_CRSTAR;
1717 minimize = (c & 1) != 0;
1718 min = rep_min[c]; /* Pick up values from tables; */
1719 max = rep_max[c]; /* zero for max => infinity */
1720 if (max == 0) max = INT_MAX;
1721 break;
1722
1723 case OP_CRRANGE:
1724 case OP_CRMINRANGE:
1725 minimize = (*ecode == OP_CRMINRANGE);
1726 min = GET2(ecode, 1);
1727 max = GET2(ecode, 3);
1728 if (max == 0) max = INT_MAX;
1729 ecode += 5;
1730 break;
1731
1732 default: /* No repeat follows */
1733 min = max = 1;
1734 break;
1735 }
1736
1737 /* First, ensure the minimum number of matches are present. */
1738
1739 for (i = 1; i <= min; i++)
1740 {
1741 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1742 GETCHARINC(c, eptr);
1743 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1744 }
1745
1746 /* If max == min we can continue with the main loop without the
1747 need to recurse. */
1748
1749 if (min == max) continue;
1750
1751 /* If minimizing, keep testing the rest of the expression and advancing
1752 the pointer while it matches the class. */
1753
1754 if (minimize)
1755 {
1756 for (fi = min;; fi++)
1757 {
1758 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1760 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1761 GETCHARINC(c, eptr);
1762 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1763 }
1764 /* Control never gets here */
1765 }
1766
1767 /* If maximizing, find the longest possible run, then work backwards. */
1768
1769 else
1770 {
1771 pp = eptr;
1772 for (i = min; i < max; i++)
1773 {
1774 int len = 1;
1775 if (eptr >= md->end_subject) break;
1776 GETCHARLEN(c, eptr, len);
1777 if (!_pcre_xclass(c, data)) break;
1778 eptr += len;
1779 }
1780 for(;;)
1781 {
1782 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1784 if (eptr-- == pp) break; /* Stop if tried at original pos */
1785 BACKCHAR(eptr)
1786 }
1787 RRETURN(MATCH_NOMATCH);
1788 }
1789
1790 /* Control never gets here */
1791 }
1792 #endif /* End of XCLASS */
1793
1794 /* Match a single character, casefully */
1795
1796 case OP_CHAR:
1797 #ifdef SUPPORT_UTF8
1798 if (utf8)
1799 {
1800 length = 1;
1801 ecode++;
1802 GETCHARLEN(fc, ecode, length);
1803 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1804 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1805 }
1806 else
1807 #endif
1808
1809 /* Non-UTF-8 mode */
1810 {
1811 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1812 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1813 ecode += 2;
1814 }
1815 break;
1816
1817 /* Match a single character, caselessly */
1818
1819 case OP_CHARNC:
1820 #ifdef SUPPORT_UTF8
1821 if (utf8)
1822 {
1823 length = 1;
1824 ecode++;
1825 GETCHARLEN(fc, ecode, length);
1826
1827 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1828
1829 /* If the pattern character's value is < 128, we have only one byte, and
1830 can use the fast lookup table. */
1831
1832 if (fc < 128)
1833 {
1834 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1835 }
1836
1837 /* Otherwise we must pick up the subject character */
1838
1839 else
1840 {
1841 int dc;
1842 GETCHARINC(dc, eptr);
1843 ecode += length;
1844
1845 /* If we have Unicode property support, we can use it to test the other
1846 case of the character, if there is one. The result of ucp_findchar() is
1847 < 0 if the char isn't found, and othercase is returned as zero if there
1848 isn't one. */
1849
1850 if (fc != dc)
1851 {
1852 #ifdef SUPPORT_UCP
1853 int chartype;
1854 int othercase;
1855 if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
1856 #endif
1857 RRETURN(MATCH_NOMATCH);
1858 }
1859 }
1860 }
1861 else
1862 #endif /* SUPPORT_UTF8 */
1863
1864 /* Non-UTF-8 mode */
1865 {
1866 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1867 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1868 ecode += 2;
1869 }
1870 break;
1871
1872 /* Match a single character repeatedly; different opcodes share code. */
1873
1874 case OP_EXACT:
1875 min = max = GET2(ecode, 1);
1876 ecode += 3;
1877 goto REPEATCHAR;
1878
1879 case OP_UPTO:
1880 case OP_MINUPTO:
1881 min = 0;
1882 max = GET2(ecode, 1);
1883 minimize = *ecode == OP_MINUPTO;
1884 ecode += 3;
1885 goto REPEATCHAR;
1886
1887 case OP_STAR:
1888 case OP_MINSTAR:
1889 case OP_PLUS:
1890 case OP_MINPLUS:
1891 case OP_QUERY:
1892 case OP_MINQUERY:
1893 c = *ecode++ - OP_STAR;
1894 minimize = (c & 1) != 0;
1895 min = rep_min[c]; /* Pick up values from tables; */
1896 max = rep_max[c]; /* zero for max => infinity */
1897 if (max == 0) max = INT_MAX;
1898
1899 /* Common code for all repeated single-character matches. We can give
1900 up quickly if there are fewer than the minimum number of characters left in
1901 the subject. */
1902
1903 REPEATCHAR:
1904 #ifdef SUPPORT_UTF8
1905 if (utf8)
1906 {
1907 length = 1;
1908 charptr = ecode;
1909 GETCHARLEN(fc, ecode, length);
1910 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1911 ecode += length;
1912
1913 /* Handle multibyte character matching specially here. There is
1914 support for caseless matching if UCP support is present. */
1915
1916 if (length > 1)
1917 {
1918 int oclength = 0;
1919 uschar occhars[8];
1920
1921 #ifdef SUPPORT_UCP
1922 int othercase;
1923 int chartype;
1924 if ((ims & PCRE_CASELESS) != 0 &&
1925 ucp_findchar(fc, &chartype, &othercase) >= 0 &&
1926 othercase > 0)
1927 oclength = _pcre_ord2utf8(othercase, occhars);
1928 #endif /* SUPPORT_UCP */
1929
1930 for (i = 1; i <= min; i++)
1931 {
1932 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1933 /* Need braces because of following else */
1934 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1935 else
1936 {
1937 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1938 eptr += oclength;
1939 }
1940 }
1941
1942 if (min == max) continue;
1943
1944 if (minimize)
1945 {
1946 for (fi = min;; fi++)
1947 {
1948 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1950 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1951 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1952 /* Need braces because of following else */
1953 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1954 else
1955 {
1956 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1957 eptr += oclength;
1958 }
1959 }
1960 /* Control never gets here */
1961 }
1962 else
1963 {
1964 pp = eptr;
1965 for (i = min; i < max; i++)
1966 {
1967 if (eptr > md->end_subject - length) break;
1968 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1969 else if (oclength == 0) break;
1970 else
1971 {
1972 if (memcmp(eptr, occhars, oclength) != 0) break;
1973 eptr += oclength;
1974 }
1975 }
1976 while (eptr >= pp)
1977 {
1978 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1979 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1980 eptr -= length;
1981 }
1982 RRETURN(MATCH_NOMATCH);
1983 }
1984 /* Control never gets here */
1985 }
1986
1987 /* If the length of a UTF-8 character is 1, we fall through here, and
1988 obey the code as for non-UTF-8 characters below, though in this case the
1989 value of fc will always be < 128. */
1990 }
1991 else
1992 #endif /* SUPPORT_UTF8 */
1993
1994 /* When not in UTF-8 mode, load a single-byte character. */
1995 {
1996 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1997 fc = *ecode++;
1998 }
1999
2000 /* The value of fc at this point is always less than 256, though we may or
2001 may not be in UTF-8 mode. The code is duplicated for the caseless and
2002 caseful cases, for speed, since matching characters is likely to be quite
2003 common. First, ensure the minimum number of matches are present. If min =
2004 max, continue at the same level without recursing. Otherwise, if
2005 minimizing, keep trying the rest of the expression and advancing one
2006 matching character if failing, up to the maximum. Alternatively, if
2007 maximizing, find the maximum number of characters and work backwards. */
2008
2009 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2010 max, eptr));
2011
2012 if ((ims & PCRE_CASELESS) != 0)
2013 {
2014 fc = md->lcc[fc];
2015 for (i = 1; i <= min; i++)
2016 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2017 if (min == max) continue;
2018 if (minimize)
2019 {
2020 for (fi = min;; fi++)
2021 {
2022 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2024 if (fi >= max || eptr >= md->end_subject ||
2025 fc != md->lcc[*eptr++])
2026 RRETURN(MATCH_NOMATCH);
2027 }
2028 /* Control never gets here */
2029 }
2030 else
2031 {
2032 pp = eptr;
2033 for (i = min; i < max; i++)
2034 {
2035 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2036 eptr++;
2037 }
2038 while (eptr >= pp)
2039 {
2040 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2041 eptr--;
2042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2043 }
2044 RRETURN(MATCH_NOMATCH);
2045 }
2046 /* Control never gets here */
2047 }
2048
2049 /* Caseful comparisons (includes all multi-byte characters) */
2050
2051 else
2052 {
2053 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2054 if (min == max) continue;
2055 if (minimize)
2056 {
2057 for (fi = min;; fi++)
2058 {
2059 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2060 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2061 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2062 RRETURN(MATCH_NOMATCH);
2063 }
2064 /* Control never gets here */
2065 }
2066 else
2067 {
2068 pp = eptr;
2069 for (i = min; i < max; i++)
2070 {
2071 if (eptr >= md->end_subject || fc != *eptr) break;
2072 eptr++;
2073 }
2074 while (eptr >= pp)
2075 {
2076 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2077 eptr--;
2078 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2079 }
2080 RRETURN(MATCH_NOMATCH);
2081 }
2082 }
2083 /* Control never gets here */
2084
2085 /* Match a negated single one-byte character. The character we are
2086 checking can be multibyte. */
2087
2088 case OP_NOT:
2089 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2090 ecode++;
2091 GETCHARINCTEST(c, eptr);
2092 if ((ims & PCRE_CASELESS) != 0)
2093 {
2094 #ifdef SUPPORT_UTF8
2095 if (c < 256)
2096 #endif
2097 c = md->lcc[c];
2098 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2099 }
2100 else
2101 {
2102 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2103 }
2104 break;
2105
2106 /* Match a negated single one-byte character repeatedly. This is almost a
2107 repeat of the code for a repeated single character, but I haven't found a
2108 nice way of commoning these up that doesn't require a test of the
2109 positive/negative option for each character match. Maybe that wouldn't add
2110 very much to the time taken, but character matching *is* what this is all
2111 about... */
2112
2113 case OP_NOTEXACT:
2114 min = max = GET2(ecode, 1);
2115 ecode += 3;
2116 goto REPEATNOTCHAR;
2117
2118 case OP_NOTUPTO:
2119 case OP_NOTMINUPTO:
2120 min = 0;
2121 max = GET2(ecode, 1);
2122 minimize = *ecode == OP_NOTMINUPTO;
2123 ecode += 3;
2124 goto REPEATNOTCHAR;
2125
2126 case OP_NOTSTAR:
2127 case OP_NOTMINSTAR:
2128 case OP_NOTPLUS:
2129 case OP_NOTMINPLUS:
2130 case OP_NOTQUERY:
2131 case OP_NOTMINQUERY:
2132 c = *ecode++ - OP_NOTSTAR;
2133 minimize = (c & 1) != 0;
2134 min = rep_min[c]; /* Pick up values from tables; */
2135 max = rep_max[c]; /* zero for max => infinity */
2136 if (max == 0) max = INT_MAX;
2137
2138 /* Common code for all repeated single-byte matches. We can give up quickly
2139 if there are fewer than the minimum number of bytes left in the
2140 subject. */
2141
2142 REPEATNOTCHAR:
2143 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2144 fc = *ecode++;
2145
2146 /* The code is duplicated for the caseless and caseful cases, for speed,
2147 since matching characters is likely to be quite common. First, ensure the
2148 minimum number of matches are present. If min = max, continue at the same
2149 level without recursing. Otherwise, if minimizing, keep trying the rest of
2150 the expression and advancing one matching character if failing, up to the
2151 maximum. Alternatively, if maximizing, find the maximum number of
2152 characters and work backwards. */
2153
2154 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2155 max, eptr));
2156
2157 if ((ims & PCRE_CASELESS) != 0)
2158 {
2159 fc = md->lcc[fc];
2160
2161 #ifdef SUPPORT_UTF8
2162 /* UTF-8 mode */
2163 if (utf8)
2164 {
2165 register int d;
2166 for (i = 1; i <= min; i++)
2167 {
2168 GETCHARINC(d, eptr);
2169 if (d < 256) d = md->lcc[d];
2170 if (fc == d) RRETURN(MATCH_NOMATCH);
2171 }
2172 }
2173 else
2174 #endif
2175
2176 /* Not UTF-8 mode */
2177 {
2178 for (i = 1; i <= min; i++)
2179 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2180 }
2181
2182 if (min == max) continue;
2183
2184 if (minimize)
2185 {
2186 #ifdef SUPPORT_UTF8
2187 /* UTF-8 mode */
2188 if (utf8)
2189 {
2190 register int d;
2191 for (fi = min;; fi++)
2192 {
2193 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2195 GETCHARINC(d, eptr);
2196 if (d < 256) d = md->lcc[d];
2197 if (fi >= max || eptr >= md->end_subject || fc == d)
2198 RRETURN(MATCH_NOMATCH);
2199 }
2200 }
2201 else
2202 #endif
2203 /* Not UTF-8 mode */
2204 {
2205 for (fi = min;; fi++)
2206 {
2207 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2208 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2209 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2210 RRETURN(MATCH_NOMATCH);
2211 }
2212 }
2213 /* Control never gets here */
2214 }
2215
2216 /* Maximize case */
2217
2218 else
2219 {
2220 pp = eptr;
2221
2222 #ifdef SUPPORT_UTF8
2223 /* UTF-8 mode */
2224 if (utf8)
2225 {
2226 register int d;
2227 for (i = min; i < max; i++)
2228 {
2229 int len = 1;
2230 if (eptr >= md->end_subject) break;
2231 GETCHARLEN(d, eptr, len);
2232 if (d < 256) d = md->lcc[d];
2233 if (fc == d) break;
2234 eptr += len;
2235 }
2236 for(;;)
2237 {
2238 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2239 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2240 if (eptr-- == pp) break; /* Stop if tried at original pos */
2241 BACKCHAR(eptr);
2242 }
2243 }
2244 else
2245 #endif
2246 /* Not UTF-8 mode */
2247 {
2248 for (i = min; i < max; i++)
2249 {
2250 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2251 eptr++;
2252 }
2253 while (eptr >= pp)
2254 {
2255 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2256 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2257 eptr--;
2258 }
2259 }
2260
2261 RRETURN(MATCH_NOMATCH);
2262 }
2263 /* Control never gets here */
2264 }
2265
2266 /* Caseful comparisons */
2267
2268 else
2269 {
2270 #ifdef SUPPORT_UTF8
2271 /* UTF-8 mode */
2272 if (utf8)
2273 {
2274 register int d;
2275 for (i = 1; i <= min; i++)
2276 {
2277 GETCHARINC(d, eptr);
2278 if (fc == d) RRETURN(MATCH_NOMATCH);
2279 }
2280 }
2281 else
2282 #endif
2283 /* Not UTF-8 mode */
2284 {
2285 for (i = 1; i <= min; i++)
2286 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2287 }
2288
2289 if (min == max) continue;
2290
2291 if (minimize)
2292 {
2293 #ifdef SUPPORT_UTF8
2294 /* UTF-8 mode */
2295 if (utf8)
2296 {
2297 register int d;
2298 for (fi = min;; fi++)
2299 {
2300 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2301 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2302 GETCHARINC(d, eptr);
2303 if (fi >= max || eptr >= md->end_subject || fc == d)
2304 RRETURN(MATCH_NOMATCH);
2305 }
2306 }
2307 else
2308 #endif
2309 /* Not UTF-8 mode */
2310 {
2311 for (fi = min;; fi++)
2312 {
2313 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2314 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2315 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2316 RRETURN(MATCH_NOMATCH);
2317 }
2318 }
2319 /* Control never gets here */
2320 }
2321
2322 /* Maximize case */
2323
2324 else
2325 {
2326 pp = eptr;
2327
2328 #ifdef SUPPORT_UTF8
2329 /* UTF-8 mode */
2330 if (utf8)
2331 {
2332 register int d;
2333 for (i = min; i < max; i++)
2334 {
2335 int len = 1;
2336 if (eptr >= md->end_subject) break;
2337 GETCHARLEN(d, eptr, len);
2338 if (fc == d) break;
2339 eptr += len;
2340 }
2341 for(;;)
2342 {
2343 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2344 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2345 if (eptr-- == pp) break; /* Stop if tried at original pos */
2346 BACKCHAR(eptr);
2347 }
2348 }
2349 else
2350 #endif
2351 /* Not UTF-8 mode */
2352 {
2353 for (i = min; i < max; i++)
2354 {
2355 if (eptr >= md->end_subject || fc == *eptr) break;
2356 eptr++;
2357 }
2358 while (eptr >= pp)
2359 {
2360 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2361 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362 eptr--;
2363 }
2364 }
2365
2366 RRETURN(MATCH_NOMATCH);
2367 }
2368 }
2369 /* Control never gets here */
2370
2371 /* Match a single character type repeatedly; several different opcodes
2372 share code. This is very similar to the code for single characters, but we
2373 repeat it in the interests of efficiency. */
2374
2375 case OP_TYPEEXACT:
2376 min = max = GET2(ecode, 1);
2377 minimize = TRUE;
2378 ecode += 3;
2379 goto REPEATTYPE;
2380
2381 case OP_TYPEUPTO:
2382 case OP_TYPEMINUPTO:
2383 min = 0;
2384 max = GET2(ecode, 1);
2385 minimize = *ecode == OP_TYPEMINUPTO;
2386 ecode += 3;
2387 goto REPEATTYPE;
2388
2389 case OP_TYPESTAR:
2390 case OP_TYPEMINSTAR:
2391 case OP_TYPEPLUS:
2392 case OP_TYPEMINPLUS:
2393 case OP_TYPEQUERY:
2394 case OP_TYPEMINQUERY:
2395 c = *ecode++ - OP_TYPESTAR;
2396 minimize = (c & 1) != 0;
2397 min = rep_min[c]; /* Pick up values from tables; */
2398 max = rep_max[c]; /* zero for max => infinity */
2399 if (max == 0) max = INT_MAX;
2400
2401 /* Common code for all repeated single character type matches. Note that
2402 in UTF-8 mode, '.' matches a character of any length, but for the other
2403 character types, the valid characters are all one-byte long. */
2404
2405 REPEATTYPE:
2406 ctype = *ecode++; /* Code for the character type */
2407
2408 #ifdef SUPPORT_UCP
2409 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2410 {
2411 prop_fail_result = ctype == OP_NOTPROP;
2412 prop_type = *ecode++;
2413 if (prop_type >= 128)
2414 {
2415 prop_test_against = prop_type - 128;
2416 prop_test_variable = &prop_category;
2417 }
2418 else
2419 {
2420 prop_test_against = prop_type;
2421 prop_test_variable = &prop_chartype;
2422 }
2423 }
2424 else prop_type = -1;
2425 #endif
2426
2427 /* First, ensure the minimum number of matches are present. Use inline
2428 code for maximizing the speed, and do the type test once at the start
2429 (i.e. keep it out of the loop). Also we can test that there are at least
2430 the minimum number of bytes before we start. This isn't as effective in
2431 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2432 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2433 and single-bytes. */
2434
2435 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2436 if (min > 0)
2437 {
2438 #ifdef SUPPORT_UCP
2439 if (prop_type > 0)
2440 {
2441 for (i = 1; i <= min; i++)
2442 {
2443 GETCHARINC(c, eptr);
2444 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2445 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2446 RRETURN(MATCH_NOMATCH);
2447 }
2448 }
2449
2450 /* Match extended Unicode sequences. We will get here only if the
2451 support is in the binary; otherwise a compile-time error occurs. */
2452
2453 else if (ctype == OP_EXTUNI)
2454 {
2455 for (i = 1; i <= min; i++)
2456 {
2457 GETCHARINCTEST(c, eptr);
2458 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2459 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2460 while (eptr < md->end_subject)
2461 {
2462 int len = 1;
2463 if (!utf8) c = *eptr; else
2464 {
2465 GETCHARLEN(c, eptr, len);
2466 }
2467 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2468 if (prop_category != ucp_M) break;
2469 eptr += len;
2470 }
2471 }
2472 }
2473
2474 else
2475 #endif /* SUPPORT_UCP */
2476
2477 /* Handle all other cases when the coding is UTF-8 */
2478
2479 #ifdef SUPPORT_UTF8
2480 if (utf8) switch(ctype)
2481 {
2482 case OP_ANY:
2483 for (i = 1; i <= min; i++)
2484 {
2485 if (eptr >= md->end_subject ||
2486 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
2487 RRETURN(MATCH_NOMATCH);
2488 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2489 }
2490 break;
2491
2492 case OP_ANYBYTE:
2493 eptr += min;
2494 break;
2495
2496 case OP_NOT_DIGIT:
2497 for (i = 1; i <= min; i++)
2498 {
2499 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2500 GETCHARINC(c, eptr);
2501 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2502 RRETURN(MATCH_NOMATCH);
2503 }
2504 break;
2505
2506 case OP_DIGIT:
2507 for (i = 1; i <= min; i++)
2508 {
2509 if (eptr >= md->end_subject ||
2510 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2511 RRETURN(MATCH_NOMATCH);
2512 /* No need to skip more bytes - we know it's a 1-byte character */
2513 }
2514 break;
2515
2516 case OP_NOT_WHITESPACE:
2517 for (i = 1; i <= min; i++)
2518 {
2519 if (eptr >= md->end_subject ||
2520 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2521 RRETURN(MATCH_NOMATCH);
2522 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2523 }
2524 break;
2525
2526 case OP_WHITESPACE:
2527 for (i = 1; i <= min; i++)
2528 {
2529 if (eptr >= md->end_subject ||
2530 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2531 RRETURN(MATCH_NOMATCH);
2532 /* No need to skip more bytes - we know it's a 1-byte character */
2533 }
2534 break;
2535
2536 case OP_NOT_WORDCHAR:
2537 for (i = 1; i <= min; i++)
2538 {
2539 if (eptr >= md->end_subject ||
2540 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2541 RRETURN(MATCH_NOMATCH);
2542 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2543 }
2544 break;
2545
2546 case OP_WORDCHAR:
2547 for (i = 1; i <= min; i++)
2548 {
2549 if (eptr >= md->end_subject ||
2550 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2551 RRETURN(MATCH_NOMATCH);
2552 /* No need to skip more bytes - we know it's a 1-byte character */
2553 }
2554 break;
2555
2556 default:
2557 RRETURN(PCRE_ERROR_INTERNAL);
2558 } /* End switch(ctype) */
2559
2560 else
2561 #endif /* SUPPORT_UTF8 */
2562
2563 /* Code for the non-UTF-8 case for minimum matching of operators other
2564 than OP_PROP and OP_NOTPROP. */
2565
2566 switch(ctype)
2567 {
2568 case OP_ANY:
2569 if ((ims & PCRE_DOTALL) == 0)
2570 {
2571 for (i = 1; i <= min; i++)
2572 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
2573 }
2574 else eptr += min;
2575 break;
2576
2577 case OP_ANYBYTE:
2578 eptr += min;
2579 break;
2580
2581 case OP_NOT_DIGIT:
2582 for (i = 1; i <= min; i++)
2583 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2584 break;
2585
2586 case OP_DIGIT:
2587 for (i = 1; i <= min; i++)
2588 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2589 break;
2590
2591 case OP_NOT_WHITESPACE:
2592 for (i = 1; i <= min; i++)
2593 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2594 break;
2595
2596 case OP_WHITESPACE:
2597 for (i = 1; i <= min; i++)
2598 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2599 break;
2600
2601 case OP_NOT_WORDCHAR:
2602 for (i = 1; i <= min; i++)
2603 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2604 RRETURN(MATCH_NOMATCH);
2605 break;
2606
2607 case OP_WORDCHAR:
2608 for (i = 1; i <= min; i++)
2609 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2610 RRETURN(MATCH_NOMATCH);
2611 break;
2612
2613 default:
2614 RRETURN(PCRE_ERROR_INTERNAL);
2615 }
2616 }
2617
2618 /* If min = max, continue at the same level without recursing */
2619
2620 if (min == max) continue;
2621
2622 /* If minimizing, we have to test the rest of the pattern before each
2623 subsequent match. Again, separate the UTF-8 case for speed, and also
2624 separate the UCP cases. */
2625
2626 if (minimize)
2627 {
2628 #ifdef SUPPORT_UCP
2629 if (prop_type > 0)
2630 {
2631 for (fi = min;; fi++)
2632 {
2633 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2634 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2635 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2636 GETCHARINC(c, eptr);
2637 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2638 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2639 RRETURN(MATCH_NOMATCH);
2640 }
2641 }
2642
2643 /* Match extended Unicode sequences. We will get here only if the
2644 support is in the binary; otherwise a compile-time error occurs. */
2645
2646 else if (ctype == OP_EXTUNI)
2647 {
2648 for (fi = min;; fi++)
2649 {
2650 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2652 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2653 GETCHARINCTEST(c, eptr);
2654 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2655 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2656 while (eptr < md->end_subject)
2657 {
2658 int len = 1;
2659 if (!utf8) c = *eptr; else
2660 {
2661 GETCHARLEN(c, eptr, len);
2662 }
2663 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2664 if (prop_category != ucp_M) break;
2665 eptr += len;
2666 }
2667 }
2668 }
2669
2670 else
2671 #endif /* SUPPORT_UCP */
2672
2673 #ifdef SUPPORT_UTF8
2674 /* UTF-8 mode */
2675 if (utf8)
2676 {
2677 for (fi = min;; fi++)
2678 {
2679 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2680 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2681 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2682
2683 GETCHARINC(c, eptr);
2684 switch(ctype)
2685 {
2686 case OP_ANY:
2687 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2688 break;
2689
2690 case OP_ANYBYTE:
2691 break;
2692
2693 case OP_NOT_DIGIT:
2694 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2695 RRETURN(MATCH_NOMATCH);
2696 break;
2697
2698 case OP_DIGIT:
2699 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2700 RRETURN(MATCH_NOMATCH);
2701 break;
2702
2703 case OP_NOT_WHITESPACE:
2704 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2705 RRETURN(MATCH_NOMATCH);
2706 break;
2707
2708 case OP_WHITESPACE:
2709 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2710 RRETURN(MATCH_NOMATCH);
2711 break;
2712
2713 case OP_NOT_WORDCHAR:
2714 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2715 RRETURN(MATCH_NOMATCH);
2716 break;
2717
2718 case OP_WORDCHAR:
2719 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2720 RRETURN(MATCH_NOMATCH);
2721 break;
2722
2723 default:
2724 RRETURN(PCRE_ERROR_INTERNAL);
2725 }
2726 }
2727 }
2728 else
2729 #endif
2730 /* Not UTF-8 mode */
2731 {
2732 for (fi = min;; fi++)
2733 {
2734 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2735 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2736 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2737 c = *eptr++;
2738 switch(ctype)
2739 {
2740 case OP_ANY:
2741 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2742 break;
2743
2744 case OP_ANYBYTE:
2745 break;
2746
2747 case OP_NOT_DIGIT:
2748 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2749 break;
2750
2751 case OP_DIGIT:
2752 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2753 break;
2754
2755 case OP_NOT_WHITESPACE:
2756 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2757 break;
2758
2759 case OP_WHITESPACE:
2760 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2761 break;
2762
2763 case OP_NOT_WORDCHAR:
2764 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2765 break;
2766
2767 case OP_WORDCHAR:
2768 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2769 break;
2770
2771 default:
2772 RRETURN(PCRE_ERROR_INTERNAL);
2773 }
2774 }
2775 }
2776 /* Control never gets here */
2777 }
2778
2779 /* If maximizing it is worth using inline code for speed, doing the type
2780 test once at the start (i.e. keep it out of the loop). Again, keep the
2781 UTF-8 and UCP stuff separate. */
2782
2783 else
2784 {
2785 pp = eptr; /* Remember where we started */
2786
2787 #ifdef SUPPORT_UCP
2788 if (prop_type > 0)
2789 {
2790 for (i = min; i < max; i++)
2791 {
2792 int len = 1;
2793 if (eptr >= md->end_subject) break;
2794 GETCHARLEN(c, eptr, len);
2795 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2796 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2797 break;
2798 eptr+= len;
2799 }
2800
2801 /* eptr is now past the end of the maximum run */
2802
2803 for(;;)
2804 {
2805 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2807 if (eptr-- == pp) break; /* Stop if tried at original pos */
2808 BACKCHAR(eptr);
2809 }
2810 }
2811
2812 /* Match extended Unicode sequences. We will get here only if the
2813 support is in the binary; otherwise a compile-time error occurs. */
2814
2815 else if (ctype == OP_EXTUNI)
2816 {
2817 for (i = min; i < max; i++)
2818 {
2819 if (eptr >= md->end_subject) break;
2820 GETCHARINCTEST(c, eptr);
2821 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2822 if (prop_category == ucp_M) break;
2823 while (eptr < md->end_subject)
2824 {
2825 int len = 1;
2826 if (!utf8) c = *eptr; else
2827 {
2828 GETCHARLEN(c, eptr, len);
2829 }
2830 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2831 if (prop_category != ucp_M) break;
2832 eptr += len;
2833 }
2834 }
2835
2836 /* eptr is now past the end of the maximum run */
2837
2838 for(;;)
2839 {
2840 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2842 if (eptr-- == pp) break; /* Stop if tried at original pos */
2843 for (;;) /* Move back over one extended */
2844 {
2845 int len = 1;
2846 BACKCHAR(eptr);
2847 if (!utf8) c = *eptr; else
2848 {
2849 GETCHARLEN(c, eptr, len);
2850 }
2851 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2852 if (prop_category != ucp_M) break;
2853 eptr--;
2854 }
2855 }
2856 }
2857
2858 else
2859 #endif /* SUPPORT_UCP */
2860
2861 #ifdef SUPPORT_UTF8
2862 /* UTF-8 mode */
2863
2864 if (utf8)
2865 {
2866 switch(ctype)
2867 {
2868 case OP_ANY:
2869
2870 /* Special code is required for UTF8, but when the maximum is unlimited
2871 we don't need it, so we repeat the non-UTF8 code. This is probably
2872 worth it, because .* is quite a common idiom. */
2873
2874 if (max < INT_MAX)
2875 {
2876 if ((ims & PCRE_DOTALL) == 0)
2877 {
2878 for (i = min; i < max; i++)
2879 {
2880 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2881 eptr++;
2882 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2883 }
2884 }
2885 else
2886 {
2887 for (i = min; i < max; i++)
2888 {
2889 eptr++;
2890 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2891 }
2892 }
2893 }
2894
2895 /* Handle unlimited UTF-8 repeat */
2896
2897 else
2898 {
2899 if ((ims & PCRE_DOTALL) == 0)
2900 {
2901 for (i = min; i < max; i++)
2902 {
2903 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2904 eptr++;
2905 }
2906 break;
2907 }
2908 else
2909 {
2910 c = max - min;
2911 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2912 eptr += c;
2913 }
2914 }
2915 break;
2916
2917 /* The byte case is the same as non-UTF8 */
2918
2919 case OP_ANYBYTE:
2920 c = max - min;
2921 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2922 eptr += c;
2923 break;
2924
2925 case OP_NOT_DIGIT:
2926 for (i = min; i < max; i++)
2927 {
2928 int len = 1;
2929 if (eptr >= md->end_subject) break;
2930 GETCHARLEN(c, eptr, len);
2931 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
2932 eptr+= len;
2933 }
2934 break;
2935
2936 case OP_DIGIT:
2937 for (i = min; i < max; i++)
2938 {
2939 int len = 1;
2940 if (eptr >= md->end_subject) break;
2941 GETCHARLEN(c, eptr, len);
2942 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
2943 eptr+= len;
2944 }
2945 break;
2946
2947 case OP_NOT_WHITESPACE:
2948 for (i = min; i < max; i++)
2949 {
2950 int len = 1;
2951 if (eptr >= md->end_subject) break;
2952 GETCHARLEN(c, eptr, len);
2953 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
2954 eptr+= len;
2955 }
2956 break;
2957
2958 case OP_WHITESPACE:
2959 for (i = min; i < max; i++)
2960 {
2961 int len = 1;
2962 if (eptr >= md->end_subject) break;
2963 GETCHARLEN(c, eptr, len);
2964 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
2965 eptr+= len;
2966 }
2967 break;
2968
2969 case OP_NOT_WORDCHAR:
2970 for (i = min; i < max; i++)
2971 {
2972 int len = 1;
2973 if (eptr >= md->end_subject) break;
2974 GETCHARLEN(c, eptr, len);
2975 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
2976 eptr+= len;
2977 }
2978 break;
2979
2980 case OP_WORDCHAR:
2981 for (i = min; i < max; i++)
2982 {
2983 int len = 1;
2984 if (eptr >= md->end_subject) break;
2985 GETCHARLEN(c, eptr, len);
2986 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
2987 eptr+= len;
2988 }
2989 break;
2990
2991 default:
2992 RRETURN(PCRE_ERROR_INTERNAL);
2993 }
2994
2995 /* eptr is now past the end of the maximum run */
2996
2997 for(;;)
2998 {
2999 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3001 if (eptr-- == pp) break; /* Stop if tried at original pos */
3002 BACKCHAR(eptr);
3003 }
3004 }
3005 else
3006 #endif
3007
3008 /* Not UTF-8 mode */
3009 {
3010 switch(ctype)
3011 {
3012 case OP_ANY:
3013 if ((ims & PCRE_DOTALL) == 0)
3014 {
3015 for (i = min; i < max; i++)
3016 {
3017 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3018 eptr++;
3019 }
3020 break;
3021 }
3022 /* For DOTALL case, fall through and treat as \C */
3023
3024 case OP_ANYBYTE:
3025 c = max - min;
3026 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3027 eptr += c;
3028 break;
3029
3030 case OP_NOT_DIGIT:
3031 for (i = min; i < max; i++)
3032 {
3033 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3034 break;
3035 eptr++;
3036 }
3037 break;
3038
3039 case OP_DIGIT:
3040 for (i = min; i < max; i++)
3041 {
3042 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3043 break;
3044 eptr++;
3045 }
3046 break;
3047
3048 case OP_NOT_WHITESPACE:
3049 for (i = min; i < max; i++)
3050 {
3051 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3052 break;
3053 eptr++;
3054 }
3055 break;
3056
3057 case OP_WHITESPACE:
3058 for (i = min; i < max; i++)
3059 {
3060 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3061 break;
3062 eptr++;
3063 }
3064 break;
3065
3066 case OP_NOT_WORDCHAR:
3067 for (i = min; i < max; i++)
3068 {
3069 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3070 break;
3071 eptr++;
3072 }
3073 break;
3074
3075 case OP_WORDCHAR:
3076 for (i = min; i < max; i++)
3077 {
3078 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3079 break;
3080 eptr++;
3081 }
3082 break;
3083
3084 default:
3085 RRETURN(PCRE_ERROR_INTERNAL);
3086 }
3087
3088 /* eptr is now past the end of the maximum run */
3089
3090 while (eptr >= pp)
3091 {
3092 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3093 eptr--;
3094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3095 }
3096 }
3097
3098 /* Get here if we can't make it match with any permitted repetitions */
3099
3100 RRETURN(MATCH_NOMATCH);
3101 }
3102 /* Control never gets here */
3103
3104 /* There's been some horrible disaster. Since all codes > OP_BRA are
3105 for capturing brackets, and there shouldn't be any gaps between 0 and
3106 OP_BRA, arrival here can only mean there is something seriously wrong
3107 in the code above or the OP_xxx definitions. */
3108
3109 default:
3110 DPRINTF(("Unknown opcode %d\n", *ecode));
3111 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3112 }
3113
3114 /* Do not stick any code in here without much thought; it is assumed
3115 that "continue" in the code above comes out to here to repeat the main
3116 loop. */
3117
3118 } /* End of main loop */
3119 /* Control never reaches here */
3120 }
3121
3122
3123 /***************************************************************************
3124 ****************************************************************************
3125 RECURSION IN THE match() FUNCTION
3126
3127 Undefine all the macros that were defined above to handle this. */
3128
/* Clean-up of the macros that stand in for match()'s variables. The names
below are only macros when NO_RECURSE is defined, so they are removed under
the same condition. The list is kept in alphabetical order; the order of
#undef directives has no effect on the result. */

#ifdef NO_RECURSE
#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef length
#undef max
#undef min
#undef minimize
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef original_ims
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave
#endif  /* NO_RECURSE */

/* fc and fi are macros whether or not NO_RECURSE is set, so they are always
undefined here. */

#undef fi
#undef fc
3175
3176 /***************************************************************************
3177 ***************************************************************************/
3178
3179
3180
3181 /*************************************************
3182 * Execute a Regular Expression *
3183 *************************************************/
3184
3185 /* This function applies a compiled re to a subject string and picks out
3186 portions of the string if it matches. Two elements in the vector are set for
3187 each substring: the offsets to the start and end of the substring.
3188
3189 Arguments:
3190 argument_re points to the compiled expression
3191 extra_data points to extra data or is NULL
3192 subject points to the subject string
3193 length length of subject string (may contain binary zeros)
3194 start_offset where to start in the subject string
3195 options option bits
3196 offsets points to a vector of ints to be filled in with offsets
3197 offsetcount the number of elements in the vector
3198
3199 Returns: > 0 => success; value is the number of elements filled in
3200 = 0 => success, but offsets is not big enough
3201 -1 => failed to match
3202 < -1 => some kind of unexpected problem
3203 */
3204
3205 EXPORT int
3206 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3207 const char *subject, int length, int start_offset, int options, int *offsets,
3208 int offsetcount)
3209 {
3210 int rc, resetcount, ocount;
3211 int first_byte = -1;
3212 int req_byte = -1;
3213 int req_byte2 = -1;
3214 unsigned long int ims = 0;
3215 BOOL using_temporary_offsets = FALSE;
3216 BOOL anchored;
3217 BOOL startline;
3218 BOOL firstline;
3219 BOOL first_byte_caseless = FALSE;
3220 BOOL req_byte_caseless = FALSE;
3221 match_data match_block;
3222 const uschar *tables;
3223 const uschar *start_bits = NULL;
3224 const uschar *start_match = (const uschar *)subject + start_offset;
3225 const uschar *end_subject;
3226 const uschar *req_byte_ptr = start_match - 1;
3227
3228 pcre_study_data internal_study;
3229 const pcre_study_data *study;
3230
3231 real_pcre internal_re;
3232 const real_pcre *external_re = (const real_pcre *)argument_re;
3233 const real_pcre *re = external_re;
3234
3235 /* Plausibility checks */
3236
3237 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3238 if (re == NULL || subject == NULL ||
3239 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3240 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3241
3242 /* Fish out the optional data from the extra_data structure, first setting
3243 the default values. */
3244
3245 study = NULL;
3246 match_block.match_limit = MATCH_LIMIT;
3247 match_block.callout_data = NULL;
3248
3249 /* The table pointer is always in native byte order. */
3250
3251 tables = external_re->tables;
3252
3253 if (extra_data != NULL)
3254 {
3255 register unsigned int flags = extra_data->flags;
3256 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3257 study = (const pcre_study_data *)extra_data->study_data;
3258 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3259 match_block.match_limit = extra_data->match_limit;
3260 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3261 match_block.callout_data = extra_data->callout_data;
3262 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3263 }
3264
3265 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3266 is a feature that makes it possible to save compiled regex and re-use them
3267 in other programs later. */
3268
3269 if (tables == NULL) tables = _pcre_default_tables;
3270
3271 /* Check that the first field in the block is the magic number. If it is not,
3272 test for a regex that was compiled on a host of opposite endianness. If this is
3273 the case, flipped values are put in internal_re and internal_study if there was
3274 study data too. */
3275
3276 if (re->magic_number != MAGIC_NUMBER)
3277 {
3278 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3279 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3280 if (study != NULL) study = &internal_study;
3281 }
3282
3283 /* Set up other data */
3284
3285 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3286 startline = (re->options & PCRE_STARTLINE) != 0;
3287 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3288
3289 /* The code starts after the real_pcre block and the capture name table. */
3290
3291 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
3292 re->name_count * re->name_entry_size;
3293
3294 match_block.start_subject = (const uschar *)subject;
3295 match_block.start_offset = start_offset;
3296 match_block.end_subject = match_block.start_subject + length;
3297 end_subject = match_block.end_subject;
3298
3299 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3300 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
3301
3302 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3303 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3304 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
3305 match_block.partial = (options & PCRE_PARTIAL) != 0;
3306 match_block.hitend = FALSE;
3307
3308 match_block.recursive = NULL; /* No recursion at top level */
3309
3310 match_block.lcc = tables + lcc_offset;
3311 match_block.ctypes = tables + ctypes_offset;
3312
3313 /* Partial matching is supported only for a restricted set of regexes at the
3314 moment. */
3315
3316 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
3317 return PCRE_ERROR_BADPARTIAL;
3318
3319 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3320 back the character offset. */
3321
3322 #ifdef SUPPORT_UTF8
3323 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3324 {
3325 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3326 return PCRE_ERROR_BADUTF8;
3327 if (start_offset > 0 && start_offset < length)
3328 {
3329 int tb = ((uschar *)subject)[start_offset];
3330 if (tb > 127)
3331 {
3332 tb &= 0xc0;
3333 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3334 }
3335 }
3336 }
3337 #endif
3338
3339 /* The ims options can vary during the matching as a result of the presence
3340 of (?ims) items in the pattern. They are kept in a local variable so that
3341 restoring at the exit of a group is easy. */
3342
3343 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3344
3345 /* If the expression has got more back references than the offsets supplied can
3346 hold, we get a temporary chunk of working store to use during the matching.
3347 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3348 of 3. */
3349
3350 ocount = offsetcount - (offsetcount % 3);
3351
3352 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3353 {
3354 ocount = re->top_backref * 3 + 3;
3355 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3356 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3357 using_temporary_offsets = TRUE;
3358 DPRINTF(("Got memory to hold back references\n"));
3359 }
3360 else match_block.offset_vector = offsets;
3361
3362 match_block.offset_end = ocount;
3363 match_block.offset_max = (2*ocount)/3;
3364 match_block.offset_overflow = FALSE;
3365 match_block.capture_last = -1;
3366
3367 /* Compute the minimum number of offsets that we need to reset each time. Doing
3368 this makes a huge difference to execution time when there aren't many brackets
3369 in the pattern. */
3370
3371 resetcount = 2 + re->top_bracket * 2;
3372 if (resetcount > offsetcount) resetcount = ocount;
3373
3374 /* Reset the working variable associated with each extraction. These should
3375 never be used unless previously set, but they get saved and restored, and so we
3376 initialize them to avoid reading uninitialized locations. */
3377
3378 if (match_block.offset_vector != NULL)
3379 {
3380 register int *iptr = match_block.offset_vector + ocount;
3381 register int *iend = iptr - resetcount/2 + 1;
3382 while (--iptr >= iend) *iptr = -1;
3383 }
3384
3385 /* Set up the first character to match, if available. The first_byte value is
3386 never set for an anchored regular expression, but the anchoring may be forced
3387 at run time, so we have to test for anchoring. The first char may be unset for
3388 an unanchored pattern, of course. If there's no first char and the pattern was
3389 studied, there may be a bitmap of possible first characters. */
3390
3391 if (!anchored)
3392 {
3393 if ((re->options & PCRE_FIRSTSET) != 0)
3394 {
3395 first_byte = re->first_byte & 255;
3396 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3397 first_byte = match_block.lcc[first_byte];
3398 }
3399 else
3400 if (!startline && study != NULL &&
3401 (study->options & PCRE_STUDY_MAPPED) != 0)
3402 start_bits = study->start_bits;
3403 }
3404
3405 /* For anchored or unanchored matches, there may be a "last known required
3406 character" set. */
3407
3408 if ((re->options & PCRE_REQCHSET) != 0)
3409 {
3410 req_byte = re->req_byte & 255;
3411 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3412 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3413 }
3414
3415 /* Loop for handling unanchored repeated matching attempts; for anchored
3416 regexes the loop runs just once. */
3417
3418 do
3419 {
3420 const uschar *save_end_subject = end_subject;
3421
3422 /* Reset the maximum number of extractions we might see. */
3423
3424 if (match_block.offset_vector != NULL)
3425 {
3426 register int *iptr = match_block.offset_vector;
3427 register int *iend = iptr + resetcount;
3428 while (iptr < iend) *iptr++ = -1;
3429 }
3430
3431 /* Advance to a unique first char if possible. If firstline is TRUE, the
3432 start of the match is constrained to the first line of a multiline string.
3433 Implement this by temporarily adjusting end_subject so that we stop scanning
3434 at a newline. If the match fails at the newline, later code breaks this loop.
3435 */
3436
3437 if (firstline)
3438 {
3439 const uschar *t = start_match;
3440 while (t < save_end_subject && *t != '\n') t++;
3441 end_subject = t;
3442 }
3443
3444 /* Now test for a unique first byte */
3445
3446 if (first_byte >= 0)
3447 {
3448 if (first_byte_caseless)
3449 while (start_match < end_subject &&
3450 match_block.lcc[*start_match] != first_byte)
3451 start_match++;
3452 else
3453 while (start_match < end_subject && *start_match != first_byte)
3454 start_match++;
3455 }
3456
3457 /* Or to just after \n for a multiline match if possible */
3458
3459 else if (startline)
3460 {
3461 if (start_match > match_block.start_subject + start_offset)
3462 {
3463 while (start_match < end_subject && start_match[-1] != NEWLINE)
3464 start_match++;
3465 }
3466 }
3467
3468 /* Or to a non-unique first char after study */
3469
3470 else if (start_bits != NULL)
3471 {
3472 while (start_match < end_subject)
3473 {
3474 register unsigned int c = *start_match;
3475 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3476 }
3477 }
3478
3479 /* Restore fudged end_subject */
3480
3481 end_subject = save_end_subject;
3482
3483 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3484 printf(">>>> Match against: ");
3485 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3486 printf("\n");
3487 #endif
3488
3489 /* If req_byte is set, we know that that character must appear in the subject
3490 for the match to succeed. If the first character is set, req_byte must be
3491 later in the subject; otherwise the test starts at the match point. This
3492 optimization can save a huge amount of backtracking in patterns with nested
3493 unlimited repeats that aren't going to match. Writing separate code for
3494 cased/caseless versions makes it go faster, as does using an autoincrement
3495 and backing off on a match.
3496
3497 HOWEVER: when the subject string is very, very long, searching to its end can
3498 take a long time, and give bad performance on quite ordinary patterns. This
3499 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3500 don't do this when the string is sufficiently long.
3501
3502 ALSO: this processing is disabled when partial matching is requested.
3503 */
3504
3505 if (req_byte >= 0 &&
3506 end_subject - start_match < REQ_BYTE_MAX &&
3507 !match_block.partial)
3508 {
3509 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
3510
3511 /* We don't need to repeat the search if we haven't yet reached the
3512 place we found it at last time. */
3513
3514 if (p > req_byte_ptr)
3515 {
3516 if (req_byte_caseless)
3517 {
3518 while (p < end_subject)
3519 {
3520 register int pp = *p++;
3521 if (pp == req_byte || pp == req_byte2) { p--; break; }
3522 }
3523 }
3524 else
3525 {
3526 while (p < end_subject)
3527 {
3528 if (*p++ == req_byte) { p--; break; }
3529 }
3530 }
3531
3532 /* If we can't find the required character, break the matching loop */
3533
3534 if (p >= end_subject) break;
3535
3536 /* If we have found the required character, save the point where we
3537 found it, so that we don't search again next time round the loop if
3538 the start hasn't passed this character yet. */
3539
3540 req_byte_ptr = p;
3541 }
3542 }
3543
3544 /* When a match occurs, substrings will be set for all internal extractions;
3545 we just need to set up the whole thing as substring 0 before returning. If
3546 there were too many extractions, set the return code to zero. In the case
3547 where we had to get some local store to hold offsets for backreferences, copy
3548 those back references that we can. In this case there need not be overflow
3549 if certain parts of the pattern were not used. */
3550
3551 match_block.start_match = start_match;
3552 match_block.match_call_count = 0;
3553
3554 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
3555 match_isgroup);
3556
3557 /* When the result is no match, if the character at the current start
3558 position is a newline and the PCRE_FIRSTLINE option is set, break (which will
3559 return PCRE_ERROR_NOMATCH). The option requests that a match occur before the
3560 first newline in the subject. Otherwise, advance the pointer to the next
3561 character and continue - but the continuation will actually happen only when
3562 the pattern is not anchored. */
3563
3564 if (rc == MATCH_NOMATCH)
3565 {
3566 if (firstline && *start_match == NEWLINE) break;
3567 start_match++;
3568 #ifdef SUPPORT_UTF8
3569 if (match_block.utf8)
3570 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
3571 start_match++;
3572 #endif
3573 continue;
3574 }
3575
3576 if (rc != MATCH_MATCH)
3577 {
3578 DPRINTF((">>>> error: returning %d\n", rc));
3579 return rc;
3580 }
3581
3582 /* We have a match! Copy the offset information from temporary store if
3583 necessary */
3584
3585 if (using_temporary_offsets)
3586 {
3587 if (offsetcount >= 4)
3588 {
3589 memcpy(offsets + 2, match_block.offset_vector + 2,
3590 (offsetcount - 2) * sizeof(int));
3591 DPRINTF(("Copied offsets from temporary memory\n"));
3592 }
3593 if (match_block.end_offset_top > offsetcount)
3594 match_block.offset_overflow = TRUE;
3595
3596 DPRINTF(("Freeing temporary memory\n"));
3597 (pcre_free)(match_block.offset_vector);
3598 }
3599
3600 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3601
3602 if (offsetcount < 2) rc = 0; else
3603 {
3604 offsets[0] = start_match - match_block.start_subject;
3605 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3606 }
3607
3608 DPRINTF((">>>> returning %d\n", rc));
3609 return rc;
3610 }
3611
3612 /* This "while" is the end of the "do" above */
3613
3614 while (!anchored && start_match <= end_subject);
3615
3616 if (using_temporary_offsets)
3617 {
3618 DPRINTF(("Freeing temporary memory\n"));
3619 (pcre_free)(match_block.offset_vector);
3620 }
3621
3622 if (match_block.partial && match_block.hitend)
3623 {
3624 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3625 return PCRE_ERROR_PARTIAL;
3626 }
3627 else
3628 {
3629 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3630 return PCRE_ERROR_NOMATCH;
3631 }
3632 }
3633
3634 /* End of pcre_exec.c */