bugzilla 612 - write recipients list in X-Envelope-To header of MBOX spool file
[exim.git] / src / src / pcre / pcre_exec.c
1 /* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.6 2007/11/12 13:02:19 nm4 Exp $ */
2
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
11 Copyright (c) 1997-2007 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42
43 /* This module contains pcre_exec(), the externally visible function that does
44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45 possible. There are also some static supporting functions. */
46
47 #ifdef HAVE_CONFIG_H
48 #include "config.h"
49 #endif
50
51 #define NLBLOCK md /* Block containing newline information */
52 #define PSSTART start_subject /* Field containing processed string start */
53 #define PSEND end_subject /* Field containing processed string end */
54
55 #include "pcre_internal.h"
56
57 /* Undefine some potentially clashing cpp symbols */
58
59 #undef min
60 #undef max
61
62 /* Flag bits for the match() function */
63
64 #define match_condassert 0x01 /* Called to check a condition assertion */
65 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_COMMIT (-999)
77 #define MATCH_PRUNE (-998)
78 #define MATCH_SKIP (-997)
79 #define MATCH_THEN (-996)
80
81 /* Maximum number of ints of offset to save on the stack for recursive calls.
82 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
83 because the offset vector is always a multiple of 3 long. */
84
85 #define REC_STACK_SAVE_MAX 30
86
87 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
88
/* Lower and upper repeat limits, one entry per common repeat. A maximum of
zero stands for "no upper limit". NOTE(review): the six entries look like they
are in the order *, *?, +, +?, ?, ?? - confirm against the indexing code in
match(). */

static const char rep_min[] = {
  0, 0,      /* first pair:  minimum 0 */
  1, 1,      /* second pair: minimum 1 */
  0, 0       /* third pair:  minimum 0 */
};

static const char rep_max[] = {
  0, 0,      /* first pair:  unlimited */
  0, 0,      /* second pair: unlimited */
  1, 1       /* third pair:  at most 1 */
};
91
92
93
94 #ifdef DEBUG
95 /*************************************************
96 * Debugging function to print chars *
97 *************************************************/
98
99 /* Print a sequence of chars in printable format, stopping at the end of the
100 subject if requested.
101
102 Arguments:
103 p points to characters
104 length number to print
105 is_subject TRUE if printing from within md->start_subject
106 md pointer to matching data block, if is_subject is TRUE
107
108 Returns: nothing
109 */
110
111 static void
112 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
113 {
114 unsigned int c;
115 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
116 while (length-- > 0)
117 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
118 }
119 #endif
120
121
122
123 /*************************************************
124 * Match a back-reference *
125 *************************************************/
126
127 /* If a back reference hasn't been set, the length that is passed is greater
128 than the number of characters left in the string, so the match fails.
129
130 Arguments:
131 offset index into the offset vector
132 eptr points into the subject
133 length length to be matched
134 md points to match data block
135 ims the ims flags
136
137 Returns: TRUE if matched
138 */
139
140 static BOOL
141 match_ref(int offset, register USPTR eptr, int length, match_data *md,
142 unsigned long int ims)
143 {
144 USPTR p = md->start_subject + md->offset_vector[offset];
145
146 #ifdef DEBUG
147 if (eptr >= md->end_subject)
148 printf("matching subject <null>");
149 else
150 {
151 printf("matching subject ");
152 pchars(eptr, length, TRUE, md);
153 }
154 printf(" against backref ");
155 pchars(p, length, FALSE, md);
156 printf("\n");
157 #endif
158
159 /* Always fail if not enough characters left */
160
161 if (length > md->end_subject - eptr) return FALSE;
162
163 /* Separate the caselesss case for speed */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 while (length-- > 0)
168 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
169 }
170 else
171 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
172
173 return TRUE;
174 }
175
176
177
178 /***************************************************************************
179 ****************************************************************************
180 RECURSION IN THE match() FUNCTION
181
182 The match() function is highly recursive, though not every recursive call
183 increases the recursive depth. Nevertheless, some regular expressions can cause
184 it to recurse to a great depth. I was writing for Unix, so I just let it call
185 itself recursively. This uses the stack for saving everything that has to be
186 saved for a recursive call. On Unix, the stack can be large, and this works
187 fine.
188
189 It turns out that on some non-Unix-like systems there are problems with
190 programs that use a lot of stack. (This despite the fact that every last chip
191 has oodles of memory these days, and techniques for extending the stack have
192 been known for decades.) So....
193
194 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
195 calls by keeping local variables that need to be preserved in blocks of memory
196 obtained from malloc() instead of on the stack. Macros are used to
197 achieve this so that the actual code doesn't look very different to what it
198 always used to.
199
200 The original heap-recursive code used longjmp(). However, it seems that this
201 can be very slow on some operating systems. Following a suggestion from Stan
202 Switzer, the use of longjmp() has been abolished, at the cost of having to
203 provide a unique number for each call to RMATCH. There is no way of generating
204 a sequence of numbers at compile time in C. I have given them names, to make
205 them stand out more clearly.
206
207 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
208 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
209 tests. Furthermore, not using longjmp() means that local dynamic variables
210 don't have indeterminate values; this has meant that the frame size can be
211 reduced because the result can be "passed back" by straight setting of the
212 variable instead of being passed in the frame.
213 ****************************************************************************
214 ***************************************************************************/
215
216 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
217 below must be updated in sync. */
218
/* One identifier per RMATCH call site, numbered consecutively from 1. The
number is passed as the final (rw) argument of RMATCH. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10,    RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28,    RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46,    RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
225
226 /* These versions of the macros use the stack, as normal. There are debugging
227 versions and production versions. Note that the "rw" argument of RMATCH isn't
228 actually used in this definition. */
229
230 #ifndef NO_RECURSE
231 #define REGISTER register
232
#ifndef DEBUG

/* Production forms: RMATCH is a plain recursive call of match() whose result
is left in rrc, and RRETURN is a plain return. The rw argument is ignored. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#else

/* Debugging forms: the same call and return, with trace output giving the
source line of each call and each return. The rw argument is ignored. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif
250
251 #else
252
253
254 /* These versions of the macros manage a private stack on the heap. Note that
255 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
256 argument of match(), which never changes. */
257
258 #define REGISTER
259
/* RMATCH simulates a recursive call of match(). It obtains a new frame from
pcre_stack_malloc, records in the current frame which call site (rw) to resume
at, copies the "arguments" into the new frame, links it to the current frame,
and jumps to HEAP_RECURSE. Execution resumes at the label L_<rw> when the
simulated call returns, with the result in rrc.

If the new frame cannot be obtained, the macro gives up at once by doing an
RRETURN of PCRE_ERROR_NOMEMORY from the current frame, instead of writing
through a NULL pointer. Being negative, the code is passed back through the
outer frames like any other error return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). The current frame is released with
pcre_stack_free and the previous frame becomes current. If there is a previous
frame, the result is left in rrc and control goes to HEAP_RETURN; if the frame
just released was the top-level one (its Xprevframe was NULL), this is a real
return from the function. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
292
293
294 /* Structure for remembering the local variables in a private frame */
295
296 typedef struct heapframe {
297 struct heapframe *Xprevframe;
298
299 /* Function arguments that may change */
300
301 const uschar *Xeptr;
302 const uschar *Xecode;
303 const uschar *Xmstart;
304 int Xoffset_top;
305 long int Xims;
306 eptrblock *Xeptrb;
307 int Xflags;
308 unsigned int Xrdepth;
309
310 /* Function local variables */
311
312 const uschar *Xcallpat;
313 const uschar *Xcharptr;
314 const uschar *Xdata;
315 const uschar *Xnext;
316 const uschar *Xpp;
317 const uschar *Xprev;
318 const uschar *Xsaved_eptr;
319
320 recursion_info Xnew_recursive;
321
322 BOOL Xcur_is_word;
323 BOOL Xcondition;
324 BOOL Xprev_is_word;
325
326 unsigned long int Xoriginal_ims;
327
328 #ifdef SUPPORT_UCP
329 int Xprop_type;
330 int Xprop_value;
331 int Xprop_fail_result;
332 int Xprop_category;
333 int Xprop_chartype;
334 int Xprop_script;
335 int Xoclength;
336 uschar Xocchars[8];
337 #endif
338
339 int Xctype;
340 unsigned int Xfc;
341 int Xfi;
342 int Xlength;
343 int Xmax;
344 int Xmin;
345 int Xnumber;
346 int Xoffset;
347 int Xop;
348 int Xsave_capture_last;
349 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
350 int Xstacksave[REC_STACK_SAVE_MAX];
351
352 eptrblock Xnewptrb;
353
354 /* Where to jump back to */
355
356 int Xwhere;
357
358 } heapframe;
359
360 #endif
361
362
363 /***************************************************************************
364 ***************************************************************************/
365
366
367
368 /*************************************************
369 * Match from current position *
370 *************************************************/
371
372 /* This function is called recursively in many circumstances. Whenever it
373 returns a negative (error) response, the outer incarnation must also return the
374 same response.
375
376 Performance note: It might be tempting to extract commonly used fields from the
377 md structure (e.g. utf8, end_subject) into individual variables to improve
378 performance. Tests using gcc on a SPARC disproved this; in the first case, it
379 made performance worse.
380
381 Arguments:
382 eptr pointer to current character in subject
383 ecode pointer to current position in compiled code
384 mstart pointer to the current match start position (can be modified
385 by encountering \K)
386 offset_top current top pointer
387 md pointer to "static" info for the match
388 ims current /i, /m, and /s options
389 eptrb pointer to chain of blocks containing eptr at start of
390 brackets - for testing for empty matches
391 flags can contain
392 match_condassert - this is an assertion condition
393 match_cbegroup - this is the start of an unlimited repeat
394 group that can match an empty string
395 rdepth the recursion depth
396
397 Returns: MATCH_MATCH if matched ) these values are >= 0
398 MATCH_NOMATCH if failed to match )
399 a negative PCRE_ERROR_xxx value if aborted by an error condition
400 (e.g. stopped by repeated call or recursion limit)
401 */
402
403 static int
404 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
405 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
406 int flags, unsigned int rdepth)
407 {
408 /* These variables do not need to be preserved over recursion in this function,
409 so they can be ordinary variables in all cases. Mark some of them with
410 "register" because they are used a lot in loops. */
411
412 register int rrc; /* Returns from recursive calls */
413 register int i; /* Used for loops not involving calls to RMATCH() */
414 register unsigned int c; /* Character values not kept over RMATCH() calls */
415 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
416
417 BOOL minimize, possessive; /* Quantifier options */
418
419 /* When recursion is not being used, all "local" variables that have to be
420 preserved over calls to RMATCH() are part of a "frame" which is obtained from
421 heap storage. Set up the top-level frame here; others are obtained from the
422 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
423
424 #ifdef NO_RECURSE
425 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
426 frame->Xprevframe = NULL; /* Marks the top level */
427
428 /* Copy in the original argument variables */
429
430 frame->Xeptr = eptr;
431 frame->Xecode = ecode;
432 frame->Xmstart = mstart;
433 frame->Xoffset_top = offset_top;
434 frame->Xims = ims;
435 frame->Xeptrb = eptrb;
436 frame->Xflags = flags;
437 frame->Xrdepth = rdepth;
438
439 /* This is where control jumps back to in order to effect "recursion" */
440
441 HEAP_RECURSE:
442
443 /* Macros make the argument variables come from the current frame */
444
445 #define eptr frame->Xeptr
446 #define ecode frame->Xecode
447 #define mstart frame->Xmstart
448 #define offset_top frame->Xoffset_top
449 #define ims frame->Xims
450 #define eptrb frame->Xeptrb
451 #define flags frame->Xflags
452 #define rdepth frame->Xrdepth
453
454 /* Ditto for the local variables */
455
456 #ifdef SUPPORT_UTF8
457 #define charptr frame->Xcharptr
458 #endif
459 #define callpat frame->Xcallpat
460 #define data frame->Xdata
461 #define next frame->Xnext
462 #define pp frame->Xpp
463 #define prev frame->Xprev
464 #define saved_eptr frame->Xsaved_eptr
465
466 #define new_recursive frame->Xnew_recursive
467
468 #define cur_is_word frame->Xcur_is_word
469 #define condition frame->Xcondition
470 #define prev_is_word frame->Xprev_is_word
471
472 #define original_ims frame->Xoriginal_ims
473
474 #ifdef SUPPORT_UCP
475 #define prop_type frame->Xprop_type
476 #define prop_value frame->Xprop_value
477 #define prop_fail_result frame->Xprop_fail_result
478 #define prop_category frame->Xprop_category
479 #define prop_chartype frame->Xprop_chartype
480 #define prop_script frame->Xprop_script
481 #define oclength frame->Xoclength
482 #define occhars frame->Xocchars
483 #endif
484
485 #define ctype frame->Xctype
486 #define fc frame->Xfc
487 #define fi frame->Xfi
488 #define length frame->Xlength
489 #define max frame->Xmax
490 #define min frame->Xmin
491 #define number frame->Xnumber
492 #define offset frame->Xoffset
493 #define op frame->Xop
494 #define save_capture_last frame->Xsave_capture_last
495 #define save_offset1 frame->Xsave_offset1
496 #define save_offset2 frame->Xsave_offset2
497 #define save_offset3 frame->Xsave_offset3
498 #define stacksave frame->Xstacksave
499
500 #define newptrb frame->Xnewptrb
501
502 /* When recursion is being used, local variables are allocated on the stack and
503 get preserved during recursion in the normal way. In this environment, fi and
504 i, and fc and c, can be the same variables. */
505
506 #else /* NO_RECURSE not defined */
507 #define fi i
508 #define fc c
509
510
511 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
512 const uschar *charptr; /* in small blocks of the code. My normal */
513 #endif /* style of coding would have declared */
514 const uschar *callpat; /* them within each of those blocks. */
515 const uschar *data; /* However, in order to accommodate the */
516 const uschar *next; /* version of this code that uses an */
517 USPTR pp; /* external "stack" implemented on the */
518 const uschar *prev; /* heap, it is easier to declare them all */
519 USPTR saved_eptr; /* here, so the declarations can be cut */
520 /* out in a block. The only declarations */
521 recursion_info new_recursive; /* within blocks below are for variables */
522 /* that do not have to be preserved over */
523 BOOL cur_is_word; /* a recursive call to RMATCH(). */
524 BOOL condition;
525 BOOL prev_is_word;
526
527 unsigned long int original_ims;
528
529 #ifdef SUPPORT_UCP
530 int prop_type;
531 int prop_value;
532 int prop_fail_result;
533 int prop_category;
534 int prop_chartype;
535 int prop_script;
536 int oclength;
537 uschar occhars[8];
538 #endif
539
540 int ctype;
541 int length;
542 int max;
543 int min;
544 int number;
545 int offset;
546 int op;
547 int save_capture_last;
548 int save_offset1, save_offset2, save_offset3;
549 int stacksave[REC_STACK_SAVE_MAX];
550
551 eptrblock newptrb;
552 #endif /* NO_RECURSE */
553
554 /* These statements are here to stop the compiler complaining about uninitialized
555 variables. */
556
557 #ifdef SUPPORT_UCP
558 prop_value = 0;
559 prop_fail_result = 0;
560 #endif
561
562
563 /* This label is used for tail recursion, which is used in a few cases even
564 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
565 used. Thanks to Ian Taylor for noticing this possibility and sending the
566 original patch. */
567
568 TAIL_RECURSE:
569
570 /* OK, now we can get on with the real code of the function. Recursive calls
571 are specified by the macro RMATCH and RRETURN is used to return. When
572 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
573 and a "return", respectively (possibly with some debugging if DEBUG is
574 defined). However, RMATCH isn't like a function call because it's quite a
575 complicated macro. It has to be used in one particular way. This shouldn't,
576 however, impact performance when true recursion is being used. */
577
578 #ifdef SUPPORT_UTF8
579 utf8 = md->utf8; /* Local copy of the flag */
580 #else
581 utf8 = FALSE;
582 #endif
583
584 /* First check that we haven't called match() too many times, or that we
585 haven't exceeded the recursive call limit. */
586
587 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
588 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
589
590 original_ims = ims; /* Save for resetting on ')' */
591
592 /* At the start of a group with an unlimited repeat that may match an empty
593 string, the match_cbegroup flag is set. When this is the case, add the current
594 subject pointer to the chain of such remembered pointers, to be checked when we
595 hit the closing ket, in order to break infinite loops that match no characters.
596 When match() is called in other circumstances, don't add to the chain. The
597 match_cbegroup flag must NOT be used with tail recursion, because the memory
598 block that is used is on the stack, so a new one may be required for each
599 match(). */
600
601 if ((flags & match_cbegroup) != 0)
602 {
603 newptrb.epb_saved_eptr = eptr;
604 newptrb.epb_prev = eptrb;
605 eptrb = &newptrb;
606 }
607
608 /* Now start processing the opcodes. */
609
610 for (;;)
611 {
612 minimize = possessive = FALSE;
613 op = *ecode;
614
615 /* For partial matching, remember if we ever hit the end of the subject after
616 matching at least one subject character. */
617
618 if (md->partial &&
619 eptr >= md->end_subject &&
620 eptr > mstart)
621 md->hitend = TRUE;
622
623 switch(op)
624 {
625 case OP_FAIL:
626 RRETURN(MATCH_NOMATCH);
627
628 case OP_PRUNE:
629 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
630 ims, eptrb, flags, RM51);
631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
632 RRETURN(MATCH_PRUNE);
633
634 case OP_COMMIT:
635 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
636 ims, eptrb, flags, RM52);
637 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
638 RRETURN(MATCH_COMMIT);
639
640 case OP_SKIP:
641 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
642 ims, eptrb, flags, RM53);
643 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
644 md->start_match_ptr = eptr; /* Pass back current position */
645 RRETURN(MATCH_SKIP);
646
647 case OP_THEN:
648 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
649 ims, eptrb, flags, RM54);
650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
651 RRETURN(MATCH_THEN);
652
653 /* Handle a capturing bracket. If there is space in the offset vector, save
654 the current subject position in the working slot at the top of the vector.
655 We mustn't change the current values of the data slot, because they may be
656 set from a previous iteration of this group, and be referred to by a
657 reference inside the group.
658
659 If the bracket fails to match, we need to restore this value and also the
660 values of the final offsets, in case they were set by a previous iteration
661 of the same bracket.
662
663 If there isn't enough space in the offset vector, treat this as if it were
664 a non-capturing bracket. Don't worry about setting the flag for the error
665 case here; that is handled in the code for KET. */
666
667 case OP_CBRA:
668 case OP_SCBRA:
669 number = GET2(ecode, 1+LINK_SIZE);
670 offset = number << 1;
671
672 #ifdef DEBUG
673 printf("start bracket %d\n", number);
674 printf("subject=");
675 pchars(eptr, 16, TRUE, md);
676 printf("\n");
677 #endif
678
679 if (offset < md->offset_max)
680 {
681 save_offset1 = md->offset_vector[offset];
682 save_offset2 = md->offset_vector[offset+1];
683 save_offset3 = md->offset_vector[md->offset_end - number];
684 save_capture_last = md->capture_last;
685
686 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
687 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
688
689 flags = (op == OP_SCBRA)? match_cbegroup : 0;
690 do
691 {
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM1);
694 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
695 md->capture_last = save_capture_last;
696 ecode += GET(ecode, 1);
697 }
698 while (*ecode == OP_ALT);
699
700 DPRINTF(("bracket %d failed\n", number));
701
702 md->offset_vector[offset] = save_offset1;
703 md->offset_vector[offset+1] = save_offset2;
704 md->offset_vector[md->offset_end - number] = save_offset3;
705
706 RRETURN(MATCH_NOMATCH);
707 }
708
709 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
710 as a non-capturing bracket. */
711
712 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
713 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
714
715 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
716
717 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
718 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
719
720 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
721 final alternative within the brackets, we would return the result of a
722 recursive call to match() whatever happened. We can reduce stack usage by
723 turning this into a tail recursion, except in the case when match_cbegroup
724 is set.*/
725
726 case OP_BRA:
727 case OP_SBRA:
728 DPRINTF(("start non-capturing bracket\n"));
729 flags = (op >= OP_SBRA)? match_cbegroup : 0;
730 for (;;)
731 {
732 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
733 {
734 if (flags == 0) /* Not a possibly empty group */
735 {
736 ecode += _pcre_OP_lengths[*ecode];
737 DPRINTF(("bracket 0 tail recursion\n"));
738 goto TAIL_RECURSE;
739 }
740
741 /* Possibly empty group; can't use tail recursion. */
742
743 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
744 eptrb, flags, RM48);
745 RRETURN(rrc);
746 }
747
748 /* For non-final alternatives, continue the loop for a NOMATCH result;
749 otherwise return. */
750
751 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
752 eptrb, flags, RM2);
753 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
754 ecode += GET(ecode, 1);
755 }
756 /* Control never reaches here. */
757
758 /* Conditional group: compilation checked that there are no more than
759 two branches. If the condition is false, skipping the first branch takes us
760 past the end if there is only one branch, but that's OK because that is
761 exactly what going to the ket would do. As there is only one branch to be
762 obeyed, we can use tail recursion to avoid using another stack frame. */
763
764 case OP_COND:
765 case OP_SCOND:
766 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
767 {
768 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
769 condition = md->recursive != NULL &&
770 (offset == RREF_ANY || offset == md->recursive->group_num);
771 ecode += condition? 3 : GET(ecode, 1);
772 }
773
774 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
775 {
776 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
777 condition = offset < offset_top && md->offset_vector[offset] >= 0;
778 ecode += condition? 3 : GET(ecode, 1);
779 }
780
781 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
782 {
783 condition = FALSE;
784 ecode += GET(ecode, 1);
785 }
786
787 /* The condition is an assertion. Call match() to evaluate it - setting
788 the final argument match_condassert causes it to stop at the end of an
789 assertion. */
790
791 else
792 {
793 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
794 match_condassert, RM3);
795 if (rrc == MATCH_MATCH)
796 {
797 condition = TRUE;
798 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
799 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
800 }
801 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
802 {
803 RRETURN(rrc); /* Need braces because of following else */
804 }
805 else
806 {
807 condition = FALSE;
808 ecode += GET(ecode, 1);
809 }
810 }
811
812 /* We are now at the branch that is to be obeyed. As there is only one,
813 we can use tail recursion to avoid using another stack frame, except when
814 match_cbegroup is required for an unlimited repeat of a possibly empty
815 group. If the second alternative doesn't exist, we can just plough on. */
816
817 if (condition || *ecode == OP_ALT)
818 {
819 ecode += 1 + LINK_SIZE;
820 if (op == OP_SCOND) /* Possibly empty group */
821 {
822 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
823 RRETURN(rrc);
824 }
825 else /* Group must match something */
826 {
827 flags = 0;
828 goto TAIL_RECURSE;
829 }
830 }
831 else /* Condition false & no 2nd alternative */
832 {
833 ecode += 1 + LINK_SIZE;
834 }
835 break;
836
837
838 /* End of the pattern, either real or forced. If we are in a top-level
839 recursion, we should restore the offsets appropriately and continue from
840 after the call. */
841
842 case OP_ACCEPT:
843 case OP_END:
844 if (md->recursive != NULL && md->recursive->group_num == 0)
845 {
846 recursion_info *rec = md->recursive;
847 DPRINTF(("End of pattern in a (?0) recursion\n"));
848 md->recursive = rec->prevrec;
849 memmove(md->offset_vector, rec->offset_save,
850 rec->saved_max * sizeof(int));
851 mstart = rec->save_start;
852 ims = original_ims;
853 ecode = rec->after_call;
854 break;
855 }
856
857 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
858 string - backtracking will then try other alternatives, if any. */
859
860 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
861 md->end_match_ptr = eptr; /* Record where we ended */
862 md->end_offset_top = offset_top; /* and how many extracts were taken */
863 md->start_match_ptr = mstart; /* and the start (\K can modify) */
864 RRETURN(MATCH_MATCH);
865
866 /* Change option settings */
867
868 case OP_OPT:
869 ims = ecode[1];
870 ecode += 2;
871 DPRINTF(("ims set to %02lx\n", ims));
872 break;
873
874 /* Assertion brackets. Check the alternative branches in turn - the
875 matching won't pass the KET for an assertion. If any one branch matches,
876 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
877 start of each branch to move the current point backwards, so the code at
878 this level is identical to the lookahead case. */
879
880 case OP_ASSERT:
881 case OP_ASSERTBACK:
882 do
883 {
884 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
885 RM4);
886 if (rrc == MATCH_MATCH) break;
887 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
888 ecode += GET(ecode, 1);
889 }
890 while (*ecode == OP_ALT);
891 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
892
893 /* If checking an assertion for a condition, return MATCH_MATCH. */
894
895 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
896
897 /* Continue from after the assertion, updating the offsets high water
898 mark, since extracts may have been taken during the assertion. */
899
900 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
901 ecode += 1 + LINK_SIZE;
902 offset_top = md->end_offset_top;
903 continue;
904
905 /* Negative assertion: all branches must fail to match */
906
907 case OP_ASSERT_NOT:
908 case OP_ASSERTBACK_NOT:
909 do
910 {
911 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
912 RM5);
913 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
914 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
915 ecode += GET(ecode,1);
916 }
917 while (*ecode == OP_ALT);
918
919 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
920
921 ecode += 1 + LINK_SIZE;
922 continue;
923
924 /* Move the subject pointer back. This occurs only at the start of
925 each branch of a lookbehind assertion. If we are too close to the start to
926 move back, this match function fails. When working with UTF-8 we move
927 back a number of characters, not bytes. */
928
929 case OP_REVERSE:
930 #ifdef SUPPORT_UTF8
931 if (utf8)
932 {
933 i = GET(ecode, 1);
934 while (i-- > 0)
935 {
936 eptr--;
937 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
938 BACKCHAR(eptr);
939 }
940 }
941 else
942 #endif
943
944 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
945
946 {
947 eptr -= GET(ecode, 1);
948 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
949 }
950
951 /* Skip to next op code */
952
953 ecode += 1 + LINK_SIZE;
954 break;
955
956 /* The callout item calls an external function, if one is provided, passing
957 details of the match so far. This is mainly for debugging, though the
958 function is able to force a failure. */
959
960 case OP_CALLOUT:
961 if (pcre_callout != NULL)
962 {
963 pcre_callout_block cb;
964 cb.version = 1; /* Version 1 of the callout block */
965 cb.callout_number = ecode[1];
966 cb.offset_vector = md->offset_vector;
967 cb.subject = (PCRE_SPTR)md->start_subject;
968 cb.subject_length = md->end_subject - md->start_subject;
969 cb.start_match = mstart - md->start_subject;
970 cb.current_position = eptr - md->start_subject;
971 cb.pattern_position = GET(ecode, 2);
972 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
973 cb.capture_top = offset_top/2;
974 cb.capture_last = md->capture_last;
975 cb.callout_data = md->callout_data;
976 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
977 if (rrc < 0) RRETURN(rrc);
978 }
979 ecode += 2 + 2*LINK_SIZE;
980 break;
981
982 /* Recursion either matches the current regex, or some subexpression. The
983 offset data is the offset to the starting bracket from the start of the
984 whole pattern. (This is so that it works from duplicated subpatterns.)
985
986 If there are any capturing brackets started but not finished, we have to
987 save their starting points and reinstate them after the recursion. However,
988 we don't know how many such there are (offset_top records the completed
989 total) so we just have to save all the potential data. There may be up to
990 65535 such values, which is too large to put on the stack, but using malloc
991 for small numbers seems expensive. As a compromise, the stack is used when
992 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
993 is used. If the malloc fails, the match is abandoned at this point and
994 PCRE_ERROR_NOMEMORY is returned; no attempt is made to carry on with only
995 part of the offset data saved.
996
997 There are also other values that have to be saved. We use a chained
998 sequence of blocks that actually live on the stack. Thanks to Robin Houston
999 for the original version of this logic. */
1000
1001 case OP_RECURSE:
1002 {
1003 callpat = md->start_code + GET(ecode, 1);
1004 new_recursive.group_num = (callpat == md->start_code)? 0 :
1005 GET2(callpat, 1 + LINK_SIZE);
1006
1007 /* Add to "recursing stack" */
1008
1009 new_recursive.prevrec = md->recursive;
1010 md->recursive = &new_recursive;
1011
1012 /* Find where to continue from afterwards */
1013
1014 ecode += 1 + LINK_SIZE;
1015 new_recursive.after_call = ecode;
1016
1017 /* Now save the offset data. */
1018
1019 new_recursive.saved_max = md->offset_end;
1020 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1021 new_recursive.offset_save = stacksave;
1022 else
1023 {
1024 new_recursive.offset_save =
1025 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1026 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1027 }
1028
1029 memcpy(new_recursive.offset_save, md->offset_vector,
1030 new_recursive.saved_max * sizeof(int));
1031 new_recursive.save_start = mstart;
1032 mstart = eptr;
1033
1034 /* OK, now we can do the recursion. For each top-level alternative we
1035 restore the offset and recursion data. */
1036
1037 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1038 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1039 do
1040 {
1041 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1042 md, ims, eptrb, flags, RM6);
1043 if (rrc == MATCH_MATCH)
1044 {
1045 DPRINTF(("Recursion matched\n"));
1046 md->recursive = new_recursive.prevrec;
1047 if (new_recursive.offset_save != stacksave)
1048 (pcre_free)(new_recursive.offset_save);
1049 RRETURN(MATCH_MATCH);
1050 }
1051 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1052 {
1053 DPRINTF(("Recursion gave error %d\n", rrc));
1054 RRETURN(rrc);
1055 }
1056
1057 md->recursive = &new_recursive;
1058 memcpy(md->offset_vector, new_recursive.offset_save,
1059 new_recursive.saved_max * sizeof(int));
1060 callpat += GET(callpat, 1);
1061 }
1062 while (*callpat == OP_ALT);
1063
1064 DPRINTF(("Recursion didn't match\n"));
1065 md->recursive = new_recursive.prevrec;
1066 if (new_recursive.offset_save != stacksave)
1067 (pcre_free)(new_recursive.offset_save);
1068 RRETURN(MATCH_NOMATCH);
1069 }
1070 /* Control never reaches here */
1071
1072 /* "Once" brackets are like assertion brackets except that after a match,
1073 the point in the subject string is not moved back. Thus there can never be
1074 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1075 Check the alternative branches in turn - the matching won't pass the KET
1076 for this kind of subpattern. If any one branch matches, we carry on as at
1077 the end of a normal bracket, leaving the subject pointer. */
1078
1079 case OP_ONCE:
1080 prev = ecode;
1081 saved_eptr = eptr;
1082
1083 do
1084 {
1085 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1086 if (rrc == MATCH_MATCH) break;
1087 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1088 ecode += GET(ecode,1);
1089 }
1090 while (*ecode == OP_ALT);
1091
1092 /* If we hit the end of the group (which could be repeated), no branch matched: fail */
1093
1094 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1095
1096 /* Continue as from after the atomic group, updating the offsets high water
1097 mark, since extracts may have been taken inside the group. */
1098
1099 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1100
1101 offset_top = md->end_offset_top;
1102 eptr = md->end_match_ptr;
1103
1104 /* For a non-repeating ket, just continue at this level. This also
1105 happens for a repeating ket if no characters were matched in the group.
1106 This is the forcible breaking of infinite loops as implemented in Perl
1107 5.005. If there is an options reset, it will get obeyed in the normal
1108 course of events. */
1109
1110 if (*ecode == OP_KET || eptr == saved_eptr)
1111 {
1112 ecode += 1+LINK_SIZE;
1113 break;
1114 }
1115
1116 /* The repeating kets try the rest of the pattern or restart from the
1117 preceding bracket, in the appropriate order. The second "call" of match()
1118 uses tail recursion, to avoid using another stack frame. We need to reset
1119 any options that changed within the bracket before re-running it, so
1120 check the next opcode. */
1121
1122 if (ecode[1+LINK_SIZE] == OP_OPT)
1123 {
1124 ims = (ims & ~PCRE_IMS) | ecode[4];
1125 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1126 }
1127
1128 if (*ecode == OP_KETRMIN)
1129 {
1130 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1131 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1132 ecode = prev;
1133 flags = 0;
1134 goto TAIL_RECURSE;
1135 }
1136 else /* OP_KETRMAX */
1137 {
1138 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1139 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1140 ecode += 1 + LINK_SIZE;
1141 flags = 0;
1142 goto TAIL_RECURSE;
1143 }
1144 /* Control never gets here */
1145
1146 /* An alternation is the end of a branch; scan along to find the end of the
1147 bracketed group and go to there. */
1148
1149 case OP_ALT:
1150 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1151 break;
1152
1153 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1154 that it may occur zero times. It may repeat infinitely, or not at all -
1155 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1156 repeat limits are compiled as a number of copies, with the optional ones
1157 preceded by BRAZERO or BRAMINZERO. */
1158
1159 case OP_BRAZERO:
1160 {
1161 next = ecode+1;
1162 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1163 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1164 do next += GET(next,1); while (*next == OP_ALT);
1165 ecode = next + 1 + LINK_SIZE;
1166 }
1167 break;
1168
1169 case OP_BRAMINZERO:
1170 {
1171 next = ecode+1;
1172 do next += GET(next, 1); while (*next == OP_ALT);
1173 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1174 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1175 ecode++;
1176 }
1177 break;
1178
1179 /* End of a group, repeated or non-repeating. */
1180
1181 case OP_KET:
1182 case OP_KETRMIN:
1183 case OP_KETRMAX:
1184 prev = ecode - GET(ecode, 1);
1185
1186 /* If this was a group that remembered the subject start, in order to break
1187 infinite repeats of empty string matches, retrieve the subject start from
1188 the chain. Otherwise, set it NULL. */
1189
1190 if (*prev >= OP_SBRA)
1191 {
1192 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1193 eptrb = eptrb->epb_prev; /* Backup to previous group */
1194 }
1195 else saved_eptr = NULL;
1196
1197 /* If we are at the end of an assertion group, stop matching and return
1198 MATCH_MATCH, but record the current high water mark for use by positive
1199 assertions. Do this also for the "once" (atomic) groups. */
1200
1201 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1202 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1203 *prev == OP_ONCE)
1204 {
1205 md->end_match_ptr = eptr; /* For ONCE */
1206 md->end_offset_top = offset_top;
1207 RRETURN(MATCH_MATCH);
1208 }
1209
1210 /* For capturing groups we have to check the group number back at the start
1211 and if necessary complete handling an extraction by setting the offsets and
1212 bumping the high water mark. Note that whole-pattern recursion is coded as
1213 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1214 when the OP_END is reached. Other recursion is handled here. */
1215
1216 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1217 {
1218 number = GET2(prev, 1+LINK_SIZE);
1219 offset = number << 1;
1220
1221 #ifdef DEBUG
1222 printf("end bracket %d", number);
1223 printf("\n");
1224 #endif
1225
1226 md->capture_last = number;
1227 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1228 {
1229 md->offset_vector[offset] =
1230 md->offset_vector[md->offset_end - number];
1231 md->offset_vector[offset+1] = eptr - md->start_subject;
1232 if (offset_top <= offset) offset_top = offset + 2;
1233 }
1234
1235 /* Handle a recursively called group. Restore the offsets
1236 appropriately and continue from after the call. */
1237
1238 if (md->recursive != NULL && md->recursive->group_num == number)
1239 {
1240 recursion_info *rec = md->recursive;
1241 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1242 md->recursive = rec->prevrec;
1243 mstart = rec->save_start;
1244 memcpy(md->offset_vector, rec->offset_save,
1245 rec->saved_max * sizeof(int));
1246 ecode = rec->after_call;
1247 ims = original_ims;
1248 break;
1249 }
1250 }
1251
1252 /* For both capturing and non-capturing groups, reset the value of the ims
1253 flags, in case they got changed during the group. */
1254
1255 ims = original_ims;
1256 DPRINTF(("ims reset to %02lx\n", ims));
1257
1258 /* For a non-repeating ket, just continue at this level. This also
1259 happens for a repeating ket if no characters were matched in the group.
1260 This is the forcible breaking of infinite loops as implemented in Perl
1261 5.005. If there is an options reset, it will get obeyed in the normal
1262 course of events. */
1263
1264 if (*ecode == OP_KET || eptr == saved_eptr)
1265 {
1266 ecode += 1 + LINK_SIZE;
1267 break;
1268 }
1269
1270 /* The repeating kets try the rest of the pattern or restart from the
1271 preceding bracket, in the appropriate order. In the second case, we can use
1272 tail recursion to avoid using another stack frame, unless we have an
1273 unlimited repeat of a group that can match an empty string. */
1274
1275 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1276
1277 if (*ecode == OP_KETRMIN)
1278 {
1279 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1280 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1281 if (flags != 0) /* Could match an empty string */
1282 {
1283 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1284 RRETURN(rrc);
1285 }
1286 ecode = prev;
1287 goto TAIL_RECURSE;
1288 }
1289 else /* OP_KETRMAX */
1290 {
1291 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1292 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1293 ecode += 1 + LINK_SIZE;
1294 flags = 0;
1295 goto TAIL_RECURSE;
1296 }
1297 /* Control never gets here */
1298
1299 /* Start of subject unless notbol, or after internal newline if multiline */
1300
1301 case OP_CIRC:
1302 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1303 if ((ims & PCRE_MULTILINE) != 0)
1304 {
1305 if (eptr != md->start_subject &&
1306 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1307 RRETURN(MATCH_NOMATCH);
1308 ecode++;
1309 break;
1310 }
1311 /* ... else fall through */
1312
1313 /* Start of subject assertion */
1314
1315 case OP_SOD:
1316 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1317 ecode++;
1318 break;
1319
1320 /* Start of match assertion */
1321
1322 case OP_SOM:
1323 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1324 ecode++;
1325 break;
1326
1327 /* Reset the start of match point */
1328
1329 case OP_SET_SOM:
1330 mstart = eptr;
1331 ecode++;
1332 break;
1333
1334 /* Assert before internal newline if multiline, or before a terminating
1335 newline unless endonly is set, else end of subject unless noteol is set. */
1336
1337 case OP_DOLL:
1338 if ((ims & PCRE_MULTILINE) != 0)
1339 {
1340 if (eptr < md->end_subject)
1341 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1342 else
1343 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1344 ecode++;
1345 break;
1346 }
1347 else
1348 {
1349 if (md->noteol) RRETURN(MATCH_NOMATCH);
1350 if (!md->endonly)
1351 {
1352 if (eptr != md->end_subject &&
1353 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1354 RRETURN(MATCH_NOMATCH);
1355 ecode++;
1356 break;
1357 }
1358 }
1359 /* ... else fall through for endonly */
1360
1361 /* End of subject assertion (\z) */
1362
1363 case OP_EOD:
1364 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1365 ecode++;
1366 break;
1367
1368 /* End of subject or ending \n assertion (\Z) */
1369
1370 case OP_EODN:
1371 if (eptr != md->end_subject &&
1372 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1373 RRETURN(MATCH_NOMATCH);
1374 ecode++;
1375 break;
1376
1377 /* Word boundary assertions */
1378
1379 case OP_NOT_WORD_BOUNDARY:
1380 case OP_WORD_BOUNDARY:
1381 {
1382
1383 /* Find out if the previous and current characters are "word" characters.
1384 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1385 be "non-word" characters. */
1386
1387 #ifdef SUPPORT_UTF8
1388 if (utf8)
1389 {
1390 if (eptr == md->start_subject) prev_is_word = FALSE; else
1391 {
1392 const uschar *lastptr = eptr - 1;
1393 while((*lastptr & 0xc0) == 0x80) lastptr--;
1394 GETCHAR(c, lastptr);
1395 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1396 }
1397 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1398 {
1399 GETCHAR(c, eptr);
1400 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1401 }
1402 }
1403 else
1404 #endif
1405
1406 /* More streamlined when not in UTF-8 mode */
1407
1408 {
1409 prev_is_word = (eptr != md->start_subject) &&
1410 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1411 cur_is_word = (eptr < md->end_subject) &&
1412 ((md->ctypes[*eptr] & ctype_word) != 0);
1413 }
1414
1415 /* Now see if the situation is what we want */
1416
1417 if ((*ecode++ == OP_WORD_BOUNDARY)?
1418 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1419 RRETURN(MATCH_NOMATCH);
1420 }
1421 break;
1422
1423 /* Match a single character type; inline for speed */
1424
1425 case OP_ANY:
1426 if ((ims & PCRE_DOTALL) == 0)
1427 {
1428 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1429 }
1430 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1431 if (utf8)
1432 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1433 ecode++;
1434 break;
1435
1436 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1437 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1438
1439 case OP_ANYBYTE:
1440 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1441 ecode++;
1442 break;
1443
1444 case OP_NOT_DIGIT:
1445 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1446 GETCHARINCTEST(c, eptr);
1447 if (
1448 #ifdef SUPPORT_UTF8
1449 c < 256 &&
1450 #endif
1451 (md->ctypes[c] & ctype_digit) != 0
1452 )
1453 RRETURN(MATCH_NOMATCH);
1454 ecode++;
1455 break;
1456
1457 case OP_DIGIT:
1458 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1459 GETCHARINCTEST(c, eptr);
1460 if (
1461 #ifdef SUPPORT_UTF8
1462 c >= 256 ||
1463 #endif
1464 (md->ctypes[c] & ctype_digit) == 0
1465 )
1466 RRETURN(MATCH_NOMATCH);
1467 ecode++;
1468 break;
1469
1470 case OP_NOT_WHITESPACE:
1471 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1472 GETCHARINCTEST(c, eptr);
1473 if (
1474 #ifdef SUPPORT_UTF8
1475 c < 256 &&
1476 #endif
1477 (md->ctypes[c] & ctype_space) != 0
1478 )
1479 RRETURN(MATCH_NOMATCH);
1480 ecode++;
1481 break;
1482
1483 case OP_WHITESPACE:
1484 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1485 GETCHARINCTEST(c, eptr);
1486 if (
1487 #ifdef SUPPORT_UTF8
1488 c >= 256 ||
1489 #endif
1490 (md->ctypes[c] & ctype_space) == 0
1491 )
1492 RRETURN(MATCH_NOMATCH);
1493 ecode++;
1494 break;
1495
1496 case OP_NOT_WORDCHAR:
1497 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1498 GETCHARINCTEST(c, eptr);
1499 if (
1500 #ifdef SUPPORT_UTF8
1501 c < 256 &&
1502 #endif
1503 (md->ctypes[c] & ctype_word) != 0
1504 )
1505 RRETURN(MATCH_NOMATCH);
1506 ecode++;
1507 break;
1508
1509 case OP_WORDCHAR:
1510 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1511 GETCHARINCTEST(c, eptr);
1512 if (
1513 #ifdef SUPPORT_UTF8
1514 c >= 256 ||
1515 #endif
1516 (md->ctypes[c] & ctype_word) == 0
1517 )
1518 RRETURN(MATCH_NOMATCH);
1519 ecode++;
1520 break;
1521
1522 case OP_ANYNL:
1523 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1524 GETCHARINCTEST(c, eptr);
1525 switch(c)
1526 {
1527 default: RRETURN(MATCH_NOMATCH);
1528 case 0x000d:
1529 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1530 break;
1531
1532 case 0x000a:
1533 break;
1534
1535 case 0x000b:
1536 case 0x000c:
1537 case 0x0085:
1538 case 0x2028:
1539 case 0x2029:
1540 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1541 break;
1542 }
1543 ecode++;
1544 break;
1545
1546 case OP_NOT_HSPACE:
1547 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1548 GETCHARINCTEST(c, eptr);
1549 switch(c)
1550 {
1551 default: break;
1552 case 0x09: /* HT */
1553 case 0x20: /* SPACE */
1554 case 0xa0: /* NBSP */
1555 case 0x1680: /* OGHAM SPACE MARK */
1556 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1557 case 0x2000: /* EN QUAD */
1558 case 0x2001: /* EM QUAD */
1559 case 0x2002: /* EN SPACE */
1560 case 0x2003: /* EM SPACE */
1561 case 0x2004: /* THREE-PER-EM SPACE */
1562 case 0x2005: /* FOUR-PER-EM SPACE */
1563 case 0x2006: /* SIX-PER-EM SPACE */
1564 case 0x2007: /* FIGURE SPACE */
1565 case 0x2008: /* PUNCTUATION SPACE */
1566 case 0x2009: /* THIN SPACE */
1567 case 0x200A: /* HAIR SPACE */
1568 case 0x202f: /* NARROW NO-BREAK SPACE */
1569 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1570 case 0x3000: /* IDEOGRAPHIC SPACE */
1571 RRETURN(MATCH_NOMATCH);
1572 }
1573 ecode++;
1574 break;
1575
1576 case OP_HSPACE:
1577 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1578 GETCHARINCTEST(c, eptr);
1579 switch(c)
1580 {
1581 default: RRETURN(MATCH_NOMATCH);
1582 case 0x09: /* HT */
1583 case 0x20: /* SPACE */
1584 case 0xa0: /* NBSP */
1585 case 0x1680: /* OGHAM SPACE MARK */
1586 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1587 case 0x2000: /* EN QUAD */
1588 case 0x2001: /* EM QUAD */
1589 case 0x2002: /* EN SPACE */
1590 case 0x2003: /* EM SPACE */
1591 case 0x2004: /* THREE-PER-EM SPACE */
1592 case 0x2005: /* FOUR-PER-EM SPACE */
1593 case 0x2006: /* SIX-PER-EM SPACE */
1594 case 0x2007: /* FIGURE SPACE */
1595 case 0x2008: /* PUNCTUATION SPACE */
1596 case 0x2009: /* THIN SPACE */
1597 case 0x200A: /* HAIR SPACE */
1598 case 0x202f: /* NARROW NO-BREAK SPACE */
1599 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1600 case 0x3000: /* IDEOGRAPHIC SPACE */
1601 break;
1602 }
1603 ecode++;
1604 break;
1605
1606 case OP_NOT_VSPACE:
1607 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1608 GETCHARINCTEST(c, eptr);
1609 switch(c)
1610 {
1611 default: break;
1612 case 0x0a: /* LF */
1613 case 0x0b: /* VT */
1614 case 0x0c: /* FF */
1615 case 0x0d: /* CR */
1616 case 0x85: /* NEL */
1617 case 0x2028: /* LINE SEPARATOR */
1618 case 0x2029: /* PARAGRAPH SEPARATOR */
1619 RRETURN(MATCH_NOMATCH);
1620 }
1621 ecode++;
1622 break;
1623
1624 case OP_VSPACE:
1625 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1626 GETCHARINCTEST(c, eptr);
1627 switch(c)
1628 {
1629 default: RRETURN(MATCH_NOMATCH);
1630 case 0x0a: /* LF */
1631 case 0x0b: /* VT */
1632 case 0x0c: /* FF */
1633 case 0x0d: /* CR */
1634 case 0x85: /* NEL */
1635 case 0x2028: /* LINE SEPARATOR */
1636 case 0x2029: /* PARAGRAPH SEPARATOR */
1637 break;
1638 }
1639 ecode++;
1640 break;
1641
1642 #ifdef SUPPORT_UCP
1643 /* Check the next character by Unicode property. We will get here only
1644 if the support is in the binary; otherwise a compile-time error occurs. */
1645
1646 case OP_PROP:
1647 case OP_NOTPROP:
1648 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1649 GETCHARINCTEST(c, eptr);
1650 {
1651 int chartype, script;
1652 int category = _pcre_ucp_findprop(c, &chartype, &script);
1653
1654 switch(ecode[1])
1655 {
1656 case PT_ANY:
1657 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1658 break;
1659
1660 case PT_LAMP:
1661 if ((chartype == ucp_Lu ||
1662 chartype == ucp_Ll ||
1663 chartype == ucp_Lt) == (op == OP_NOTPROP))
1664 RRETURN(MATCH_NOMATCH);
1665 break;
1666
1667 case PT_GC:
1668 if ((ecode[2] != category) == (op == OP_PROP))
1669 RRETURN(MATCH_NOMATCH);
1670 break;
1671
1672 case PT_PC:
1673 if ((ecode[2] != chartype) == (op == OP_PROP))
1674 RRETURN(MATCH_NOMATCH);
1675 break;
1676
1677 case PT_SC:
1678 if ((ecode[2] != script) == (op == OP_PROP))
1679 RRETURN(MATCH_NOMATCH);
1680 break;
1681
1682 default:
1683 RRETURN(PCRE_ERROR_INTERNAL);
1684 }
1685
1686 ecode += 3;
1687 }
1688 break;
1689
1690 /* Match an extended Unicode sequence. We will get here only if the support
1691 is in the binary; otherwise a compile-time error occurs. */
1692
1693 case OP_EXTUNI:
1694 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1695 GETCHARINCTEST(c, eptr);
1696 {
1697 int chartype, script;
1698 int category = _pcre_ucp_findprop(c, &chartype, &script);
1699 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1700 while (eptr < md->end_subject)
1701 {
1702 int len = 1;
1703 if (!utf8) c = *eptr; else
1704 {
1705 GETCHARLEN(c, eptr, len);
1706 }
1707 category = _pcre_ucp_findprop(c, &chartype, &script);
1708 if (category != ucp_M) break;
1709 eptr += len;
1710 }
1711 }
1712 ecode++;
1713 break;
1714 #endif
1715
1716
1717 /* Match a back reference, possibly repeatedly. Look past the end of the
1718 item to see if there is repeat information following. The code is similar
1719 to that for character classes, but repeated for efficiency. Then obey
1720 similar code to character type repeats - written out again for speed.
1721 However, if the referenced string is the empty string, always treat
1722 it as matched, any number of times (otherwise there could be infinite
1723 loops). */
1724
1725 case OP_REF:
1726 {
1727 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1728 ecode += 3; /* Advance past item */
1729
1730 /* If the reference is unset, set the length to be longer than the amount
1731 of subject left; this ensures that every attempt at a match fails. We
1732 can't just fail here, because of the possibility of quantifiers with zero
1733 minima. */
1734
1735 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1736 md->end_subject - eptr + 1 :
1737 md->offset_vector[offset+1] - md->offset_vector[offset];
1738
1739 /* Set up for repetition, or handle the non-repeated case */
1740
1741 switch (*ecode)
1742 {
1743 case OP_CRSTAR:
1744 case OP_CRMINSTAR:
1745 case OP_CRPLUS:
1746 case OP_CRMINPLUS:
1747 case OP_CRQUERY:
1748 case OP_CRMINQUERY:
1749 c = *ecode++ - OP_CRSTAR;
1750 minimize = (c & 1) != 0;
1751 min = rep_min[c]; /* Pick up values from tables; */
1752 max = rep_max[c]; /* zero for max => infinity */
1753 if (max == 0) max = INT_MAX;
1754 break;
1755
1756 case OP_CRRANGE:
1757 case OP_CRMINRANGE:
1758 minimize = (*ecode == OP_CRMINRANGE);
1759 min = GET2(ecode, 1);
1760 max = GET2(ecode, 3);
1761 if (max == 0) max = INT_MAX;
1762 ecode += 5;
1763 break;
1764
1765 default: /* No repeat follows */
1766 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1767 eptr += length;
1768 continue; /* With the main loop */
1769 }
1770
1771 /* If the length of the reference is zero, just continue with the
1772 main loop. */
1773
1774 if (length == 0) continue;
1775
1776 /* First, ensure the minimum number of matches are present. We get back
1777 the length of the reference string explicitly rather than passing the
1778 address of eptr, so that eptr can be a register variable. */
1779
1780 for (i = 1; i <= min; i++)
1781 {
1782 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1783 eptr += length;
1784 }
1785
1786 /* If min = max, continue at the same level without recursion.
1787 They are not both allowed to be zero. */
1788
1789 if (min == max) continue;
1790
1791 /* If minimizing, keep trying and advancing the pointer */
1792
1793 if (minimize)
1794 {
1795 for (fi = min;; fi++)
1796 {
1797 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1799 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1800 RRETURN(MATCH_NOMATCH);
1801 eptr += length;
1802 }
1803 /* Control never gets here */
1804 }
1805
1806 /* If maximizing, find the longest string and work backwards */
1807
1808 else
1809 {
1810 pp = eptr;
1811 for (i = min; i < max; i++)
1812 {
1813 if (!match_ref(offset, eptr, length, md, ims)) break;
1814 eptr += length;
1815 }
1816 while (eptr >= pp)
1817 {
1818 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1820 eptr -= length;
1821 }
1822 RRETURN(MATCH_NOMATCH);
1823 }
1824 }
1825 /* Control never gets here */
1826
1827
1828
1829 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1830 used when all the characters in the class have values in the range 0-255,
1831 and either the matching is caseful, or the characters are in the range
1832 0-127 when UTF-8 processing is enabled. The only difference between
1833 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1834 encountered.
1835
1836 First, look past the end of the item to see if there is repeat information
1837 following. Then obey similar code to character type repeats - written out
1838 again for speed. */
1839
1840 case OP_NCLASS:
1841 case OP_CLASS:
1842 {
1843 data = ecode + 1; /* Save for matching */
1844 ecode += 33; /* Advance past the item */
1845
1846 switch (*ecode)
1847 {
1848 case OP_CRSTAR:
1849 case OP_CRMINSTAR:
1850 case OP_CRPLUS:
1851 case OP_CRMINPLUS:
1852 case OP_CRQUERY:
1853 case OP_CRMINQUERY:
1854 c = *ecode++ - OP_CRSTAR;
1855 minimize = (c & 1) != 0;
1856 min = rep_min[c]; /* Pick up values from tables; */
1857 max = rep_max[c]; /* zero for max => infinity */
1858 if (max == 0) max = INT_MAX;
1859 break;
1860
1861 case OP_CRRANGE:
1862 case OP_CRMINRANGE:
1863 minimize = (*ecode == OP_CRMINRANGE);
1864 min = GET2(ecode, 1);
1865 max = GET2(ecode, 3);
1866 if (max == 0) max = INT_MAX;
1867 ecode += 5;
1868 break;
1869
1870 default: /* No repeat follows */
1871 min = max = 1;
1872 break;
1873 }
1874
1875 /* First, ensure the minimum number of matches are present. */
1876
1877 #ifdef SUPPORT_UTF8
1878 /* UTF-8 mode */
1879 if (utf8)
1880 {
1881 for (i = 1; i <= min; i++)
1882 {
1883 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1884 GETCHARINC(c, eptr);
1885 if (c > 255)
1886 {
1887 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1888 }
1889 else
1890 {
1891 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1892 }
1893 }
1894 }
1895 else
1896 #endif
1897 /* Not UTF-8 mode */
1898 {
1899 for (i = 1; i <= min; i++)
1900 {
1901 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1902 c = *eptr++;
1903 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1904 }
1905 }
1906
1907 /* If max == min we can continue with the main loop without the
1908 need to recurse. */
1909
1910 if (min == max) continue;
1911
1912 /* If minimizing, keep testing the rest of the expression and advancing
1913 the pointer while it matches the class. */
1914
1915 if (minimize)
1916 {
1917 #ifdef SUPPORT_UTF8
1918 /* UTF-8 mode */
1919 if (utf8)
1920 {
1921 for (fi = min;; fi++)
1922 {
1923 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1925 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1926 GETCHARINC(c, eptr);
1927 if (c > 255)
1928 {
1929 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1930 }
1931 else
1932 {
1933 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1934 }
1935 }
1936 }
1937 else
1938 #endif
1939 /* Not UTF-8 mode */
1940 {
1941 for (fi = min;; fi++)
1942 {
1943 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1945 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1946 c = *eptr++;
1947 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1948 }
1949 }
1950 /* Control never gets here */
1951 }
1952
1953 /* If maximizing, find the longest possible run, then work backwards. */
1954
1955 else
1956 {
1957 pp = eptr;
1958
1959 #ifdef SUPPORT_UTF8
1960 /* UTF-8 mode */
1961 if (utf8)
1962 {
1963 for (i = min; i < max; i++)
1964 {
1965 int len = 1;
1966 if (eptr >= md->end_subject) break;
1967 GETCHARLEN(c, eptr, len);
1968 if (c > 255)
1969 {
1970 if (op == OP_CLASS) break;
1971 }
1972 else
1973 {
1974 if ((data[c/8] & (1 << (c&7))) == 0) break;
1975 }
1976 eptr += len;
1977 }
1978 for (;;)
1979 {
1980 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1981 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1982 if (eptr-- == pp) break; /* Stop if tried at original pos */
1983 BACKCHAR(eptr);
1984 }
1985 }
1986 else
1987 #endif
1988 /* Not UTF-8 mode */
1989 {
1990 for (i = min; i < max; i++)
1991 {
1992 if (eptr >= md->end_subject) break;
1993 c = *eptr;
1994 if ((data[c/8] & (1 << (c&7))) == 0) break;
1995 eptr++;
1996 }
1997 while (eptr >= pp)
1998 {
1999 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2001 eptr--;
2002 }
2003 }
2004
2005 RRETURN(MATCH_NOMATCH);
2006 }
2007 }
2008 /* Control never gets here */
2009
2010
2011 /* Match an extended character class. This opcode is encountered only
2012 in UTF-8 mode, because that's the only time it is compiled. */
2013
2014 #ifdef SUPPORT_UTF8
2015 case OP_XCLASS:
2016 {
2017 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2018 ecode += GET(ecode, 1); /* Advance past the item */
2019
2020 switch (*ecode)
2021 {
2022 case OP_CRSTAR:
2023 case OP_CRMINSTAR:
2024 case OP_CRPLUS:
2025 case OP_CRMINPLUS:
2026 case OP_CRQUERY:
2027 case OP_CRMINQUERY:
2028 c = *ecode++ - OP_CRSTAR;
2029 minimize = (c & 1) != 0;
2030 min = rep_min[c]; /* Pick up values from tables; */
2031 max = rep_max[c]; /* zero for max => infinity */
2032 if (max == 0) max = INT_MAX;
2033 break;
2034
2035 case OP_CRRANGE:
2036 case OP_CRMINRANGE:
2037 minimize = (*ecode == OP_CRMINRANGE);
2038 min = GET2(ecode, 1);
2039 max = GET2(ecode, 3);
2040 if (max == 0) max = INT_MAX;
2041 ecode += 5;
2042 break;
2043
2044 default: /* No repeat follows */
2045 min = max = 1;
2046 break;
2047 }
2048
2049 /* First, ensure the minimum number of matches are present. */
2050
2051 for (i = 1; i <= min; i++)
2052 {
2053 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2054 GETCHARINC(c, eptr);
2055 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2056 }
2057
2058 /* If max == min we can continue with the main loop without the
2059 need to recurse. */
2060
2061 if (min == max) continue;
2062
2063 /* If minimizing, keep testing the rest of the expression and advancing
2064 the pointer while it matches the class. */
2065
2066 if (minimize)
2067 {
2068 for (fi = min;; fi++)
2069 {
2070 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2071 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2072 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2073 GETCHARINC(c, eptr);
2074 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2075 }
2076 /* Control never gets here */
2077 }
2078
2079 /* If maximizing, find the longest possible run, then work backwards. */
2080
2081 else
2082 {
2083 pp = eptr;
2084 for (i = min; i < max; i++)
2085 {
2086 int len = 1;
2087 if (eptr >= md->end_subject) break;
2088 GETCHARLEN(c, eptr, len);
2089 if (!_pcre_xclass(c, data)) break;
2090 eptr += len;
2091 }
2092 for(;;)
2093 {
2094 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2096 if (eptr-- == pp) break; /* Stop if tried at original pos */
2097 if (utf8) BACKCHAR(eptr);
2098 }
2099 RRETURN(MATCH_NOMATCH);
2100 }
2101
2102 /* Control never gets here */
2103 }
2104 #endif /* End of XCLASS */
2105
2106 /* Match a single character, casefully */
2107
2108 case OP_CHAR:
2109 #ifdef SUPPORT_UTF8
2110 if (utf8)
2111 {
2112 length = 1;
2113 ecode++;
2114 GETCHARLEN(fc, ecode, length);
2115 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2116 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2117 }
2118 else
2119 #endif
2120
2121 /* Non-UTF-8 mode */
2122 {
2123 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2124 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2125 ecode += 2;
2126 }
2127 break;
2128
2129 /* Match a single character, caselessly */
2130
2131 case OP_CHARNC:
2132 #ifdef SUPPORT_UTF8
2133 if (utf8)
2134 {
2135 length = 1;
2136 ecode++;
2137 GETCHARLEN(fc, ecode, length);
2138
2139 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2140
2141 /* If the pattern character's value is < 128, we have only one byte, and
2142 can use the fast lookup table. */
2143
2144 if (fc < 128)
2145 {
2146 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2147 }
2148
2149 /* Otherwise we must pick up the subject character */
2150
2151 else
2152 {
2153 unsigned int dc;
2154 GETCHARINC(dc, eptr);
2155 ecode += length;
2156
2157 /* If we have Unicode property support, we can use it to test the other
2158 case of the character, if there is one. */
2159
2160 if (fc != dc)
2161 {
2162 #ifdef SUPPORT_UCP
2163 if (dc != _pcre_ucp_othercase(fc))
2164 #endif
2165 RRETURN(MATCH_NOMATCH);
2166 }
2167 }
2168 }
2169 else
2170 #endif /* SUPPORT_UTF8 */
2171
2172 /* Non-UTF-8 mode */
2173 {
2174 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2175 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2176 ecode += 2;
2177 }
2178 break;
2179
2180 /* Match a single character repeatedly. */
2181
2182 case OP_EXACT:
2183 min = max = GET2(ecode, 1);
2184 ecode += 3;
2185 goto REPEATCHAR;
2186
2187 case OP_POSUPTO:
2188 possessive = TRUE;
2189 /* Fall through */
2190
2191 case OP_UPTO:
2192 case OP_MINUPTO:
2193 min = 0;
2194 max = GET2(ecode, 1);
2195 minimize = *ecode == OP_MINUPTO;
2196 ecode += 3;
2197 goto REPEATCHAR;
2198
2199 case OP_POSSTAR:
2200 possessive = TRUE;
2201 min = 0;
2202 max = INT_MAX;
2203 ecode++;
2204 goto REPEATCHAR;
2205
2206 case OP_POSPLUS:
2207 possessive = TRUE;
2208 min = 1;
2209 max = INT_MAX;
2210 ecode++;
2211 goto REPEATCHAR;
2212
2213 case OP_POSQUERY:
2214 possessive = TRUE;
2215 min = 0;
2216 max = 1;
2217 ecode++;
2218 goto REPEATCHAR;
2219
2220 case OP_STAR:
2221 case OP_MINSTAR:
2222 case OP_PLUS:
2223 case OP_MINPLUS:
2224 case OP_QUERY:
2225 case OP_MINQUERY:
2226 c = *ecode++ - OP_STAR;
2227 minimize = (c & 1) != 0;
2228 min = rep_min[c]; /* Pick up values from tables; */
2229 max = rep_max[c]; /* zero for max => infinity */
2230 if (max == 0) max = INT_MAX;
2231
2232 /* Common code for all repeated single-character matches. We can give
2233 up quickly if there are fewer than the minimum number of characters left in
2234 the subject. */
2235
2236 REPEATCHAR:
2237 #ifdef SUPPORT_UTF8
2238 if (utf8)
2239 {
2240 length = 1;
2241 charptr = ecode;
2242 GETCHARLEN(fc, ecode, length);
2243 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2244 ecode += length;
2245
2246 /* Handle multibyte character matching specially here. There is
2247 support for caseless matching if UCP support is present. */
2248
2249 if (length > 1)
2250 {
2251 #ifdef SUPPORT_UCP
2252 unsigned int othercase;
2253 if ((ims & PCRE_CASELESS) != 0 &&
2254 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2255 oclength = _pcre_ord2utf8(othercase, occhars);
2256 else oclength = 0;
2257 #endif /* SUPPORT_UCP */
2258
2259 for (i = 1; i <= min; i++)
2260 {
2261 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2262 #ifdef SUPPORT_UCP
2263 /* Need braces because of following else */
2264 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2265 else
2266 {
2267 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2268 eptr += oclength;
2269 }
2270 #else /* without SUPPORT_UCP */
2271 else { RRETURN(MATCH_NOMATCH); }
2272 #endif /* SUPPORT_UCP */
2273 }
2274
2275 if (min == max) continue;
2276
2277 if (minimize)
2278 {
2279 for (fi = min;; fi++)
2280 {
2281 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2282 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2283 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2284 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2285 #ifdef SUPPORT_UCP
2286 /* Need braces because of following else */
2287 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2288 else
2289 {
2290 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2291 eptr += oclength;
2292 }
2293 #else /* without SUPPORT_UCP */
2294 else { RRETURN (MATCH_NOMATCH); }
2295 #endif /* SUPPORT_UCP */
2296 }
2297 /* Control never gets here */
2298 }
2299
2300 else /* Maximize */
2301 {
2302 pp = eptr;
2303 for (i = min; i < max; i++)
2304 {
2305 if (eptr > md->end_subject - length) break;
2306 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2307 #ifdef SUPPORT_UCP
2308 else if (oclength == 0) break;
2309 else
2310 {
2311 if (memcmp(eptr, occhars, oclength) != 0) break;
2312 eptr += oclength;
2313 }
2314 #else /* without SUPPORT_UCP */
2315 else break;
2316 #endif /* SUPPORT_UCP */
2317 }
2318
2319 if (possessive) continue;
2320 for(;;)
2321 {
2322 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2323 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2324 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2325 #ifdef SUPPORT_UCP
2326 eptr--;
2327 BACKCHAR(eptr);
2328 #else /* without SUPPORT_UCP */
2329 eptr -= length;
2330 #endif /* SUPPORT_UCP */
2331 }
2332 }
2333 /* Control never gets here */
2334 }
2335
2336 /* If the length of a UTF-8 character is 1, we fall through here, and
2337 obey the code as for non-UTF-8 characters below, though in this case the
2338 value of fc will always be < 128. */
2339 }
2340 else
2341 #endif /* SUPPORT_UTF8 */
2342
2343 /* When not in UTF-8 mode, load a single-byte character. */
2344 {
2345 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2346 fc = *ecode++;
2347 }
2348
2349 /* The value of fc at this point is always less than 256, though we may or
2350 may not be in UTF-8 mode. The code is duplicated for the caseless and
2351 caseful cases, for speed, since matching characters is likely to be quite
2352 common. First, ensure the minimum number of matches are present. If min =
2353 max, continue at the same level without recursing. Otherwise, if
2354 minimizing, keep trying the rest of the expression and advancing one
2355 matching character if failing, up to the maximum. Alternatively, if
2356 maximizing, find the maximum number of characters and work backwards. */
2357
2358 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2359 max, eptr));
2360
2361 if ((ims & PCRE_CASELESS) != 0)
2362 {
2363 fc = md->lcc[fc];
2364 for (i = 1; i <= min; i++)
2365 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2366 if (min == max) continue;
2367 if (minimize)
2368 {
2369 for (fi = min;; fi++)
2370 {
2371 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2372 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2373 if (fi >= max || eptr >= md->end_subject ||
2374 fc != md->lcc[*eptr++])
2375 RRETURN(MATCH_NOMATCH);
2376 }
2377 /* Control never gets here */
2378 }
2379 else /* Maximize */
2380 {
2381 pp = eptr;
2382 for (i = min; i < max; i++)
2383 {
2384 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2385 eptr++;
2386 }
2387 if (possessive) continue;
2388 while (eptr >= pp)
2389 {
2390 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2391 eptr--;
2392 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2393 }
2394 RRETURN(MATCH_NOMATCH);
2395 }
2396 /* Control never gets here */
2397 }
2398
2399 /* Caseful comparisons (includes all multi-byte characters) */
2400
2401 else
2402 {
2403 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2404 if (min == max) continue;
2405 if (minimize)
2406 {
2407 for (fi = min;; fi++)
2408 {
2409 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2410 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2411 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2412 RRETURN(MATCH_NOMATCH);
2413 }
2414 /* Control never gets here */
2415 }
2416 else /* Maximize */
2417 {
2418 pp = eptr;
2419 for (i = min; i < max; i++)
2420 {
2421 if (eptr >= md->end_subject || fc != *eptr) break;
2422 eptr++;
2423 }
2424 if (possessive) continue;
2425 while (eptr >= pp)
2426 {
2427 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2428 eptr--;
2429 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2430 }
2431 RRETURN(MATCH_NOMATCH);
2432 }
2433 }
2434 /* Control never gets here */
2435
2436 /* Match a negated single one-byte character. The character we are
2437 checking can be multibyte. */
2438
2439 case OP_NOT:
2440 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2441 ecode++;
2442 GETCHARINCTEST(c, eptr);
2443 if ((ims & PCRE_CASELESS) != 0)
2444 {
2445 #ifdef SUPPORT_UTF8
2446 if (c < 256)
2447 #endif
2448 c = md->lcc[c];
2449 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2450 }
2451 else
2452 {
2453 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2454 }
2455 break;
2456
2457 /* Match a negated single one-byte character repeatedly. This is almost a
2458 repeat of the code for a repeated single character, but I haven't found a
2459 nice way of commoning these up that doesn't require a test of the
2460 positive/negative option for each character match. Maybe that wouldn't add
2461 very much to the time taken, but character matching *is* what this is all
2462 about... */
2463
2464 case OP_NOTEXACT:
2465 min = max = GET2(ecode, 1);
2466 ecode += 3;
2467 goto REPEATNOTCHAR;
2468
2469 case OP_NOTUPTO:
2470 case OP_NOTMINUPTO:
2471 min = 0;
2472 max = GET2(ecode, 1);
2473 minimize = *ecode == OP_NOTMINUPTO;
2474 ecode += 3;
2475 goto REPEATNOTCHAR;
2476
2477 case OP_NOTPOSSTAR:
2478 possessive = TRUE;
2479 min = 0;
2480 max = INT_MAX;
2481 ecode++;
2482 goto REPEATNOTCHAR;
2483
2484 case OP_NOTPOSPLUS:
2485 possessive = TRUE;
2486 min = 1;
2487 max = INT_MAX;
2488 ecode++;
2489 goto REPEATNOTCHAR;
2490
2491 case OP_NOTPOSQUERY:
2492 possessive = TRUE;
2493 min = 0;
2494 max = 1;
2495 ecode++;
2496 goto REPEATNOTCHAR;
2497
2498 case OP_NOTPOSUPTO:
2499 possessive = TRUE;
2500 min = 0;
2501 max = GET2(ecode, 1);
2502 ecode += 3;
2503 goto REPEATNOTCHAR;
2504
2505 case OP_NOTSTAR:
2506 case OP_NOTMINSTAR:
2507 case OP_NOTPLUS:
2508 case OP_NOTMINPLUS:
2509 case OP_NOTQUERY:
2510 case OP_NOTMINQUERY:
2511 c = *ecode++ - OP_NOTSTAR;
2512 minimize = (c & 1) != 0;
2513 min = rep_min[c]; /* Pick up values from tables; */
2514 max = rep_max[c]; /* zero for max => infinity */
2515 if (max == 0) max = INT_MAX;
2516
2517 /* Common code for all repeated single-byte matches. We can give up quickly
2518 if there are fewer than the minimum number of bytes left in the
2519 subject. */
2520
2521 REPEATNOTCHAR:
2522 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2523 fc = *ecode++;
2524
2525 /* The code is duplicated for the caseless and caseful cases, for speed,
2526 since matching characters is likely to be quite common. First, ensure the
2527 minimum number of matches are present. If min = max, continue at the same
2528 level without recursing. Otherwise, if minimizing, keep trying the rest of
2529 the expression and advancing one matching character if failing, up to the
2530 maximum. Alternatively, if maximizing, find the maximum number of
2531 characters and work backwards. */
2532
2533 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2534 max, eptr));
2535
2536 if ((ims & PCRE_CASELESS) != 0)
2537 {
2538 fc = md->lcc[fc];
2539
2540 #ifdef SUPPORT_UTF8
2541 /* UTF-8 mode */
2542 if (utf8)
2543 {
2544 register unsigned int d;
2545 for (i = 1; i <= min; i++)
2546 {
2547 GETCHARINC(d, eptr);
2548 if (d < 256) d = md->lcc[d];
2549 if (fc == d) RRETURN(MATCH_NOMATCH);
2550 }
2551 }
2552 else
2553 #endif
2554
2555 /* Not UTF-8 mode */
2556 {
2557 for (i = 1; i <= min; i++)
2558 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2559 }
2560
2561 if (min == max) continue;
2562
2563 if (minimize)
2564 {
2565 #ifdef SUPPORT_UTF8
2566 /* UTF-8 mode */
2567 if (utf8)
2568 {
2569 register unsigned int d;
2570 for (fi = min;; fi++)
2571 {
2572 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2573 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2574 GETCHARINC(d, eptr);
2575 if (d < 256) d = md->lcc[d];
2576 if (fi >= max || eptr >= md->end_subject || fc == d)
2577 RRETURN(MATCH_NOMATCH);
2578 }
2579 }
2580 else
2581 #endif
2582 /* Not UTF-8 mode */
2583 {
2584 for (fi = min;; fi++)
2585 {
2586 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2587 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2588 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2589 RRETURN(MATCH_NOMATCH);
2590 }
2591 }
2592 /* Control never gets here */
2593 }
2594
2595 /* Maximize case */
2596
2597 else
2598 {
2599 pp = eptr;
2600
2601 #ifdef SUPPORT_UTF8
2602 /* UTF-8 mode */
2603 if (utf8)
2604 {
2605 register unsigned int d;
2606 for (i = min; i < max; i++)
2607 {
2608 int len = 1;
2609 if (eptr >= md->end_subject) break;
2610 GETCHARLEN(d, eptr, len);
2611 if (d < 256) d = md->lcc[d];
2612 if (fc == d) break;
2613 eptr += len;
2614 }
2615 if (possessive) continue;
2616 for(;;)
2617 {
2618 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2619 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2620 if (eptr-- == pp) break; /* Stop if tried at original pos */
2621 BACKCHAR(eptr);
2622 }
2623 }
2624 else
2625 #endif
2626 /* Not UTF-8 mode */
2627 {
2628 for (i = min; i < max; i++)
2629 {
2630 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2631 eptr++;
2632 }
2633 if (possessive) continue;
2634 while (eptr >= pp)
2635 {
2636 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2637 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2638 eptr--;
2639 }
2640 }
2641
2642 RRETURN(MATCH_NOMATCH);
2643 }
2644 /* Control never gets here */
2645 }
2646
2647 /* Caseful comparisons */
2648
2649 else
2650 {
2651 #ifdef SUPPORT_UTF8
2652 /* UTF-8 mode */
2653 if (utf8)
2654 {
2655 register unsigned int d;
2656 for (i = 1; i <= min; i++)
2657 {
2658 GETCHARINC(d, eptr);
2659 if (fc == d) RRETURN(MATCH_NOMATCH);
2660 }
2661 }
2662 else
2663 #endif
2664 /* Not UTF-8 mode */
2665 {
2666 for (i = 1; i <= min; i++)
2667 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2668 }
2669
2670 if (min == max) continue;
2671
2672 if (minimize)
2673 {
2674 #ifdef SUPPORT_UTF8
2675 /* UTF-8 mode */
2676 if (utf8)
2677 {
2678 register unsigned int d;
2679 for (fi = min;; fi++)
2680 {
2681 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2682 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2683 GETCHARINC(d, eptr);
2684 if (fi >= max || eptr >= md->end_subject || fc == d)
2685 RRETURN(MATCH_NOMATCH);
2686 }
2687 }
2688 else
2689 #endif
2690 /* Not UTF-8 mode */
2691 {
2692 for (fi = min;; fi++)
2693 {
2694 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2695 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2696 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2697 RRETURN(MATCH_NOMATCH);
2698 }
2699 }
2700 /* Control never gets here */
2701 }
2702
2703 /* Maximize case */
2704
2705 else
2706 {
2707 pp = eptr;
2708
2709 #ifdef SUPPORT_UTF8
2710 /* UTF-8 mode */
2711 if (utf8)
2712 {
2713 register unsigned int d;
2714 for (i = min; i < max; i++)
2715 {
2716 int len = 1;
2717 if (eptr >= md->end_subject) break;
2718 GETCHARLEN(d, eptr, len);
2719 if (fc == d) break;
2720 eptr += len;
2721 }
2722 if (possessive) continue;
2723 for(;;)
2724 {
2725 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2726 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2727 if (eptr-- == pp) break; /* Stop if tried at original pos */
2728 BACKCHAR(eptr);
2729 }
2730 }
2731 else
2732 #endif
2733 /* Not UTF-8 mode */
2734 {
2735 for (i = min; i < max; i++)
2736 {
2737 if (eptr >= md->end_subject || fc == *eptr) break;
2738 eptr++;
2739 }
2740 if (possessive) continue;
2741 while (eptr >= pp)
2742 {
2743 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2744 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2745 eptr--;
2746 }
2747 }
2748
2749 RRETURN(MATCH_NOMATCH);
2750 }
2751 }
2752 /* Control never gets here */
2753
2754 /* Match a single character type repeatedly; several different opcodes
2755 share code. This is very similar to the code for single characters, but we
2756 repeat it in the interests of efficiency. */
2757
2758 case OP_TYPEEXACT:
2759 min = max = GET2(ecode, 1);
2760 minimize = TRUE;
2761 ecode += 3;
2762 goto REPEATTYPE;
2763
2764 case OP_TYPEUPTO:
2765 case OP_TYPEMINUPTO:
2766 min = 0;
2767 max = GET2(ecode, 1);
2768 minimize = *ecode == OP_TYPEMINUPTO;
2769 ecode += 3;
2770 goto REPEATTYPE;
2771
2772 case OP_TYPEPOSSTAR:
2773 possessive = TRUE;
2774 min = 0;
2775 max = INT_MAX;
2776 ecode++;
2777 goto REPEATTYPE;
2778
2779 case OP_TYPEPOSPLUS:
2780 possessive = TRUE;
2781 min = 1;
2782 max = INT_MAX;
2783 ecode++;
2784 goto REPEATTYPE;
2785
2786 case OP_TYPEPOSQUERY:
2787 possessive = TRUE;
2788 min = 0;
2789 max = 1;
2790 ecode++;
2791 goto REPEATTYPE;
2792
2793 case OP_TYPEPOSUPTO:
2794 possessive = TRUE;
2795 min = 0;
2796 max = GET2(ecode, 1);
2797 ecode += 3;
2798 goto REPEATTYPE;
2799
2800 case OP_TYPESTAR:
2801 case OP_TYPEMINSTAR:
2802 case OP_TYPEPLUS:
2803 case OP_TYPEMINPLUS:
2804 case OP_TYPEQUERY:
2805 case OP_TYPEMINQUERY:
2806 c = *ecode++ - OP_TYPESTAR;
2807 minimize = (c & 1) != 0;
2808 min = rep_min[c]; /* Pick up values from tables; */
2809 max = rep_max[c]; /* zero for max => infinity */
2810 if (max == 0) max = INT_MAX;
2811
2812 /* Common code for all repeated single character type matches. Note that
2813 in UTF-8 mode, '.' matches a character of any length, but for the other
2814 character types, the valid characters are all one-byte long. */
2815
2816 REPEATTYPE:
2817 ctype = *ecode++; /* Code for the character type */
2818
2819 #ifdef SUPPORT_UCP
2820 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2821 {
2822 prop_fail_result = ctype == OP_NOTPROP;
2823 prop_type = *ecode++;
2824 prop_value = *ecode++;
2825 }
2826 else prop_type = -1;
2827 #endif
2828
2829 /* First, ensure the minimum number of matches are present. Use inline
2830 code for maximizing the speed, and do the type test once at the start
2831 (i.e. keep it out of the loop). Also we can test that there are at least
2832 the minimum number of bytes before we start. This isn't as effective in
2833 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2834 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2835 and single-bytes. */
2836
2837 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2838 if (min > 0)
2839 {
2840 #ifdef SUPPORT_UCP
2841 if (prop_type >= 0)
2842 {
2843 switch(prop_type)
2844 {
2845 case PT_ANY:
2846 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2847 for (i = 1; i <= min; i++)
2848 {
2849 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2850 GETCHARINCTEST(c, eptr);
2851 }
2852 break;
2853
2854 case PT_LAMP:
2855 for (i = 1; i <= min; i++)
2856 {
2857 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2858 GETCHARINCTEST(c, eptr);
2859 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2860 if ((prop_chartype == ucp_Lu ||
2861 prop_chartype == ucp_Ll ||
2862 prop_chartype == ucp_Lt) == prop_fail_result)
2863 RRETURN(MATCH_NOMATCH);
2864 }
2865 break;
2866
2867 case PT_GC:
2868 for (i = 1; i <= min; i++)
2869 {
2870 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2871 GETCHARINCTEST(c, eptr);
2872 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2873 if ((prop_category == prop_value) == prop_fail_result)
2874 RRETURN(MATCH_NOMATCH);
2875 }
2876 break;
2877
2878 case PT_PC:
2879 for (i = 1; i <= min; i++)
2880 {
2881 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2882 GETCHARINCTEST(c, eptr);
2883 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2884 if ((prop_chartype == prop_value) == prop_fail_result)
2885 RRETURN(MATCH_NOMATCH);
2886 }
2887 break;
2888
2889 case PT_SC:
2890 for (i = 1; i <= min; i++)
2891 {
2892 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2893 GETCHARINCTEST(c, eptr);
2894 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2895 if ((prop_script == prop_value) == prop_fail_result)
2896 RRETURN(MATCH_NOMATCH);
2897 }
2898 break;
2899
2900 default:
2901 RRETURN(PCRE_ERROR_INTERNAL);
2902 }
2903 }
2904
2905 /* Match extended Unicode sequences. We will get here only if the
2906 support is in the binary; otherwise a compile-time error occurs. */
2907
2908 else if (ctype == OP_EXTUNI)
2909 {
2910 for (i = 1; i <= min; i++)
2911 {
2912 GETCHARINCTEST(c, eptr);
2913 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2914 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2915 while (eptr < md->end_subject)
2916 {
2917 int len = 1;
2918 if (!utf8) c = *eptr; else
2919 {
2920 GETCHARLEN(c, eptr, len);
2921 }
2922 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2923 if (prop_category != ucp_M) break;
2924 eptr += len;
2925 }
2926 }
2927 }
2928
2929 else
2930 #endif /* SUPPORT_UCP */
2931
2932 /* Handle all other cases when the coding is UTF-8 */
2933
2934 #ifdef SUPPORT_UTF8
2935 if (utf8) switch(ctype)
2936 {
2937 case OP_ANY:
2938 for (i = 1; i <= min; i++)
2939 {
2940 if (eptr >= md->end_subject ||
2941 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2942 RRETURN(MATCH_NOMATCH);
2943 eptr++;
2944 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2945 }
2946 break;
2947
2948 case OP_ANYBYTE:
2949 eptr += min;
2950 break;
2951
2952 case OP_ANYNL:
2953 for (i = 1; i <= min; i++)
2954 {
2955 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2956 GETCHARINC(c, eptr);
2957 switch(c)
2958 {
2959 default: RRETURN(MATCH_NOMATCH);
2960 case 0x000d:
2961 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2962 break;
2963
2964 case 0x000a:
2965 break;
2966
2967 case 0x000b:
2968 case 0x000c:
2969 case 0x0085:
2970 case 0x2028:
2971 case 0x2029:
2972 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2973 break;
2974 }
2975 }
2976 break;
2977
2978 case OP_NOT_HSPACE:
2979 for (i = 1; i <= min; i++)
2980 {
2981 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2982 GETCHARINC(c, eptr);
2983 switch(c)
2984 {
2985 default: break;
2986 case 0x09: /* HT */
2987 case 0x20: /* SPACE */
2988 case 0xa0: /* NBSP */
2989 case 0x1680: /* OGHAM SPACE MARK */
2990 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2991 case 0x2000: /* EN QUAD */
2992 case 0x2001: /* EM QUAD */
2993 case 0x2002: /* EN SPACE */
2994 case 0x2003: /* EM SPACE */
2995 case 0x2004: /* THREE-PER-EM SPACE */
2996 case 0x2005: /* FOUR-PER-EM SPACE */
2997 case 0x2006: /* SIX-PER-EM SPACE */
2998 case 0x2007: /* FIGURE SPACE */
2999 case 0x2008: /* PUNCTUATION SPACE */
3000 case 0x2009: /* THIN SPACE */
3001 case 0x200A: /* HAIR SPACE */
3002 case 0x202f: /* NARROW NO-BREAK SPACE */
3003 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3004 case 0x3000: /* IDEOGRAPHIC SPACE */
3005 RRETURN(MATCH_NOMATCH);
3006 }
3007 }
3008 break;
3009
3010 case OP_HSPACE:
3011 for (i = 1; i <= min; i++)
3012 {
3013 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3014 GETCHARINC(c, eptr);
3015 switch(c)
3016 {
3017 default: RRETURN(MATCH_NOMATCH);
3018 case 0x09: /* HT */
3019 case 0x20: /* SPACE */
3020 case 0xa0: /* NBSP */
3021 case 0x1680: /* OGHAM SPACE MARK */
3022 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3023 case 0x2000: /* EN QUAD */
3024 case 0x2001: /* EM QUAD */
3025 case 0x2002: /* EN SPACE */
3026 case 0x2003: /* EM SPACE */
3027 case 0x2004: /* THREE-PER-EM SPACE */
3028 case 0x2005: /* FOUR-PER-EM SPACE */
3029 case 0x2006: /* SIX-PER-EM SPACE */
3030 case 0x2007: /* FIGURE SPACE */
3031 case 0x2008: /* PUNCTUATION SPACE */
3032 case 0x2009: /* THIN SPACE */
3033 case 0x200A: /* HAIR SPACE */
3034 case 0x202f: /* NARROW NO-BREAK SPACE */
3035 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3036 case 0x3000: /* IDEOGRAPHIC SPACE */
3037 break;
3038 }
3039 }
3040 break;
3041
3042 case OP_NOT_VSPACE:
3043 for (i = 1; i <= min; i++)
3044 {
3045 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3046 GETCHARINC(c, eptr);
3047 switch(c)
3048 {
3049 default: break;
3050 case 0x0a: /* LF */
3051 case 0x0b: /* VT */
3052 case 0x0c: /* FF */
3053 case 0x0d: /* CR */
3054 case 0x85: /* NEL */
3055 case 0x2028: /* LINE SEPARATOR */
3056 case 0x2029: /* PARAGRAPH SEPARATOR */
3057 RRETURN(MATCH_NOMATCH);
3058 }
3059 }
3060 break;
3061
3062 case OP_VSPACE:
3063 for (i = 1; i <= min; i++)
3064 {
3065 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3066 GETCHARINC(c, eptr);
3067 switch(c)
3068 {
3069 default: RRETURN(MATCH_NOMATCH);
3070 case 0x0a: /* LF */
3071 case 0x0b: /* VT */
3072 case 0x0c: /* FF */
3073 case 0x0d: /* CR */
3074 case 0x85: /* NEL */
3075 case 0x2028: /* LINE SEPARATOR */
3076 case 0x2029: /* PARAGRAPH SEPARATOR */
3077 break;
3078 }
3079 }
3080 break;
3081
3082 case OP_NOT_DIGIT:
3083 for (i = 1; i <= min; i++)
3084 {
3085 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3086 GETCHARINC(c, eptr);
3087 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3088 RRETURN(MATCH_NOMATCH);
3089 }
3090 break;
3091
3092 case OP_DIGIT:
3093 for (i = 1; i <= min; i++)
3094 {
3095 if (eptr >= md->end_subject ||
3096 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3097 RRETURN(MATCH_NOMATCH);
3098 /* No need to skip more bytes - we know it's a 1-byte character */
3099 }
3100 break;
3101
3102 case OP_NOT_WHITESPACE:
3103 for (i = 1; i <= min; i++)
3104 {
3105 if (eptr >= md->end_subject ||
3106 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3107 RRETURN(MATCH_NOMATCH);
3108 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3109 }
3110 break;
3111
3112 case OP_WHITESPACE:
3113 for (i = 1; i <= min; i++)
3114 {
3115 if (eptr >= md->end_subject ||
3116 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3117 RRETURN(MATCH_NOMATCH);
3118 /* No need to skip more bytes - we know it's a 1-byte character */
3119 }
3120 break;
3121
3122 case OP_NOT_WORDCHAR:
3123 for (i = 1; i <= min; i++)
3124 {
3125 if (eptr >= md->end_subject ||
3126 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3127 RRETURN(MATCH_NOMATCH);
3128 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3129 }
3130 break;
3131
3132 case OP_WORDCHAR:
3133 for (i = 1; i <= min; i++)
3134 {
3135 if (eptr >= md->end_subject ||
3136 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3137 RRETURN(MATCH_NOMATCH);
3138 /* No need to skip more bytes - we know it's a 1-byte character */
3139 }
3140 break;
3141
3142 default:
3143 RRETURN(PCRE_ERROR_INTERNAL);
3144 } /* End switch(ctype) */
3145
3146 else
3147 #endif /* SUPPORT_UTF8 */
3148
3149 /* Code for the non-UTF-8 case for minimum matching of operators other
3150 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3151 number of bytes present, as this was tested above. */
3152
3153 switch(ctype)
3154 {
3155 case OP_ANY:
3156 if ((ims & PCRE_DOTALL) == 0)
3157 {
3158 for (i = 1; i <= min; i++)
3159 {
3160 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3161 eptr++;
3162 }
3163 }
3164 else eptr += min;
3165 break;
3166
3167 case OP_ANYBYTE:
3168 eptr += min;
3169 break;
3170
3171 /* Because of the CRLF case, we can't assume the minimum number of
3172 bytes are present in this case. */
3173
3174 case OP_ANYNL:
3175 for (i = 1; i <= min; i++)
3176 {
3177 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3178 switch(*eptr++)
3179 {
3180 default: RRETURN(MATCH_NOMATCH);
3181 case 0x000d:
3182 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3183 break;
3184 case 0x000a:
3185 break;
3186
3187 case 0x000b:
3188 case 0x000c:
3189 case 0x0085:
3190 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3191 break;
3192 }
3193 }
3194 break;
3195
3196 case OP_NOT_HSPACE:
3197 for (i = 1; i <= min; i++)
3198 {
3199 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3200 switch(*eptr++)
3201 {
3202 default: break;
3203 case 0x09: /* HT */
3204 case 0x20: /* SPACE */
3205 case 0xa0: /* NBSP */
3206 RRETURN(MATCH_NOMATCH);
3207 }
3208 }
3209 break;
3210
3211 case OP_HSPACE:
3212 for (i = 1; i <= min; i++)
3213 {
3214 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3215 switch(*eptr++)
3216 {
3217 default: RRETURN(MATCH_NOMATCH);
3218 case 0x09: /* HT */
3219 case 0x20: /* SPACE */
3220 case 0xa0: /* NBSP */
3221 break;
3222 }
3223 }
3224 break;
3225
3226 case OP_NOT_VSPACE:
3227 for (i = 1; i <= min; i++)
3228 {
3229 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3230 switch(*eptr++)
3231 {
3232 default: break;
3233 case 0x0a: /* LF */
3234 case 0x0b: /* VT */
3235 case 0x0c: /* FF */
3236 case 0x0d: /* CR */
3237 case 0x85: /* NEL */
3238 RRETURN(MATCH_NOMATCH);
3239 }
3240 }
3241 break;
3242
3243 case OP_VSPACE:
3244 for (i = 1; i <= min; i++)
3245 {
3246 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3247 switch(*eptr++)
3248 {
3249 default: RRETURN(MATCH_NOMATCH);
3250 case 0x0a: /* LF */
3251 case 0x0b: /* VT */
3252 case 0x0c: /* FF */
3253 case 0x0d: /* CR */
3254 case 0x85: /* NEL */
3255 break;
3256 }
3257 }
3258 break;
3259
3260 case OP_NOT_DIGIT:
3261 for (i = 1; i <= min; i++)
3262 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3263 break;
3264
3265 case OP_DIGIT:
3266 for (i = 1; i <= min; i++)
3267 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3268 break;
3269
3270 case OP_NOT_WHITESPACE:
3271 for (i = 1; i <= min; i++)
3272 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3273 break;
3274
3275 case OP_WHITESPACE:
3276 for (i = 1; i <= min; i++)
3277 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3278 break;
3279
3280 case OP_NOT_WORDCHAR:
3281 for (i = 1; i <= min; i++)
3282 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3283 RRETURN(MATCH_NOMATCH);
3284 break;
3285
3286 case OP_WORDCHAR:
3287 for (i = 1; i <= min; i++)
3288 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3289 RRETURN(MATCH_NOMATCH);
3290 break;
3291
3292 default:
3293 RRETURN(PCRE_ERROR_INTERNAL);
3294 }
3295 }
3296
3297 /* If min = max, continue at the same level without recursing */
3298
3299 if (min == max) continue;
3300
3301 /* If minimizing, we have to test the rest of the pattern before each
3302 subsequent match. Again, separate the UTF-8 case for speed, and also
3303 separate the UCP cases. */
3304
3305 if (minimize)
3306 {
3307 #ifdef SUPPORT_UCP
3308 if (prop_type >= 0)
3309 {
3310 switch(prop_type)
3311 {
3312 case PT_ANY:
3313 for (fi = min;; fi++)
3314 {
3315 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3316 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3317 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3318 GETCHARINC(c, eptr);
3319 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3320 }
3321 /* Control never gets here */
3322
3323 case PT_LAMP:
3324 for (fi = min;; fi++)
3325 {
3326 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3327 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3328 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3329 GETCHARINC(c, eptr);
3330 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3331 if ((prop_chartype == ucp_Lu ||
3332 prop_chartype == ucp_Ll ||
3333 prop_chartype == ucp_Lt) == prop_fail_result)
3334 RRETURN(MATCH_NOMATCH);
3335 }
3336 /* Control never gets here */
3337
3338 case PT_GC:
3339 for (fi = min;; fi++)
3340 {
3341 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3342 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3343 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3344 GETCHARINC(c, eptr);
3345 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3346 if ((prop_category == prop_value) == prop_fail_result)
3347 RRETURN(MATCH_NOMATCH);
3348 }
3349 /* Control never gets here */
3350
3351 case PT_PC:
3352 for (fi = min;; fi++)
3353 {
3354 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3356 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3357 GETCHARINC(c, eptr);
3358 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3359 if ((prop_chartype == prop_value) == prop_fail_result)
3360 RRETURN(MATCH_NOMATCH);
3361 }
3362 /* Control never gets here */
3363
3364 case PT_SC:
3365 for (fi = min;; fi++)
3366 {
3367 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3368 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3369 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3370 GETCHARINC(c, eptr);
3371 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3372 if ((prop_script == prop_value) == prop_fail_result)
3373 RRETURN(MATCH_NOMATCH);
3374 }
3375 /* Control never gets here */
3376
3377 default:
3378 RRETURN(PCRE_ERROR_INTERNAL);
3379 }
3380 }
3381
3382 /* Match extended Unicode sequences. We will get here only if the
3383 support is in the binary; otherwise a compile-time error occurs. */
3384
3385 else if (ctype == OP_EXTUNI)
3386 {
3387 for (fi = min;; fi++)
3388 {
3389 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3391 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3392 GETCHARINCTEST(c, eptr);
3393 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3394 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3395 while (eptr < md->end_subject)
3396 {
3397 int len = 1;
3398 if (!utf8) c = *eptr; else
3399 {
3400 GETCHARLEN(c, eptr, len);
3401 }
3402 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3403 if (prop_category != ucp_M) break;
3404 eptr += len;
3405 }
3406 }
3407 }
3408
3409 else
3410 #endif /* SUPPORT_UCP */
3411
3412 #ifdef SUPPORT_UTF8
3413 /* UTF-8 mode */
3414 if (utf8)
3415 {
3416 for (fi = min;; fi++)
3417 {
3418 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3419 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3420 if (fi >= max || eptr >= md->end_subject ||
3421 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3422 IS_NEWLINE(eptr)))
3423 RRETURN(MATCH_NOMATCH);
3424
3425 GETCHARINC(c, eptr);
3426 switch(ctype)
3427 {
3428 case OP_ANY: /* This is the DOTALL case */
3429 break;
3430
3431 case OP_ANYBYTE:
3432 break;
3433
3434 case OP_ANYNL:
3435 switch(c)
3436 {
3437 default: RRETURN(MATCH_NOMATCH);
3438 case 0x000d:
3439 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3440 break;
3441 case 0x000a:
3442 break;
3443
3444 case 0x000b:
3445 case 0x000c:
3446 case 0x0085:
3447 case 0x2028:
3448 case 0x2029:
3449 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3450 break;
3451 }
3452 break;
3453
3454 case OP_NOT_HSPACE:
3455 switch(c)
3456 {
3457 default: break;
3458 case 0x09: /* HT */
3459 case 0x20: /* SPACE */
3460 case 0xa0: /* NBSP */
3461 case 0x1680: /* OGHAM SPACE MARK */
3462 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3463 case 0x2000: /* EN QUAD */
3464 case 0x2001: /* EM QUAD */
3465 case 0x2002: /* EN SPACE */
3466 case 0x2003: /* EM SPACE */
3467 case 0x2004: /* THREE-PER-EM SPACE */
3468 case 0x2005: /* FOUR-PER-EM SPACE */
3469 case 0x2006: /* SIX-PER-EM SPACE */
3470 case 0x2007: /* FIGURE SPACE */
3471 case 0x2008: /* PUNCTUATION SPACE */
3472 case 0x2009: /* THIN SPACE */
3473 case 0x200A: /* HAIR SPACE */
3474 case 0x202f: /* NARROW NO-BREAK SPACE */
3475 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3476 case 0x3000: /* IDEOGRAPHIC SPACE */
3477 RRETURN(MATCH_NOMATCH);
3478 }
3479 break;
3480
3481 case OP_HSPACE:
3482 switch(c)
3483 {
3484 default: RRETURN(MATCH_NOMATCH);
3485 case 0x09: /* HT */
3486 case 0x20: /* SPACE */
3487 case 0xa0: /* NBSP */
3488 case 0x1680: /* OGHAM SPACE MARK */
3489 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3490 case 0x2000: /* EN QUAD */
3491 case 0x2001: /* EM QUAD */
3492 case 0x2002: /* EN SPACE */
3493 case 0x2003: /* EM SPACE */
3494 case 0x2004: /* THREE-PER-EM SPACE */
3495 case 0x2005: /* FOUR-PER-EM SPACE */
3496 case 0x2006: /* SIX-PER-EM SPACE */
3497 case 0x2007: /* FIGURE SPACE */
3498 case 0x2008: /* PUNCTUATION SPACE */
3499 case 0x2009: /* THIN SPACE */
3500 case 0x200A: /* HAIR SPACE */
3501 case 0x202f: /* NARROW NO-BREAK SPACE */
3502 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3503 case 0x3000: /* IDEOGRAPHIC SPACE */
3504 break;
3505 }
3506 break;
3507
3508 case OP_NOT_VSPACE:
3509 switch(c)
3510 {
3511 default: break;
3512 case 0x0a: /* LF */
3513 case 0x0b: /* VT */
3514 case 0x0c: /* FF */
3515 case 0x0d: /* CR */
3516 case 0x85: /* NEL */
3517 case 0x2028: /* LINE SEPARATOR */
3518 case 0x2029: /* PARAGRAPH SEPARATOR */
3519 RRETURN(MATCH_NOMATCH);
3520 }
3521 break;
3522
3523 case OP_VSPACE:
3524 switch(c)
3525 {
3526 default: RRETURN(MATCH_NOMATCH);
3527 case 0x0a: /* LF */
3528 case 0x0b: /* VT */
3529 case 0x0c: /* FF */
3530 case 0x0d: /* CR */
3531 case 0x85: /* NEL */
3532 case 0x2028: /* LINE SEPARATOR */
3533 case 0x2029: /* PARAGRAPH SEPARATOR */
3534 break;
3535 }
3536 break;
3537
3538 case OP_NOT_DIGIT:
3539 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3540 RRETURN(MATCH_NOMATCH);
3541 break;
3542
3543 case OP_DIGIT:
3544 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3545 RRETURN(MATCH_NOMATCH);
3546 break;
3547
3548 case OP_NOT_WHITESPACE:
3549 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3550 RRETURN(MATCH_NOMATCH);
3551 break;
3552
3553 case OP_WHITESPACE:
3554 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3555 RRETURN(MATCH_NOMATCH);
3556 break;
3557
3558 case OP_NOT_WORDCHAR:
3559 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3560 RRETURN(MATCH_NOMATCH);
3561 break;
3562
3563 case OP_WORDCHAR:
3564 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3565 RRETURN(MATCH_NOMATCH);
3566 break;
3567
3568 default:
3569 RRETURN(PCRE_ERROR_INTERNAL);
3570 }
3571 }
3572 }
3573 else
3574 #endif
3575 /* Not UTF-8 mode */
3576 {
3577 for (fi = min;; fi++)
3578 {
3579 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3580 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3581 if (fi >= max || eptr >= md->end_subject ||
3582 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3583 RRETURN(MATCH_NOMATCH);
3584
3585 c = *eptr++;
3586 switch(ctype)
3587 {
3588 case OP_ANY: /* This is the DOTALL case */
3589 break;
3590
3591 case OP_ANYBYTE:
3592 break;
3593
3594 case OP_ANYNL:
3595 switch(c)
3596 {
3597 default: RRETURN(MATCH_NOMATCH);
3598 case 0x000d:
3599 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3600 break;
3601
3602 case 0x000a:
3603 break;
3604
3605 case 0x000b:
3606 case 0x000c:
3607 case 0x0085:
3608 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3609 break;
3610 }
3611 break;
3612
3613 case OP_NOT_HSPACE:
3614 switch(c)
3615 {
3616 default: break;
3617 case 0x09: /* HT */
3618 case 0x20: /* SPACE */
3619 case 0xa0: /* NBSP */
3620 RRETURN(MATCH_NOMATCH);
3621 }
3622 break;
3623
3624 case OP_HSPACE:
3625 switch(c)
3626 {
3627 default: RRETURN(MATCH_NOMATCH);
3628 case 0x09: /* HT */
3629 case 0x20: /* SPACE */
3630 case 0xa0: /* NBSP */
3631 break;
3632 }
3633 break;
3634
3635 case OP_NOT_VSPACE:
3636 switch(c)
3637 {
3638 default: break;
3639 case 0x0a: /* LF */
3640 case 0x0b: /* VT */
3641 case 0x0c: /* FF */
3642 case 0x0d: /* CR */
3643 case 0x85: /* NEL */
3644 RRETURN(MATCH_NOMATCH);
3645 }
3646 break;
3647
3648 case OP_VSPACE:
3649 switch(c)
3650 {
3651 default: RRETURN(MATCH_NOMATCH);
3652 case 0x0a: /* LF */
3653 case 0x0b: /* VT */
3654 case 0x0c: /* FF */
3655 case 0x0d: /* CR */
3656 case 0x85: /* NEL */
3657 break;
3658 }
3659 break;
3660
3661 case OP_NOT_DIGIT:
3662 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3663 break;
3664
3665 case OP_DIGIT:
3666 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3667 break;
3668
3669 case OP_NOT_WHITESPACE:
3670 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3671 break;
3672
3673 case OP_WHITESPACE:
3674 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3675 break;
3676
3677 case OP_NOT_WORDCHAR:
3678 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3679 break;
3680
3681 case OP_WORDCHAR:
3682 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3683 break;
3684
3685 default:
3686 RRETURN(PCRE_ERROR_INTERNAL);
3687 }
3688 }
3689 }
3690 /* Control never gets here */
3691 }
3692
3693 /* If maximizing, it is worth using inline code for speed, doing the type
3694 test once at the start (i.e. keep it out of the loop). Again, keep the
3695 UTF-8 and UCP stuff separate. */
3696
3697 else
3698 {
3699 pp = eptr; /* Remember where we started */
3700
3701 #ifdef SUPPORT_UCP
3702 if (prop_type >= 0)
3703 {
3704 switch(prop_type)
3705 {
3706 case PT_ANY:
3707 for (i = min; i < max; i++)
3708 {
3709 int len = 1;
3710 if (eptr >= md->end_subject) break;
3711 GETCHARLEN(c, eptr, len);
3712 if (prop_fail_result) break;
3713 eptr+= len;
3714 }
3715 break;
3716
3717 case PT_LAMP:
3718 for (i = min; i < max; i++)
3719 {
3720 int len = 1;
3721 if (eptr >= md->end_subject) break;
3722 GETCHARLEN(c, eptr, len);
3723 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3724 if ((prop_chartype == ucp_Lu ||
3725 prop_chartype == ucp_Ll ||
3726 prop_chartype == ucp_Lt) == prop_fail_result)
3727 break;
3728 eptr+= len;
3729 }
3730 break;
3731
3732 case PT_GC:
3733 for (i = min; i < max; i++)
3734 {
3735 int len = 1;
3736 if (eptr >= md->end_subject) break;
3737 GETCHARLEN(c, eptr, len);
3738 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3739 if ((prop_category == prop_value) == prop_fail_result)
3740 break;
3741 eptr+= len;
3742 }
3743 break;
3744
3745 case PT_PC:
3746 for (i = min; i < max; i++)
3747 {
3748 int len = 1;
3749 if (eptr >= md->end_subject) break;
3750 GETCHARLEN(c, eptr, len);
3751 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3752 if ((prop_chartype == prop_value) == prop_fail_result)
3753 break;
3754 eptr+= len;
3755 }
3756 break;
3757
3758 case PT_SC:
3759 for (i = min; i < max; i++)
3760 {
3761 int len = 1;
3762 if (eptr >= md->end_subject) break;
3763 GETCHARLEN(c, eptr, len);
3764 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3765 if ((prop_script == prop_value) == prop_fail_result)
3766 break;
3767 eptr+= len;
3768 }
3769 break;
3770 }
3771
3772 /* eptr is now past the end of the maximum run */
3773
3774 if (possessive) continue;
3775 for(;;)
3776 {
3777 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3779 if (eptr-- == pp) break; /* Stop if tried at original pos */
3780 if (utf8) BACKCHAR(eptr);
3781 }
3782 }
3783
3784 /* Match extended Unicode sequences. We will get here only if the
3785 support is in the binary; otherwise a compile-time error occurs. */
3786
3787 else if (ctype == OP_EXTUNI)
3788 {
3789 for (i = min; i < max; i++)
3790 {
3791 if (eptr >= md->end_subject) break;
3792 GETCHARINCTEST(c, eptr);
3793 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3794 if (prop_category == ucp_M) break;
3795 while (eptr < md->end_subject)
3796 {
3797 int len = 1;
3798 if (!utf8) c = *eptr; else
3799 {
3800 GETCHARLEN(c, eptr, len);
3801 }
3802 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3803 if (prop_category != ucp_M) break;
3804 eptr += len;
3805 }
3806 }
3807
3808 /* eptr is now past the end of the maximum run */
3809
3810 if (possessive) continue;
3811 for(;;)
3812 {
3813 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3815 if (eptr-- == pp) break; /* Stop if tried at original pos */
3816 for (;;) /* Move back over one extended */
3817 {
3818 int len = 1;
3819 if (!utf8) c = *eptr; else
3820 {
3821 BACKCHAR(eptr);
3822 GETCHARLEN(c, eptr, len);
3823 }
3824 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3825 if (prop_category != ucp_M) break;
3826 eptr--;
3827 }
3828 }
3829 }
3830
3831 else
3832 #endif /* SUPPORT_UCP */
3833
3834 #ifdef SUPPORT_UTF8
3835 /* UTF-8 mode */
3836
3837 if (utf8)
3838 {
3839 switch(ctype)
3840 {
3841 case OP_ANY:
3842 if (max < INT_MAX)
3843 {
3844 if ((ims & PCRE_DOTALL) == 0)
3845 {
3846 for (i = min; i < max; i++)
3847 {
3848 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3849 eptr++;
3850 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3851 }
3852 }
3853 else
3854 {
3855 for (i = min; i < max; i++)
3856 {
3857 if (eptr >= md->end_subject) break;
3858 eptr++;
3859 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3860 }
3861 }
3862 }
3863
3864 /* Handle unlimited UTF-8 repeat */
3865
3866 else
3867 {
3868 if ((ims & PCRE_DOTALL) == 0)
3869 {
3870 for (i = min; i < max; i++)
3871 {
3872 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3873 eptr++;
3874 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3875 }
3876 }
3877 else
3878 {
3879 eptr = md->end_subject;
3880 }
3881 }
3882 break;
3883
3884 /* The byte case is the same as non-UTF8 */
3885
3886 case OP_ANYBYTE:
3887 c = max - min;
3888 if (c > (unsigned int)(md->end_subject - eptr))
3889 c = md->end_subject - eptr;
3890 eptr += c;
3891 break;
3892
3893 case OP_ANYNL:
3894 for (i = min; i < max; i++)
3895 {
3896 int len = 1;
3897 if (eptr >= md->end_subject) break;
3898 GETCHARLEN(c, eptr, len);
3899 if (c == 0x000d)
3900 {
3901 if (++eptr >= md->end_subject) break;
3902 if (*eptr == 0x000a) eptr++;
3903 }
3904 else
3905 {
3906 if (c != 0x000a &&
3907 (md->bsr_anycrlf ||
3908 (c != 0x000b && c != 0x000c &&
3909 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3910 break;
3911 eptr += len;
3912 }
3913 }
3914 break;
3915
3916 case OP_NOT_HSPACE:
3917 case OP_HSPACE:
3918 for (i = min; i < max; i++)
3919 {
3920 BOOL gotspace;
3921 int len = 1;
3922 if (eptr >= md->end_subject) break;
3923 GETCHARLEN(c, eptr, len);
3924 switch(c)
3925 {
3926 default: gotspace = FALSE; break;
3927 case 0x09: /* HT */
3928 case 0x20: /* SPACE */
3929 case 0xa0: /* NBSP */
3930 case 0x1680: /* OGHAM SPACE MARK */
3931 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3932 case 0x2000: /* EN QUAD */
3933 case 0x2001: /* EM QUAD */
3934 case 0x2002: /* EN SPACE */
3935 case 0x2003: /* EM SPACE */
3936 case 0x2004: /* THREE-PER-EM SPACE */
3937 case 0x2005: /* FOUR-PER-EM SPACE */
3938 case 0x2006: /* SIX-PER-EM SPACE */
3939 case 0x2007: /* FIGURE SPACE */
3940 case 0x2008: /* PUNCTUATION SPACE */
3941 case 0x2009: /* THIN SPACE */
3942 case 0x200A: /* HAIR SPACE */
3943 case 0x202f: /* NARROW NO-BREAK SPACE */
3944 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3945 case 0x3000: /* IDEOGRAPHIC SPACE */
3946 gotspace = TRUE;
3947 break;
3948 }
3949 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3950 eptr += len;
3951 }
3952 break;
3953
3954 case OP_NOT_VSPACE:
3955 case OP_VSPACE:
3956 for (i = min; i < max; i++)
3957 {
3958 BOOL gotspace;
3959 int len = 1;
3960 if (eptr >= md->end_subject) break;
3961 GETCHARLEN(c, eptr, len);
3962 switch(c)
3963 {
3964 default: gotspace = FALSE; break;
3965 case 0x0a: /* LF */
3966 case 0x0b: /* VT */
3967 case 0x0c: /* FF */
3968 case 0x0d: /* CR */
3969 case 0x85: /* NEL */
3970 case 0x2028: /* LINE SEPARATOR */
3971 case 0x2029: /* PARAGRAPH SEPARATOR */
3972 gotspace = TRUE;
3973 break;
3974 }
3975 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3976 eptr += len;
3977 }
3978 break;
3979
3980 case OP_NOT_DIGIT:
3981 for (i = min; i < max; i++)
3982 {
3983 int len = 1;
3984 if (eptr >= md->end_subject) break;
3985 GETCHARLEN(c, eptr, len);
3986 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3987 eptr+= len;
3988 }
3989 break;
3990
3991 case OP_DIGIT:
3992 for (i = min; i < max; i++)
3993 {
3994 int len = 1;
3995 if (eptr >= md->end_subject) break;
3996 GETCHARLEN(c, eptr, len);
3997 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3998 eptr+= len;
3999 }
4000 break;
4001
4002 case OP_NOT_WHITESPACE:
4003 for (i = min; i < max; i++)
4004 {
4005 int len = 1;
4006 if (eptr >= md->end_subject) break;
4007 GETCHARLEN(c, eptr, len);
4008 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4009 eptr+= len;
4010 }
4011 break;
4012
4013 case OP_WHITESPACE:
4014 for (i = min; i < max; i++)
4015 {
4016 int len = 1;
4017 if (eptr >= md->end_subject) break;
4018 GETCHARLEN(c, eptr, len);
4019 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4020 eptr+= len;
4021 }
4022 break;
4023
4024 case OP_NOT_WORDCHAR:
4025 for (i = min; i < max; i++)
4026 {
4027 int len = 1;
4028 if (eptr >= md->end_subject) break;
4029 GETCHARLEN(c, eptr, len);
4030 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4031 eptr+= len;
4032 }
4033 break;
4034
4035 case OP_WORDCHAR:
4036 for (i = min; i < max; i++)
4037 {
4038 int len = 1;
4039 if (eptr >= md->end_subject) break;
4040 GETCHARLEN(c, eptr, len);
4041 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4042 eptr+= len;
4043 }
4044 break;
4045
4046 default:
4047 RRETURN(PCRE_ERROR_INTERNAL);
4048 }
4049
4050 /* eptr is now past the end of the maximum run */
4051
4052 if (possessive) continue;
4053 for(;;)
4054 {
4055 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4056 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4057 if (eptr-- == pp) break; /* Stop if tried at original pos */
4058 BACKCHAR(eptr);
4059 }
4060 }
4061 else
4062 #endif /* SUPPORT_UTF8 */
4063
4064 /* Not UTF-8 mode */
4065 {
4066 switch(ctype)
4067 {
4068 case OP_ANY:
4069 if ((ims & PCRE_DOTALL) == 0)
4070 {
4071 for (i = min; i < max; i++)
4072 {
4073 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4074 eptr++;
4075 }
4076 break;
4077 }
4078 /* For DOTALL case, fall through and treat as \C */
4079
4080 case OP_ANYBYTE:
4081 c = max - min;
4082 if (c > (unsigned int)(md->end_subject - eptr))
4083 c = md->end_subject - eptr;
4084 eptr += c;
4085 break;
4086
4087 case OP_ANYNL:
4088 for (i = min; i < max; i++)
4089 {
4090 if (eptr >= md->end_subject) break;
4091 c = *eptr;
4092 if (c == 0x000d)
4093 {
4094 if (++eptr >= md->end_subject) break;
4095 if (*eptr == 0x000a) eptr++;
4096 }
4097 else
4098 {
4099 if (c != 0x000a &&
4100 (md->bsr_anycrlf ||
4101 (c != 0x000b && c != 0x000c && c != 0x0085)))
4102 break;
4103 eptr++;
4104 }
4105 }
4106 break;
4107
4108 case OP_NOT_HSPACE:
4109 for (i = min; i < max; i++)
4110 {
4111 if (eptr >= md->end_subject) break;
4112 c = *eptr;
4113 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4114 eptr++;
4115 }
4116 break;
4117
4118 case OP_HSPACE:
4119 for (i = min; i < max; i++)
4120 {
4121 if (eptr >= md->end_subject) break;
4122 c = *eptr;
4123 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4124 eptr++;
4125 }
4126 break;
4127
4128 case OP_NOT_VSPACE:
4129 for (i = min; i < max; i++)
4130 {
4131 if (eptr >= md->end_subject) break;
4132 c = *eptr;
4133 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4134 break;
4135 eptr++;
4136 }
4137 break;
4138
4139 case OP_VSPACE:
4140 for (i = min; i < max; i++)
4141 {
4142 if (eptr >= md->end_subject) break;
4143 c = *eptr;
4144 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4145 break;
4146 eptr++;
4147 }
4148 break;
4149
4150 case OP_NOT_DIGIT:
4151 for (i = min; i < max; i++)
4152 {
4153 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4154 break;
4155 eptr++;
4156 }
4157 break;
4158
4159 case OP_DIGIT:
4160 for (i = min; i < max; i++)
4161 {
4162 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4163 break;
4164 eptr++;
4165 }
4166 break;
4167
4168 case OP_NOT_WHITESPACE:
4169 for (i = min; i < max; i++)
4170 {
4171 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4172 break;
4173 eptr++;
4174 }
4175 break;
4176
4177 case OP_WHITESPACE:
4178 for (i = min; i < max; i++)
4179 {
4180 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4181 break;
4182 eptr++;
4183 }
4184 break;
4185
4186 case OP_NOT_WORDCHAR:
4187 for (i = min; i < max; i++)
4188 {
4189 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4190 break;
4191 eptr++;
4192 }
4193 break;
4194
4195 case OP_WORDCHAR:
4196 for (i = min; i < max; i++)
4197 {
4198 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4199 break;
4200 eptr++;
4201 }
4202 break;
4203
4204 default:
4205 RRETURN(PCRE_ERROR_INTERNAL);
4206 }
4207
4208 /* eptr is now past the end of the maximum run */
4209
4210 if (possessive) continue;
4211 while (eptr >= pp)
4212 {
4213 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4214 eptr--;
4215 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4216 }
4217 }
4218
4219 /* Get here if we can't make it match with any permitted repetitions */
4220
4221 RRETURN(MATCH_NOMATCH);
4222 }
4223 /* Control never gets here */
4224
4225 /* There's been some horrible disaster. Arrival here can only mean there is
4226 something seriously wrong in the code above or the OP_xxx definitions. */
4227
4228 default:
4229 DPRINTF(("Unknown opcode %d\n", *ecode));
4230 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4231 }
4232
4233 /* Do not stick any code in here without much thought; it is assumed
4234 that "continue" in the code above comes out to here to repeat the main
4235 loop. */
4236
4237 } /* End of main loop */
4238 /* Control never reaches here */
4239
4240
4241 /* When compiling to use the heap rather than the stack for recursive calls to
4242 match(), the RRETURN() macro jumps here. The number that is saved in
4243 frame->Xwhere indicates which label we actually want to return to. */
4244
4245 #ifdef NO_RECURSE
4246 #define LBL(val) case val: goto L_RM##val;
4247 HEAP_RETURN:
4248 switch (frame->Xwhere)
4249 {
4250 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4251 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4252 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4253 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4254 LBL(53) LBL(54)
4255 #ifdef SUPPORT_UTF8
4256 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4257 LBL(32) LBL(34) LBL(42) LBL(46)
4258 #ifdef SUPPORT_UCP
4259 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4260 #endif /* SUPPORT_UCP */
4261 #endif /* SUPPORT_UTF8 */
4262 default:
4263 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4264 return PCRE_ERROR_INTERNAL;
4265 }
4266 #undef LBL
4267 #endif /* NO_RECURSE */
4268 }
4269
4270
4271 /***************************************************************************
4272 ****************************************************************************
4273 RECURSION IN THE match() FUNCTION
4274
4275 Undefine all the macros that were defined above to handle this. */
4276
/* Clean-up after match(): in a NO_RECURSE build the names listed below are
preprocessor macros (the banner above says they were defined to handle
recursion in match()). They are removed here so that these short, common names
cannot affect code later in the file; pcre_exec(), for instance, uses ims,
length and flags as ordinary identifiers.
NOTE(review): the matching #define lines are outside this excerpt; presumably
each one expands to a field of the heap frame (cf. frame->Xwhere) -- confirm. */
4277 #ifdef NO_RECURSE
4278 #undef eptr
4279 #undef ecode
4280 #undef mstart
4281 #undef offset_top
4282 #undef ims
4283 #undef eptrb
4284 #undef flags
4285
4286 #undef callpat
4287 #undef charptr
4288 #undef data
4289 #undef next
4290 #undef pp
4291 #undef prev
4292 #undef saved_eptr
4293
4294 #undef new_recursive
4295
4296 #undef cur_is_word
4297 #undef condition
4298 #undef prev_is_word
4299
4300 #undef original_ims
4301
4302 #undef ctype
4303 #undef length
4304 #undef max
4305 #undef min
4306 #undef number
4307 #undef offset
4308 #undef op
4309 #undef save_capture_last
4310 #undef save_offset1
4311 #undef save_offset2
4312 #undef save_offset3
4313 #undef stacksave
4314
4315 #undef newptrb
4316
4317 #endif /* NO_RECURSE */
4318
4319 /* These two are defined as macros in both cases, i.e. whether or not
NO_RECURSE is defined, so they are undefined unconditionally, outside the
#ifdef above. */
4320
4321 #undef fc
4322 #undef fi
4323
4324 /***************************************************************************
4325 ***************************************************************************/
4326
4327
4328
4329 /*************************************************
4330 * Execute a Regular Expression *
4331 *************************************************/
4332
4333 /* This function applies a compiled re to a subject string and picks out
4334 portions of the string if it matches. Two elements in the vector are set for
4335 each substring: the offsets to the start and end of the substring.
4336
4337 Arguments:
4338 argument_re points to the compiled expression
4339 extra_data points to extra data or is NULL
4340 subject points to the subject string
4341 length length of subject string (may contain binary zeros)
4342 start_offset where to start in the subject string
4343 options option bits
4344 offsets points to a vector of ints to be filled in with offsets
4345 offsetcount the number of elements in the vector
4346
4347 Returns: > 0 => success; value is the number of elements filled in
4348 = 0 => success, but offsets is not big enough
4349 -1 => failed to match
4350 < -1 => some kind of unexpected problem
4351 */
4352
4353 PCRE_EXP_DEFN int
4354 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4355 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4356 int offsetcount)
4357 {
4358 int rc, resetcount, ocount;
4359 int first_byte = -1;
4360 int req_byte = -1;
4361 int req_byte2 = -1;
4362 int newline;
4363 unsigned long int ims;
4364 BOOL using_temporary_offsets = FALSE;
4365 BOOL anchored;
4366 BOOL startline;
4367 BOOL firstline;
4368 BOOL first_byte_caseless = FALSE;
4369 BOOL req_byte_caseless = FALSE;
4370 BOOL utf8;
4371 match_data match_block;
4372 match_data *md = &match_block;
4373 const uschar *tables;
4374 const uschar *start_bits = NULL;
4375 USPTR start_match = (USPTR)subject + start_offset;
4376 USPTR end_subject;
4377 USPTR req_byte_ptr = start_match - 1;
4378
4379 pcre_study_data internal_study;
4380 const pcre_study_data *study;
4381
4382 real_pcre internal_re;
4383 const real_pcre *external_re = (const real_pcre *)argument_re;
4384 const real_pcre *re = external_re;
4385
4386 /* Plausibility checks */
4387
4388 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4389 if (re == NULL || subject == NULL ||
4390 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4391 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4392
4393 /* Fish out the optional data from the extra_data structure, first setting
4394 the default values. */
4395
4396 study = NULL;
4397 md->match_limit = MATCH_LIMIT;
4398 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4399 md->callout_data = NULL;
4400
4401 /* The table pointer is always in native byte order. */
4402
4403 tables = external_re->tables;
4404
4405 if (extra_data != NULL)
4406 {
4407 register unsigned int flags = extra_data->flags;
4408 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4409 study = (const pcre_study_data *)extra_data->study_data;
4410 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4411 md->match_limit = extra_data->match_limit;
4412 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4413 md->match_limit_recursion = extra_data->match_limit_recursion;
4414 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4415 md->callout_data = extra_data->callout_data;
4416 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4417 }
4418
4419 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4420 is a feature that makes it possible to save compiled regex and re-use them
4421 in other programs later. */
4422
4423 if (tables == NULL) tables = _pcre_default_tables;
4424
4425 /* Check that the first field in the block is the magic number. If it is not,
4426 test for a regex that was compiled on a host of opposite endianness. If this is
4427 the case, flipped values are put in internal_re and internal_study if there was
4428 study data too. */
4429
4430 if (re->magic_number != MAGIC_NUMBER)
4431 {
4432 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4433 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4434 if (study != NULL) study = &internal_study;
4435 }
4436
4437 /* Set up other data */
4438
4439 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4440 startline = (re->flags & PCRE_STARTLINE) != 0;
4441 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4442
4443 /* The code starts after the real_pcre block and the capture name table. */
4444
4445 md->start_code = (const uschar *)external_re + re->name_table_offset +
4446 re->name_count * re->name_entry_size;
4447
4448 md->start_subject = (USPTR)subject;
4449 md->start_offset = start_offset;
4450 md->end_subject = md->start_subject + length;
4451 end_subject = md->end_subject;
4452
4453 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4454 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4455
4456 md->notbol = (options & PCRE_NOTBOL) != 0;
4457 md->noteol = (options & PCRE_NOTEOL) != 0;
4458 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4459 md->partial = (options & PCRE_PARTIAL) != 0;
4460 md->hitend = FALSE;
4461
4462 md->recursive = NULL; /* No recursion at top level */
4463
4464 md->lcc = tables + lcc_offset;
4465 md->ctypes = tables + ctypes_offset;
4466
4467 /* Handle different \R options. */
4468
4469 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4470 {
4471 case 0:
4472 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4473 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4474 else
4475 #ifdef BSR_ANYCRLF
4476 md->bsr_anycrlf = TRUE;
4477 #else
4478 md->bsr_anycrlf = FALSE;
4479 #endif
4480 break;
4481
4482 case PCRE_BSR_ANYCRLF:
4483 md->bsr_anycrlf = TRUE;
4484 break;
4485
4486 case PCRE_BSR_UNICODE:
4487 md->bsr_anycrlf = FALSE;
4488 break;
4489
4490 default: return PCRE_ERROR_BADNEWLINE;
4491 }
4492
4493 /* Handle different types of newline. The three bits give eight cases. If
4494 nothing is set at run time, whatever was used at compile time applies. */
4495
4496 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4497 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4498 {
4499 case 0: newline = NEWLINE; break; /* Compile-time default */
4500 case PCRE_NEWLINE_CR: newline = '\r'; break;
4501 case PCRE_NEWLINE_LF: newline = '\n'; break;
4502 case PCRE_NEWLINE_CR+
4503 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4504 case PCRE_NEWLINE_ANY: newline = -1; break;
4505 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4506 default: return PCRE_ERROR_BADNEWLINE;
4507 }
4508
4509 if (newline == -2)
4510 {
4511 md->nltype = NLTYPE_ANYCRLF;
4512 }
4513 else if (newline < 0)
4514 {
4515 md->nltype = NLTYPE_ANY;
4516 }
4517 else
4518 {
4519 md->nltype = NLTYPE_FIXED;
4520 if (newline > 255)
4521 {
4522 md->nllen = 2;
4523 md->nl[0] = (newline >> 8) & 255;
4524 md->nl[1] = newline & 255;
4525 }
4526 else
4527 {
4528 md->nllen = 1;
4529 md->nl[0] = newline;
4530 }
4531 }
4532
4533 /* Partial matching is supported only for a restricted set of regexes at the
4534 moment. */
4535
4536 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4537 return PCRE_ERROR_BADPARTIAL;
4538
4539 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4540 back the character offset. */
4541
4542 #ifdef SUPPORT_UTF8
4543 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4544 {
4545 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4546 return PCRE_ERROR_BADUTF8;
4547 if (start_offset > 0 && start_offset < length)
4548 {
4549 int tb = ((uschar *)subject)[start_offset];
4550 if (tb > 127)
4551 {
4552 tb &= 0xc0;
4553 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4554 }
4555 }
4556 }
4557 #endif
4558
4559 /* The ims options can vary during the matching as a result of the presence
4560 of (?ims) items in the pattern. They are kept in a local variable so that
4561 restoring at the exit of a group is easy. */
4562
4563 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4564
4565 /* If the expression has got more back references than the offsets supplied can
4566 hold, we get a temporary chunk of working store to use during the matching.
4567 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4568 of 3. */
4569
4570 ocount = offsetcount - (offsetcount % 3);
4571
4572 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4573 {
4574 ocount = re->top_backref * 3 + 3;
4575 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4576 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4577 using_temporary_offsets = TRUE;
4578 DPRINTF(("Got memory to hold back references\n"));
4579 }
4580 else md->offset_vector = offsets;
4581
4582 md->offset_end = ocount;
4583 md->offset_max = (2*ocount)/3;
4584 md->offset_overflow = FALSE;
4585 md->capture_last = -1;
4586
4587 /* Compute the minimum number of offsets that we need to reset each time. Doing
4588 this makes a huge difference to execution time when there aren't many brackets
4589 in the pattern. */
4590
4591 resetcount = 2 + re->top_bracket * 2;
4592 if (resetcount > offsetcount) resetcount = ocount;
4593
4594 /* Reset the working variable associated with each extraction. These should
4595 never be used unless previously set, but they get saved and restored, and so we
4596 initialize them to avoid reading uninitialized locations. */
4597
4598 if (md->offset_vector != NULL)
4599 {
4600 register int *iptr = md->offset_vector + ocount;
4601 register int *iend = iptr - resetcount/2 + 1;
4602 while (--iptr >= iend) *iptr = -1;
4603 }
4604
4605 /* Set up the first character to match, if available. The first_byte value is
4606 never set for an anchored regular expression, but the anchoring may be forced
4607 at run time, so we have to test for anchoring. The first char may be unset for
4608 an unanchored pattern, of course. If there's no first char and the pattern was
4609 studied, there may be a bitmap of possible first characters. */
4610
4611 if (!anchored)
4612 {
4613 if ((re->flags & PCRE_FIRSTSET) != 0)
4614 {
4615 first_byte = re->first_byte & 255;
4616 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4617 first_byte = md->lcc[first_byte];
4618 }
4619 else
4620 if (!startline && study != NULL &&
4621 (study->options & PCRE_STUDY_MAPPED) != 0)
4622 start_bits = study->start_bits;
4623 }
4624
4625 /* For anchored or unanchored matches, there may be a "last known required
4626 character" set. */
4627
4628 if ((re->flags & PCRE_REQCHSET) != 0)
4629 {
4630 req_byte = re->req_byte & 255;
4631 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4632 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4633 }
4634
4635
4636 /* ==========================================================================*/
4637
4638 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4639 the loop runs just once. */
4640
4641 for(;;)
4642 {
4643 USPTR save_end_subject = end_subject;
4644 USPTR new_start_match;
4645
4646 /* Reset the maximum number of extractions we might see. */
4647
4648 if (md->offset_vector != NULL)
4649 {
4650 register int *iptr = md->offset_vector;
4651 register int *iend = iptr + resetcount;
4652 while (iptr < iend) *iptr++ = -1;
4653 }
4654
4655 /* Advance to a unique first char if possible. If firstline is TRUE, the
4656 start of the match is constrained to the first line of a multiline string.
4657 That is, the match must be before or at the first newline. Implement this by
4658 temporarily adjusting end_subject so that we stop scanning at a newline. If
4659 the match fails at the newline, later code breaks this loop. */
4660
4661 if (firstline)
4662 {
4663 USPTR t = start_match;
4664 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4665 end_subject = t;
4666 }
4667
4668 /* Now test for a unique first byte */
4669
4670 if (first_byte >= 0)
4671 {
4672 if (first_byte_caseless)
4673 while (start_match < end_subject &&
4674 md->lcc[*start_match] != first_byte)
4675 start_match++;
4676 else
4677 while (start_match < end_subject && *start_match != first_byte)
4678 start_match++;
4679 }
4680
4681 /* Or to just after a linebreak for a multiline match if possible */
4682
4683 else if (startline)
4684 {
4685 if (start_match > md->start_subject + start_offset)
4686 {
4687 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4688 start_match++;
4689
4690 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4691 and we are now at a LF, advance the match position by one more character.
4692 */
4693
4694 if (start_match[-1] == '\r' &&
4695 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4696 start_match < end_subject &&
4697 *start_match == '\n')
4698 start_match++;
4699 }
4700 }
4701
4702 /* Or to a non-unique first char after study */
4703
4704 else if (start_bits != NULL)
4705 {
4706 while (start_match < end_subject)
4707 {
4708 register unsigned int c = *start_match;
4709 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4710 }
4711 }
4712
4713 /* Restore fudged end_subject */
4714
4715 end_subject = save_end_subject;
4716
4717 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4718 printf(">>>> Match against: ");
4719 pchars(start_match, end_subject - start_match, TRUE, md);
4720 printf("\n");
4721 #endif
4722
4723 /* If req_byte is set, we know that that character must appear in the subject
4724 for the match to succeed. If the first character is set, req_byte must be
4725 later in the subject; otherwise the test starts at the match point. This
4726 optimization can save a huge amount of backtracking in patterns with nested
4727 unlimited repeats that aren't going to match. Writing separate code for
4728 cased/caseless versions makes it go faster, as does using an autoincrement
4729 and backing off on a match.
4730
4731 HOWEVER: when the subject string is very, very long, searching to its end can
4732 take a long time, and give bad performance on quite ordinary patterns. This
4733 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4734 string... so we don't do this when the string is sufficiently long.
4735
4736 ALSO: this processing is disabled when partial matching is requested.
4737 */
4738
4739 if (req_byte >= 0 &&
4740 end_subject - start_match < REQ_BYTE_MAX &&
4741 !md->partial)
4742 {
4743 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4744
4745 /* We don't need to repeat the search if we haven't yet reached the
4746 place we found it at last time. */
4747
4748 if (p > req_byte_ptr)
4749 {
4750 if (req_byte_caseless)
4751 {
4752 while (p < end_subject)
4753 {
4754 register int pp = *p++;
4755 if (pp == req_byte || pp == req_byte2) { p--; break; }
4756 }
4757 }
4758 else
4759 {
4760 while (p < end_subject)
4761 {
4762 if (*p++ == req_byte) { p--; break; }
4763 }
4764 }
4765
4766 /* If we can't find the required character, break the matching loop,
4767 forcing a match failure. */
4768
4769 if (p >= end_subject)
4770 {
4771 rc = MATCH_NOMATCH;
4772 break;
4773 }
4774
4775 /* If we have found the required character, save the point where we
4776 found it, so that we don't search again next time round the loop if
4777 the start hasn't passed this character yet. */
4778
4779 req_byte_ptr = p;
4780 }
4781 }
4782
4783 /* OK, we can now run the match. */
4784
4785 md->start_match_ptr = start_match;
4786 md->match_call_count = 0;
4787 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4788
4789 switch(rc)
4790 {
4791 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4792 exactly like PRUNE. */
4793
4794 case MATCH_NOMATCH:
4795 case MATCH_PRUNE:
4796 case MATCH_THEN:
4797 new_start_match = start_match + 1;
4798 #ifdef SUPPORT_UTF8
4799 if (utf8)
4800 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4801 new_start_match++;
4802 #endif
4803 break;
4804
4805 /* SKIP passes back the next starting point explicitly. */
4806
4807 case MATCH_SKIP:
4808 new_start_match = md->start_match_ptr;
4809 break;
4810
4811 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4812
4813 case MATCH_COMMIT:
4814 rc = MATCH_NOMATCH;
4815 goto ENDLOOP;
4816
4817 /* Any other return is some kind of error. */
4818
4819 default:
4820 goto ENDLOOP;
4821 }
4822
4823 /* Control reaches here for the various types of "no match at this point"
4824 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4825
4826 rc = MATCH_NOMATCH;
4827
4828 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4829 newline in the subject (though it may continue over the newline). Therefore,
4830 if we have just failed to match, starting at a newline, do not continue. */
4831
4832 if (firstline && IS_NEWLINE(start_match)) break;
4833
4834 /* Advance to new matching position */
4835
4836 start_match = new_start_match;
4837
4838 /* Break the loop if the pattern is anchored or if we have passed the end of
4839 the subject. */
4840
4841 if (anchored || start_match > end_subject) break;
4842
4843 /* If we have just passed a CR and we are now at a LF, and the pattern does
4844 not contain any explicit matches for \r or \n, and the newline option is CRLF
4845 or ANY or ANYCRLF, advance the match position by one more character. */
4846
4847 if (start_match[-1] == '\r' &&
4848 start_match < end_subject &&
4849 *start_match == '\n' &&
4850 (re->flags & PCRE_HASCRORLF) == 0 &&
4851 (md->nltype == NLTYPE_ANY ||
4852 md->nltype == NLTYPE_ANYCRLF ||
4853 md->nllen == 2))
4854 start_match++;
4855
4856 } /* End of for(;;) "bumpalong" loop */
4857
4858 /* ==========================================================================*/
4859
4860 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4861 conditions is true:
4862
4863 (1) The pattern is anchored or the match was forced to fail by (*COMMIT);
4864
4865 (2) We are past the end of the subject;
4866
4867 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4868 this option requests that a match occur at or before the first newline in
4869 the subject.
4870
4871 When we have a match and the offset vector is big enough to deal with any
4872 backreferences, captured substring offsets will already be set up. In the case
4873 where we had to get some local store to hold offsets for backreference
4874 processing, copy those that we can. In this case there need not be overflow if
4875 certain parts of the pattern were not used, even though there are more
4876 capturing parentheses than vector slots. */
4877
4878 ENDLOOP:
4879
4880 if (rc == MATCH_MATCH)
4881 {
4882 if (using_temporary_offsets)
4883 {
4884 if (offsetcount >= 4)
4885 {
4886 memcpy(offsets + 2, md->offset_vector + 2,
4887 (offsetcount - 2) * sizeof(int));
4888 DPRINTF(("Copied offsets from temporary memory\n"));
4889 }
4890 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4891 DPRINTF(("Freeing temporary memory\n"));
4892 (pcre_free)(md->offset_vector);
4893 }
4894
4895 /* Set the return code to the number of captured strings, or 0 if there are
4896 too many to fit into the vector. */
4897
4898 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4899
4900 /* If there is space, set up the whole thing as substring 0. The value of
4901 md->start_match_ptr might be modified if \K was encountered on the successful
4902 matching path. */
4903
4904 if (offsetcount < 2) rc = 0; else
4905 {
4906 offsets[0] = md->start_match_ptr - md->start_subject;
4907 offsets[1] = md->end_match_ptr - md->start_subject;
4908 }
4909
4910 DPRINTF((">>>> returning %d\n", rc));
4911 return rc;
4912 }
4913
4914 /* Control gets here if there has been an error, or if the overall match
4915 attempt has failed at all permitted starting positions. */
4916
4917 if (using_temporary_offsets)
4918 {
4919 DPRINTF(("Freeing temporary memory\n"));
4920 (pcre_free)(md->offset_vector);
4921 }
4922
4923 if (rc != MATCH_NOMATCH)
4924 {
4925 DPRINTF((">>>> error: returning %d\n", rc));
4926 return rc;
4927 }
4928 else if (md->partial && md->hitend)
4929 {
4930 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4931 return PCRE_ERROR_PARTIAL;
4932 }
4933 else
4934 {
4935 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4936 return PCRE_ERROR_NOMATCH;
4937 }
4938 }
4939
4940 /* End of pcre_exec.c */