Installed PCRE release 7.0.
[exim.git] / src / src / pcre / pcre_exec.c
CommitLineData
6bf342e1 1/* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */
8ac170f3
PH
2
3/*************************************************
4* Perl-Compatible Regular Expressions *
5*************************************************/
6
7/* PCRE is a library of functions to support regular expressions whose syntax
8and semantics are as close as possible to those of the Perl 5 language.
9
10 Written by Philip Hazel
aa41d2de 11 Copyright (c) 1997-2006 University of Cambridge
8ac170f3
PH
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42
43/* This module contains pcre_exec(), the externally visible function that does
44pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45possible. There are also some static supporting functions. */
46
6bf342e1
PH
47#define NLBLOCK md /* Block containing newline information */
48#define PSSTART start_subject /* Field containing processed string start */
49#define PSEND end_subject /* Field containing processed string end */
8ac170f3 50
6bf342e1 51#include "pcre_internal.h"
8ac170f3 52
6bf342e1
PH
53/* The chain of eptrblocks for tail recursions uses memory in stack workspace,
54obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
8ac170f3 55
6bf342e1 56#define EPTR_WORK_SIZE (1000)
8ac170f3
PH
57
58/* Flag bits for the match() function */
59
6bf342e1
PH
60#define match_condassert 0x01 /* Called to check a condition assertion */
61#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
62#define match_tail_recursed 0x04 /* Tail recursive call */
8ac170f3
PH
63
64/* Non-error returns from the match() function. Error returns are externally
65defined PCRE_ERROR_xxx codes, which are all negative. */
66
67#define MATCH_MATCH 1
68#define MATCH_NOMATCH 0
69
70/* Maximum number of ints of offset to save on the stack for recursive calls.
71If the offset vector is bigger, malloc is used. This should be a multiple of 3,
72because the offset vector is always a multiple of 3 long. */
73
74#define REC_STACK_SAVE_MAX 30
75
76/* Min and max values for the common repeats; for the maxima, 0 => infinity */
77
78static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
79static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
80
81
82
83#ifdef DEBUG
84/*************************************************
85* Debugging function to print chars *
86*************************************************/
87
88/* Print a sequence of chars in printable format, stopping at the end of the
89subject if requested.
90
91Arguments:
92 p points to characters
93 length number to print
94 is_subject TRUE if printing from within md->start_subject
95 md pointer to matching data block, if is_subject is TRUE
96
97Returns: nothing
98*/
99
100static void
101pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
102{
6bf342e1 103unsigned int c;
8ac170f3
PH
104if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
105while (length-- > 0)
106 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
107}
108#endif
109
110
111
112/*************************************************
113* Match a back-reference *
114*************************************************/
115
116/* If a back reference hasn't been set, the length that is passed is greater
117than the number of characters left in the string, so the match fails.
118
119Arguments:
120 offset index into the offset vector
121 eptr points into the subject
122 length length to be matched
123 md points to match data block
124 ims the ims flags
125
126Returns: TRUE if matched
127*/
128
129static BOOL
aa41d2de 130match_ref(int offset, register USPTR eptr, int length, match_data *md,
8ac170f3
PH
131 unsigned long int ims)
132{
aa41d2de 133USPTR p = md->start_subject + md->offset_vector[offset];
8ac170f3
PH
134
135#ifdef DEBUG
136if (eptr >= md->end_subject)
137 printf("matching subject <null>");
138else
139 {
140 printf("matching subject ");
141 pchars(eptr, length, TRUE, md);
142 }
143printf(" against backref ");
144pchars(p, length, FALSE, md);
145printf("\n");
146#endif
147
148/* Always fail if not enough characters left */
149
150if (length > md->end_subject - eptr) return FALSE;
151
152/* Separate the caselesss case for speed */
153
154if ((ims & PCRE_CASELESS) != 0)
155 {
156 while (length-- > 0)
157 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
158 }
159else
160 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
161
162return TRUE;
163}
164
165
166
167/***************************************************************************
168****************************************************************************
169 RECURSION IN THE match() FUNCTION
170
aa41d2de
PH
171The match() function is highly recursive, though not every recursive call
172increases the recursive depth. Nevertheless, some regular expressions can cause
173it to recurse to a great depth. I was writing for Unix, so I just let it call
174itself recursively. This uses the stack for saving everything that has to be
175saved for a recursive call. On Unix, the stack can be large, and this works
176fine.
8ac170f3 177
aa41d2de
PH
178It turns out that on some non-Unix-like systems there are problems with
179programs that use a lot of stack. (This despite the fact that every last chip
180has oodles of memory these days, and techniques for extending the stack have
181been known for decades.) So....
8ac170f3
PH
182
183There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
184calls by keeping local variables that need to be preserved in blocks of memory
aa41d2de 185obtained from malloc() instead of on the stack. Macros are used to
8ac170f3
PH
186achieve this so that the actual code doesn't look very different to what it
187always used to.
188****************************************************************************
189***************************************************************************/
190
191
aa41d2de
PH
192/* These versions of the macros use the stack, as normal. There are debugging
193versions and production versions. */
8ac170f3
PH
194
195#ifndef NO_RECURSE
196#define REGISTER register
aa41d2de
PH
197#ifdef DEBUG
198#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
199 { \
200 printf("match() called in line %d\n", __LINE__); \
201 rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
202 printf("to line %d\n", __LINE__); \
203 }
204#define RRETURN(ra) \
205 { \
206 printf("match() returned %d from line %d ", ra, __LINE__); \
207 return ra; \
208 }
209#else
210#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
211 rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)
8ac170f3 212#define RRETURN(ra) return ra
aa41d2de
PH
213#endif
214
8ac170f3
PH
215#else
216
217
218/* These versions of the macros manage a private stack on the heap. Note
219that the rd argument of RMATCH isn't actually used. It's the md argument of
220match(), which never changes. */
221
222#define REGISTER
223
/* RMATCH for the NO_RECURSE case: perform a "recursive call" of match()
without using the C stack. A new heapframe is obtained from pcre_stack_malloc,
loaded with the argument values, chained to the current frame through
Xprevframe, and made the current frame before jumping to HEAP_RECURSE. The
setjmp() on the calling frame records where RRETURN has to come back to; when
it does, the current frame is picked up again from md->thisframe and the
result is taken from its Xresult field.

  rx   lvalue that receives the result
  ra   eptr for the new frame
  rb   ecode for the new frame
  rc   offset_top for the new frame
  rd   not used (it is md, which never changes)
  re   ims for the new frame
  rf   eptrb for the new frame
  rg   flags for the new frame

If the new frame cannot be obtained, the current invocation gives up with
PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }
248
/* RRETURN for the NO_RECURSE case: "return" from the current frame. The
current frame is unlinked and released with pcre_stack_free. If there is a
calling frame, the result is stored in its Xresult field, md->thisframe is set
to it, and control is transferred to the setjmp() point saved in that frame by
RMATCH. If there is no calling frame (this was the top level), the result is
returned from match() in the ordinary way.

The argument is evaluated exactly once, before the frame pointer is changed,
so that an argument which refers to the current frame (directly or through
one of the frame->X... variable macros) is read from the frame it belongs to
and not from the caller's frame, or through a NULL frame at the top level. */

#define RRETURN(ra)\
  {\
  int rrvalue = (ra);\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = rrvalue;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return rrvalue;\
  }
262
263
264/* Structure for remembering the local variables in a private frame */
265
266typedef struct heapframe {
267 struct heapframe *Xprevframe;
268
269 /* Function arguments that may change */
270
271 const uschar *Xeptr;
272 const uschar *Xecode;
273 int Xoffset_top;
274 long int Xims;
275 eptrblock *Xeptrb;
276 int Xflags;
aa41d2de 277 unsigned int Xrdepth;
8ac170f3
PH
278
279 /* Function local variables */
280
281 const uschar *Xcallpat;
282 const uschar *Xcharptr;
283 const uschar *Xdata;
284 const uschar *Xnext;
285 const uschar *Xpp;
286 const uschar *Xprev;
287 const uschar *Xsaved_eptr;
288
289 recursion_info Xnew_recursive;
290
291 BOOL Xcur_is_word;
292 BOOL Xcondition;
8ac170f3
PH
293 BOOL Xprev_is_word;
294
295 unsigned long int Xoriginal_ims;
296
297#ifdef SUPPORT_UCP
298 int Xprop_type;
aa41d2de 299 int Xprop_value;
8ac170f3
PH
300 int Xprop_fail_result;
301 int Xprop_category;
302 int Xprop_chartype;
aa41d2de 303 int Xprop_script;
8ac170f3
PH
304#endif
305
306 int Xctype;
6bf342e1 307 unsigned int Xfc;
8ac170f3
PH
308 int Xfi;
309 int Xlength;
310 int Xmax;
311 int Xmin;
312 int Xnumber;
313 int Xoffset;
314 int Xop;
315 int Xsave_capture_last;
316 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
317 int Xstacksave[REC_STACK_SAVE_MAX];
318
319 eptrblock Xnewptrb;
320
321 /* Place to pass back result, and where to jump back to */
322
323 int Xresult;
324 jmp_buf Xwhere;
325
326} heapframe;
327
328#endif
329
330
331/***************************************************************************
332***************************************************************************/
333
334
335
336/*************************************************
337* Match from current position *
338*************************************************/
339
6bf342e1 340/* This function is called recursively in many circumstances. Whenever it
8ac170f3
PH
341returns a negative (error) response, the outer incarnation must also return the
342same response.
343
344Performance note: It might be tempting to extract commonly used fields from the
345md structure (e.g. utf8, end_subject) into individual variables to improve
346performance. Tests using gcc on a SPARC disproved this; in the first case, it
347made performance worse.
348
349Arguments:
6bf342e1
PH
350 eptr pointer to current character in subject
351 ecode pointer to current position in compiled code
8ac170f3
PH
352 offset_top current top pointer
353 md pointer to "static" info for the match
354 ims current /i, /m, and /s options
355 eptrb pointer to chain of blocks containing eptr at start of
356 brackets - for testing for empty matches
357 flags can contain
358 match_condassert - this is an assertion condition
6bf342e1
PH
359 match_cbegroup - this is the start of an unlimited repeat
360 group that can match an empty string
361 match_tail_recursed - this is a tail_recursed group
aa41d2de 362 rdepth the recursion depth
8ac170f3
PH
363
364Returns: MATCH_MATCH if matched ) these values are >= 0
365 MATCH_NOMATCH if failed to match )
366 a negative PCRE_ERROR_xxx value if aborted by an error condition
aa41d2de 367 (e.g. stopped by repeated call or recursion limit)
8ac170f3
PH
368*/
369
370static int
aa41d2de 371match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
8ac170f3 372 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
aa41d2de 373 int flags, unsigned int rdepth)
8ac170f3
PH
374{
375/* These variables do not need to be preserved over recursion in this function,
6bf342e1
PH
376so they can be ordinary variables in all cases. Mark some of them with
377"register" because they are used a lot in loops. */
8ac170f3 378
aa41d2de
PH
379register int rrc; /* Returns from recursive calls */
380register int i; /* Used for loops not involving calls to RMATCH() */
6bf342e1 381register unsigned int c; /* Character values not kept over RMATCH() calls */
aa41d2de 382register BOOL utf8; /* Local copy of UTF-8 flag for speed */
8ac170f3 383
6bf342e1
PH
384BOOL minimize, possessive; /* Quantifier options */
385
8ac170f3
PH
386/* When recursion is not being used, all "local" variables that have to be
387preserved over calls to RMATCH() are part of a "frame" which is obtained from
388heap storage. Set up the top-level frame here; others are obtained from the
389heap whenever RMATCH() does a "recursion". See the macro definitions above. */
390
391#ifdef NO_RECURSE
392heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
393frame->Xprevframe = NULL; /* Marks the top level */
394
395/* Copy in the original argument variables */
396
397frame->Xeptr = eptr;
398frame->Xecode = ecode;
399frame->Xoffset_top = offset_top;
400frame->Xims = ims;
401frame->Xeptrb = eptrb;
402frame->Xflags = flags;
aa41d2de 403frame->Xrdepth = rdepth;
8ac170f3
PH
404
405/* This is where control jumps back to to effect "recursion" */
406
407HEAP_RECURSE:
408
409/* Macros make the argument variables come from the current frame */
410
411#define eptr frame->Xeptr
412#define ecode frame->Xecode
413#define offset_top frame->Xoffset_top
414#define ims frame->Xims
415#define eptrb frame->Xeptrb
416#define flags frame->Xflags
aa41d2de 417#define rdepth frame->Xrdepth
8ac170f3
PH
418
419/* Ditto for the local variables */
420
421#ifdef SUPPORT_UTF8
422#define charptr frame->Xcharptr
423#endif
424#define callpat frame->Xcallpat
425#define data frame->Xdata
426#define next frame->Xnext
427#define pp frame->Xpp
428#define prev frame->Xprev
429#define saved_eptr frame->Xsaved_eptr
430
431#define new_recursive frame->Xnew_recursive
432
433#define cur_is_word frame->Xcur_is_word
434#define condition frame->Xcondition
8ac170f3
PH
435#define prev_is_word frame->Xprev_is_word
436
437#define original_ims frame->Xoriginal_ims
438
439#ifdef SUPPORT_UCP
440#define prop_type frame->Xprop_type
aa41d2de 441#define prop_value frame->Xprop_value
8ac170f3
PH
442#define prop_fail_result frame->Xprop_fail_result
443#define prop_category frame->Xprop_category
444#define prop_chartype frame->Xprop_chartype
aa41d2de 445#define prop_script frame->Xprop_script
8ac170f3
PH
446#endif
447
448#define ctype frame->Xctype
449#define fc frame->Xfc
450#define fi frame->Xfi
451#define length frame->Xlength
452#define max frame->Xmax
453#define min frame->Xmin
454#define number frame->Xnumber
455#define offset frame->Xoffset
456#define op frame->Xop
457#define save_capture_last frame->Xsave_capture_last
458#define save_offset1 frame->Xsave_offset1
459#define save_offset2 frame->Xsave_offset2
460#define save_offset3 frame->Xsave_offset3
461#define stacksave frame->Xstacksave
462
463#define newptrb frame->Xnewptrb
464
465/* When recursion is being used, local variables are allocated on the stack and
466get preserved during recursion in the normal way. In this environment, fi and
467i, and fc and c, can be the same variables. */
468
6bf342e1 469#else /* NO_RECURSE not defined */
8ac170f3
PH
470#define fi i
471#define fc c
472
473
aa41d2de
PH
474#ifdef SUPPORT_UTF8 /* Many of these variables are used only */
475const uschar *charptr; /* in small blocks of the code. My normal */
476#endif /* style of coding would have declared */
477const uschar *callpat; /* them within each of those blocks. */
478const uschar *data; /* However, in order to accommodate the */
479const uschar *next; /* version of this code that uses an */
480USPTR pp; /* external "stack" implemented on the */
481const uschar *prev; /* heap, it is easier to declare them all */
482USPTR saved_eptr; /* here, so the declarations can be cut */
483 /* out in a block. The only declarations */
484recursion_info new_recursive; /* within blocks below are for variables */
485 /* that do not have to be preserved over */
486BOOL cur_is_word; /* a recursive call to RMATCH(). */
487BOOL condition;
8ac170f3
PH
488BOOL prev_is_word;
489
490unsigned long int original_ims;
491
492#ifdef SUPPORT_UCP
493int prop_type;
aa41d2de 494int prop_value;
8ac170f3
PH
495int prop_fail_result;
496int prop_category;
497int prop_chartype;
aa41d2de 498int prop_script;
8ac170f3
PH
499#endif
500
501int ctype;
502int length;
503int max;
504int min;
505int number;
506int offset;
507int op;
508int save_capture_last;
509int save_offset1, save_offset2, save_offset3;
510int stacksave[REC_STACK_SAVE_MAX];
511
512eptrblock newptrb;
6bf342e1 513#endif /* NO_RECURSE */
8ac170f3
PH
514
515/* These statements are here to stop the compiler complaining about uninitialized
516variables. */
517
518#ifdef SUPPORT_UCP
aa41d2de 519prop_value = 0;
8ac170f3 520prop_fail_result = 0;
8ac170f3
PH
521#endif
522
6bf342e1 523
aa41d2de
PH
524/* This label is used for tail recursion, which is used in a few cases even
525when NO_RECURSE is not defined, in order to reduce the amount of stack that is
526used. Thanks to Ian Taylor for noticing this possibility and sending the
527original patch. */
528
529TAIL_RECURSE:
530
531/* OK, now we can get on with the real code of the function. Recursive calls
532are specified by the macro RMATCH and RRETURN is used to return. When
533NO_RECURSE is *not* defined, these just turn into a recursive call to match()
534and a "return", respectively (possibly with some debugging if DEBUG is
535defined). However, RMATCH isn't like a function call because it's quite a
536complicated macro. It has to be used in one particular way. This shouldn't,
537however, impact performance when true recursion is being used. */
538
539/* First check that we haven't called match() too many times, or that we
540haven't exceeded the recursive call limit. */
8ac170f3
PH
541
542if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
aa41d2de 543if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
8ac170f3
PH
544
545original_ims = ims; /* Save for resetting on ')' */
aa41d2de
PH
546
547#ifdef SUPPORT_UTF8
8ac170f3 548utf8 = md->utf8; /* Local copy of the flag */
aa41d2de
PH
549#else
550utf8 = FALSE;
551#endif
8ac170f3 552
6bf342e1
PH
553/* At the start of a group with an unlimited repeat that may match an empty
554string, the match_cbegroup flag is set. When this is the case, add the current
555subject pointer to the chain of such remembered pointers, to be checked when we
556hit the closing ket, in order to break infinite loops that match no characters.
557When match() is called in other circumstances, don't add to the chain. If this
558is a tail recursion, use a block from the workspace, as the one on the stack is
559already used. */
8ac170f3 560
6bf342e1 561if ((flags & match_cbegroup) != 0)
8ac170f3 562 {
6bf342e1
PH
563 eptrblock *p;
564 if ((flags & match_tail_recursed) != 0)
565 {
566 if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
567 p = md->eptrchain + md->eptrn++;
568 }
569 else p = &newptrb;
570 p->epb_saved_eptr = eptr;
571 p->epb_prev = eptrb;
572 eptrb = p;
8ac170f3
PH
573 }
574
6bf342e1 575/* Now start processing the opcodes. */
8ac170f3
PH
576
577for (;;)
578 {
6bf342e1 579 minimize = possessive = FALSE;
8ac170f3 580 op = *ecode;
8ac170f3
PH
581
582 /* For partial matching, remember if we ever hit the end of the subject after
583 matching at least one subject character. */
584
585 if (md->partial &&
586 eptr >= md->end_subject &&
587 eptr > md->start_match)
588 md->hitend = TRUE;
589
6bf342e1 590 switch(op)
8ac170f3 591 {
6bf342e1
PH
592 /* Handle a capturing bracket. If there is space in the offset vector, save
593 the current subject position in the working slot at the top of the vector.
594 We mustn't change the current values of the data slot, because they may be
595 set from a previous iteration of this group, and be referred to by a
596 reference inside the group.
597
598 If the bracket fails to match, we need to restore this value and also the
599 values of the final offsets, in case they were set by a previous iteration
600 of the same bracket.
601
602 If there isn't enough space in the offset vector, treat this as if it were
603 a non-capturing bracket. Don't worry about setting the flag for the error
604 case here; that is handled in the code for KET. */
605
606 case OP_CBRA:
607 case OP_SCBRA:
608 number = GET2(ecode, 1+LINK_SIZE);
8ac170f3
PH
609 offset = number << 1;
610
611#ifdef DEBUG
6bf342e1
PH
612 printf("start bracket %d\n", number);
613 printf("subject=");
8ac170f3
PH
614 pchars(eptr, 16, TRUE, md);
615 printf("\n");
616#endif
617
618 if (offset < md->offset_max)
619 {
620 save_offset1 = md->offset_vector[offset];
621 save_offset2 = md->offset_vector[offset+1];
622 save_offset3 = md->offset_vector[md->offset_end - number];
623 save_capture_last = md->capture_last;
624
625 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
626 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
627
6bf342e1 628 flags = (op == OP_SCBRA)? match_cbegroup : 0;
8ac170f3
PH
629 do
630 {
6bf342e1
PH
631 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
632 ims, eptrb, flags);
8ac170f3
PH
633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
634 md->capture_last = save_capture_last;
635 ecode += GET(ecode, 1);
636 }
637 while (*ecode == OP_ALT);
638
639 DPRINTF(("bracket %d failed\n", number));
640
641 md->offset_vector[offset] = save_offset1;
642 md->offset_vector[offset+1] = save_offset2;
643 md->offset_vector[md->offset_end - number] = save_offset3;
644
645 RRETURN(MATCH_NOMATCH);
646 }
647
6bf342e1
PH
648 /* Insufficient room for saving captured contents. Treat as a non-capturing
649 bracket. */
8ac170f3 650
6bf342e1 651 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
8ac170f3 652
6bf342e1
PH
653 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
654 final alternative within the brackets, we would return the result of a
655 recursive call to match() whatever happened. We can reduce stack usage by
656 turning this into a tail recursion. */
aa41d2de 657
6bf342e1
PH
658 case OP_BRA:
659 case OP_SBRA:
660 DPRINTF(("start non-capturing bracket\n"));
661 flags = (op >= OP_SBRA)? match_cbegroup : 0;
aa41d2de 662 for (;;)
8ac170f3 663 {
aa41d2de 664 if (ecode[GET(ecode, 1)] != OP_ALT)
6bf342e1
PH
665 {
666 ecode += _pcre_OP_lengths[*ecode];
667 flags |= match_tail_recursed;
668 DPRINTF(("bracket 0 tail recursion\n"));
669 goto TAIL_RECURSE;
670 }
aa41d2de
PH
671
672 /* For non-final alternatives, continue the loop for a NOMATCH result;
673 otherwise return. */
674
6bf342e1
PH
675 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
676 eptrb, flags);
8ac170f3
PH
677 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
678 ecode += GET(ecode, 1);
679 }
aa41d2de 680 /* Control never reaches here. */
8ac170f3
PH
681
682 /* Conditional group: compilation checked that there are no more than
683 two branches. If the condition is false, skipping the first branch takes us
684 past the end if there is only one branch, but that's OK because that is
aa41d2de
PH
685 exactly what going to the ket would do. As there is only one branch to be
686 obeyed, we can use tail recursion to avoid using another stack frame. */
8ac170f3
PH
687
688 case OP_COND:
6bf342e1
PH
689 case OP_SCOND:
690 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
691 {
692 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
693 condition = md->recursive != NULL &&
694 (offset == RREF_ANY || offset == md->recursive->group_num);
695 ecode += condition? 3 : GET(ecode, 1);
696 }
697
698 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
8ac170f3
PH
699 {
700 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6bf342e1
PH
701 condition = offset < offset_top && md->offset_vector[offset] >= 0;
702 ecode += condition? 3 : GET(ecode, 1);
703 }
704
705 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
706 {
707 condition = FALSE;
708 ecode += GET(ecode, 1);
8ac170f3
PH
709 }
710
711 /* The condition is an assertion. Call match() to evaluate it - setting
6bf342e1
PH
712 the final argument match_condassert causes it to stop at the end of an
713 assertion. */
8ac170f3
PH
714
715 else
716 {
717 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6bf342e1 718 match_condassert);
8ac170f3
PH
719 if (rrc == MATCH_MATCH)
720 {
6bf342e1
PH
721 condition = TRUE;
722 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
8ac170f3
PH
723 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
724 }
725 else if (rrc != MATCH_NOMATCH)
726 {
727 RRETURN(rrc); /* Need braces because of following else */
728 }
6bf342e1
PH
729 else
730 {
731 condition = FALSE;
732 ecode += GET(ecode, 1);
733 }
734 }
aa41d2de 735
6bf342e1
PH
736 /* We are now at the branch that is to be obeyed. As there is only one,
737 we can use tail recursion to avoid using another stack frame. If the second
738 alternative doesn't exist, we can just plough on. */
aa41d2de 739
6bf342e1
PH
740 if (condition || *ecode == OP_ALT)
741 {
aa41d2de 742 ecode += 1 + LINK_SIZE;
6bf342e1 743 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
aa41d2de 744 goto TAIL_RECURSE;
8ac170f3 745 }
6bf342e1
PH
746 else
747 {
748 ecode += 1 + LINK_SIZE;
749 }
8ac170f3
PH
750 break;
751
6bf342e1
PH
752
753 /* End of the pattern. If we are in a top-level recursion, we should
754 restore the offsets appropriately and continue from after the call. */
8ac170f3
PH
755
756 case OP_END:
757 if (md->recursive != NULL && md->recursive->group_num == 0)
758 {
759 recursion_info *rec = md->recursive;
aa41d2de 760 DPRINTF(("End of pattern in a (?0) recursion\n"));
8ac170f3
PH
761 md->recursive = rec->prevrec;
762 memmove(md->offset_vector, rec->offset_save,
763 rec->saved_max * sizeof(int));
764 md->start_match = rec->save_start;
765 ims = original_ims;
766 ecode = rec->after_call;
767 break;
768 }
769
770 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
771 string - backtracking will then try other alternatives, if any. */
772
773 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
774 md->end_match_ptr = eptr; /* Record where we ended */
775 md->end_offset_top = offset_top; /* and how many extracts were taken */
776 RRETURN(MATCH_MATCH);
777
778 /* Change option settings */
779
780 case OP_OPT:
781 ims = ecode[1];
782 ecode += 2;
783 DPRINTF(("ims set to %02lx\n", ims));
784 break;
785
786 /* Assertion brackets. Check the alternative branches in turn - the
787 matching won't pass the KET for an assertion. If any one branch matches,
788 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
789 start of each branch to move the current point backwards, so the code at
790 this level is identical to the lookahead case. */
791
792 case OP_ASSERT:
793 case OP_ASSERTBACK:
794 do
795 {
6bf342e1 796 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
8ac170f3
PH
797 if (rrc == MATCH_MATCH) break;
798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
799 ecode += GET(ecode, 1);
800 }
801 while (*ecode == OP_ALT);
802 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
803
804 /* If checking an assertion for a condition, return MATCH_MATCH. */
805
806 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
807
808 /* Continue from after the assertion, updating the offsets high water
809 mark, since extracts may have been taken during the assertion. */
810
811 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
812 ecode += 1 + LINK_SIZE;
813 offset_top = md->end_offset_top;
814 continue;
815
816 /* Negative assertion: all branches must fail to match */
817
818 case OP_ASSERT_NOT:
819 case OP_ASSERTBACK_NOT:
820 do
821 {
6bf342e1 822 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
8ac170f3
PH
823 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
825 ecode += GET(ecode,1);
826 }
827 while (*ecode == OP_ALT);
828
829 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
830
831 ecode += 1 + LINK_SIZE;
832 continue;
833
834 /* Move the subject pointer back. This occurs only at the start of
835 each branch of a lookbehind assertion. If we are too close to the start to
836 move back, this match function fails. When working with UTF-8 we move
837 back a number of characters, not bytes. */
838
839 case OP_REVERSE:
840#ifdef SUPPORT_UTF8
841 if (utf8)
842 {
6bf342e1
PH
843 i = GET(ecode, 1);
844 while (i-- > 0)
8ac170f3
PH
845 {
846 eptr--;
847 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
848 BACKCHAR(eptr)
849 }
850 }
851 else
852#endif
853
854 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
855
856 {
6bf342e1 857 eptr -= GET(ecode, 1);
8ac170f3
PH
858 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
859 }
860
861 /* Skip to next op code */
862
863 ecode += 1 + LINK_SIZE;
864 break;
865
866 /* The callout item calls an external function, if one is provided, passing
867 details of the match so far. This is mainly for debugging, though the
868 function is able to force a failure. */
869
870 case OP_CALLOUT:
871 if (pcre_callout != NULL)
872 {
873 pcre_callout_block cb;
874 cb.version = 1; /* Version 1 of the callout block */
875 cb.callout_number = ecode[1];
876 cb.offset_vector = md->offset_vector;
aa41d2de 877 cb.subject = (PCRE_SPTR)md->start_subject;
8ac170f3
PH
878 cb.subject_length = md->end_subject - md->start_subject;
879 cb.start_match = md->start_match - md->start_subject;
880 cb.current_position = eptr - md->start_subject;
881 cb.pattern_position = GET(ecode, 2);
882 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
883 cb.capture_top = offset_top/2;
884 cb.capture_last = md->capture_last;
885 cb.callout_data = md->callout_data;
886 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
887 if (rrc < 0) RRETURN(rrc);
888 }
889 ecode += 2 + 2*LINK_SIZE;
890 break;
891
892 /* Recursion either matches the current regex, or some subexpression. The
893 offset data is the offset to the starting bracket from the start of the
894 whole pattern. (This is so that it works from duplicated subpatterns.)
895
896 If there are any capturing brackets started but not finished, we have to
897 save their starting points and reinstate them after the recursion. However,
898 we don't know how many such there are (offset_top records the completed
899 total) so we just have to save all the potential data. There may be up to
900 65535 such values, which is too large to put on the stack, but using malloc
901 for small numbers seems expensive. As a compromise, the stack is used when
902 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
903 is used. A problem is what to do if the malloc fails ... there is no way of
904 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
905 values on the stack, and accept that the rest may be wrong.
906
907 There are also other values that have to be saved. We use a chained
908 sequence of blocks that actually live on the stack. Thanks to Robin Houston
909 for the original version of this logic. */
910
911 case OP_RECURSE:
912 {
913 callpat = md->start_code + GET(ecode, 1);
6bf342e1
PH
914 new_recursive.group_num = (callpat == md->start_code)? 0 :
915 GET2(callpat, 1 + LINK_SIZE);
8ac170f3
PH
916
917 /* Add to "recursing stack" */
918
919 new_recursive.prevrec = md->recursive;
920 md->recursive = &new_recursive;
921
922 /* Find where to continue from afterwards */
923
924 ecode += 1 + LINK_SIZE;
925 new_recursive.after_call = ecode;
926
927 /* Now save the offset data. */
928
929 new_recursive.saved_max = md->offset_end;
930 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
931 new_recursive.offset_save = stacksave;
932 else
933 {
934 new_recursive.offset_save =
935 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
936 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
937 }
938
939 memcpy(new_recursive.offset_save, md->offset_vector,
940 new_recursive.saved_max * sizeof(int));
941 new_recursive.save_start = md->start_match;
942 md->start_match = eptr;
943
944 /* OK, now we can do the recursion. For each top-level alternative we
945 restore the offset and recursion data. */
946
947 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6bf342e1 948 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
8ac170f3
PH
949 do
950 {
6bf342e1
PH
951 RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
952 md, ims, eptrb, flags);
8ac170f3
PH
953 if (rrc == MATCH_MATCH)
954 {
aa41d2de 955 DPRINTF(("Recursion matched\n"));
8ac170f3
PH
956 md->recursive = new_recursive.prevrec;
957 if (new_recursive.offset_save != stacksave)
958 (pcre_free)(new_recursive.offset_save);
959 RRETURN(MATCH_MATCH);
960 }
aa41d2de
PH
961 else if (rrc != MATCH_NOMATCH)
962 {
963 DPRINTF(("Recursion gave error %d\n", rrc));
964 RRETURN(rrc);
965 }
8ac170f3
PH
966
967 md->recursive = &new_recursive;
968 memcpy(md->offset_vector, new_recursive.offset_save,
969 new_recursive.saved_max * sizeof(int));
970 callpat += GET(callpat, 1);
971 }
972 while (*callpat == OP_ALT);
973
974 DPRINTF(("Recursion didn't match\n"));
975 md->recursive = new_recursive.prevrec;
976 if (new_recursive.offset_save != stacksave)
977 (pcre_free)(new_recursive.offset_save);
978 RRETURN(MATCH_NOMATCH);
979 }
980 /* Control never reaches here */
981
982 /* "Once" brackets are like assertion brackets except that after a match,
983 the point in the subject string is not moved back. Thus there can never be
984 a move back into the brackets. Friedl calls these "atomic" subpatterns.
985 Check the alternative branches in turn - the matching won't pass the KET
986 for this kind of subpattern. If any one branch matches, we carry on as at
987 the end of a normal bracket, leaving the subject pointer. */
988
989 case OP_ONCE:
aa41d2de
PH
990 prev = ecode;
991 saved_eptr = eptr;
8ac170f3 992
aa41d2de
PH
993 do
994 {
995 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6bf342e1 996 eptrb, 0);
aa41d2de
PH
997 if (rrc == MATCH_MATCH) break;
998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
999 ecode += GET(ecode,1);
1000 }
1001 while (*ecode == OP_ALT);
8ac170f3 1002
aa41d2de 1003 /* If hit the end of the group (which could be repeated), fail */
8ac170f3 1004
aa41d2de 1005 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
8ac170f3 1006
aa41d2de
PH
1007 /* Continue as from after the assertion, updating the offsets high water
1008 mark, since extracts may have been taken. */
8ac170f3 1009
6bf342e1 1010 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
8ac170f3 1011
aa41d2de
PH
1012 offset_top = md->end_offset_top;
1013 eptr = md->end_match_ptr;
8ac170f3 1014
aa41d2de
PH
1015 /* For a non-repeating ket, just continue at this level. This also
1016 happens for a repeating ket if no characters were matched in the group.
1017 This is the forcible breaking of infinite loops as implemented in Perl
1018 5.005. If there is an options reset, it will get obeyed in the normal
1019 course of events. */
8ac170f3 1020
aa41d2de
PH
1021 if (*ecode == OP_KET || eptr == saved_eptr)
1022 {
1023 ecode += 1+LINK_SIZE;
1024 break;
1025 }
8ac170f3 1026
aa41d2de
PH
1027 /* The repeating kets try the rest of the pattern or restart from the
1028 preceding bracket, in the appropriate order. The second "call" of match()
1029 uses tail recursion, to avoid using another stack frame. We need to reset
1030 any options that changed within the bracket before re-running it, so
1031 check the next opcode. */
8ac170f3 1032
aa41d2de
PH
1033 if (ecode[1+LINK_SIZE] == OP_OPT)
1034 {
1035 ims = (ims & ~PCRE_IMS) | ecode[4];
1036 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1037 }
8ac170f3 1038
aa41d2de
PH
1039 if (*ecode == OP_KETRMIN)
1040 {
1041 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1043 ecode = prev;
6bf342e1 1044 flags = match_tail_recursed;
aa41d2de
PH
1045 goto TAIL_RECURSE;
1046 }
1047 else /* OP_KETRMAX */
1048 {
6bf342e1 1049 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
aa41d2de
PH
1050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1051 ecode += 1 + LINK_SIZE;
6bf342e1 1052 flags = match_tail_recursed;
aa41d2de 1053 goto TAIL_RECURSE;
8ac170f3 1054 }
aa41d2de 1055 /* Control never gets here */
8ac170f3
PH
1056
1057 /* An alternation is the end of a branch; scan along to find the end of the
1058 bracketed group and go to there. */
1059
1060 case OP_ALT:
1061 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1062 break;
1063
1064 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1065 that it may occur zero times. It may repeat infinitely, or not at all -
1066 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1067 repeat limits are compiled as a number of copies, with the optional ones
1068 preceded by BRAZERO or BRAMINZERO. */
1069
1070 case OP_BRAZERO:
1071 {
1072 next = ecode+1;
6bf342e1 1073 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
8ac170f3
PH
1074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1075 do next += GET(next,1); while (*next == OP_ALT);
6bf342e1 1076 ecode = next + 1 + LINK_SIZE;
8ac170f3
PH
1077 }
1078 break;
1079
1080 case OP_BRAMINZERO:
1081 {
1082 next = ecode+1;
6bf342e1
PH
1083 do next += GET(next, 1); while (*next == OP_ALT);
1084 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
8ac170f3
PH
1085 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1086 ecode++;
1087 }
1088 break;
1089
6bf342e1 1090 /* End of a group, repeated or non-repeating. */
8ac170f3
PH
1091
1092 case OP_KET:
1093 case OP_KETRMIN:
1094 case OP_KETRMAX:
aa41d2de 1095 prev = ecode - GET(ecode, 1);
8ac170f3 1096
6bf342e1
PH
1097 /* If this was a group that remembered the subject start, in order to break
1098 infinite repeats of empty string matches, retrieve the subject start from
1099 the chain. Otherwise, set it NULL. */
1100
1101 if (*prev >= OP_SBRA)
1102 {
1103 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1104 eptrb = eptrb->epb_prev; /* Backup to previous group */
1105 }
1106 else saved_eptr = NULL;
8ac170f3 1107
6bf342e1
PH
1108 /* If we are at the end of an assertion group, stop matching and return
1109 MATCH_MATCH, but record the current high water mark for use by positive
1110 assertions. Do this also for the "once" (atomic) groups. */
8ac170f3 1111
aa41d2de
PH
1112 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1113 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1114 *prev == OP_ONCE)
1115 {
1116 md->end_match_ptr = eptr; /* For ONCE */
1117 md->end_offset_top = offset_top;
1118 RRETURN(MATCH_MATCH);
1119 }
8ac170f3 1120
6bf342e1
PH
1121 /* For capturing groups we have to check the group number back at the start
1122 and if necessary complete handling an extraction by setting the offsets and
1123 bumping the high water mark. Note that whole-pattern recursion is coded as
1124 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1125 when the OP_END is reached. Other recursion is handled here. */
8ac170f3 1126
6bf342e1 1127 if (*prev == OP_CBRA || *prev == OP_SCBRA)
aa41d2de 1128 {
6bf342e1 1129 number = GET2(prev, 1+LINK_SIZE);
aa41d2de 1130 offset = number << 1;
8ac170f3
PH
1131
1132#ifdef DEBUG
aa41d2de
PH
1133 printf("end bracket %d", number);
1134 printf("\n");
8ac170f3
PH
1135#endif
1136
6bf342e1
PH
1137 md->capture_last = number;
1138 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
aa41d2de 1139 {
6bf342e1
PH
1140 md->offset_vector[offset] =
1141 md->offset_vector[md->offset_end - number];
1142 md->offset_vector[offset+1] = eptr - md->start_subject;
1143 if (offset_top <= offset) offset_top = offset + 2;
1144 }
8ac170f3 1145
6bf342e1
PH
1146 /* Handle a recursively called group. Restore the offsets
1147 appropriately and continue from after the call. */
8ac170f3 1148
6bf342e1
PH
1149 if (md->recursive != NULL && md->recursive->group_num == number)
1150 {
1151 recursion_info *rec = md->recursive;
1152 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1153 md->recursive = rec->prevrec;
1154 md->start_match = rec->save_start;
1155 memcpy(md->offset_vector, rec->offset_save,
1156 rec->saved_max * sizeof(int));
1157 ecode = rec->after_call;
1158 ims = original_ims;
1159 break;
8ac170f3 1160 }
aa41d2de 1161 }
8ac170f3 1162
6bf342e1
PH
1163 /* For both capturing and non-capturing groups, reset the value of the ims
1164 flags, in case they got changed during the group. */
8ac170f3 1165
aa41d2de
PH
1166 ims = original_ims;
1167 DPRINTF(("ims reset to %02lx\n", ims));
8ac170f3 1168
aa41d2de
PH
1169 /* For a non-repeating ket, just continue at this level. This also
1170 happens for a repeating ket if no characters were matched in the group.
1171 This is the forcible breaking of infinite loops as implemented in Perl
1172 5.005. If there is an options reset, it will get obeyed in the normal
1173 course of events. */
8ac170f3 1174
aa41d2de
PH
1175 if (*ecode == OP_KET || eptr == saved_eptr)
1176 {
1177 ecode += 1 + LINK_SIZE;
1178 break;
1179 }
8ac170f3 1180
aa41d2de
PH
1181 /* The repeating kets try the rest of the pattern or restart from the
1182 preceding bracket, in the appropriate order. In the second case, we can use
1183 tail recursion to avoid using another stack frame. */
8ac170f3 1184
6bf342e1
PH
1185 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1186
aa41d2de
PH
1187 if (*ecode == OP_KETRMIN)
1188 {
1189 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1190 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191 ecode = prev;
6bf342e1 1192 flags |= match_tail_recursed;
aa41d2de 1193 goto TAIL_RECURSE;
8ac170f3 1194 }
aa41d2de
PH
1195 else /* OP_KETRMAX */
1196 {
6bf342e1 1197 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
aa41d2de
PH
1198 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1199 ecode += 1 + LINK_SIZE;
6bf342e1 1200 flags = match_tail_recursed;
aa41d2de
PH
1201 goto TAIL_RECURSE;
1202 }
1203 /* Control never gets here */
8ac170f3
PH
1204
1205 /* Start of subject unless notbol, or after internal newline if multiline */
1206
1207 case OP_CIRC:
1208 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1209 if ((ims & PCRE_MULTILINE) != 0)
1210 {
aa41d2de 1211 if (eptr != md->start_subject &&
6bf342e1 1212 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
8ac170f3
PH
1213 RRETURN(MATCH_NOMATCH);
1214 ecode++;
1215 break;
1216 }
1217 /* ... else fall through */
1218
1219 /* Start of subject assertion */
1220
1221 case OP_SOD:
1222 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1223 ecode++;
1224 break;
1225
1226 /* Start of match assertion */
1227
1228 case OP_SOM:
1229 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1230 ecode++;
1231 break;
1232
1233 /* Assert before internal newline if multiline, or before a terminating
1234 newline unless endonly is set, else end of subject unless noteol is set. */
1235
1236 case OP_DOLL:
1237 if ((ims & PCRE_MULTILINE) != 0)
1238 {
1239 if (eptr < md->end_subject)
aa41d2de 1240 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
8ac170f3
PH
1241 else
1242 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1243 ecode++;
1244 break;
1245 }
1246 else
1247 {
1248 if (md->noteol) RRETURN(MATCH_NOMATCH);
1249 if (!md->endonly)
1250 {
aa41d2de 1251 if (eptr != md->end_subject &&
6bf342e1 1252 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
8ac170f3
PH
1253 RRETURN(MATCH_NOMATCH);
1254 ecode++;
1255 break;
1256 }
1257 }
aa41d2de 1258 /* ... else fall through for endonly */
8ac170f3
PH
1259
1260 /* End of subject assertion (\z) */
1261
1262 case OP_EOD:
1263 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1264 ecode++;
1265 break;
1266
1267 /* End of subject or ending \n assertion (\Z) */
1268
1269 case OP_EODN:
aa41d2de 1270 if (eptr != md->end_subject &&
6bf342e1 1271 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
aa41d2de 1272 RRETURN(MATCH_NOMATCH);
8ac170f3
PH
1273 ecode++;
1274 break;
1275
1276 /* Word boundary assertions */
1277
1278 case OP_NOT_WORD_BOUNDARY:
1279 case OP_WORD_BOUNDARY:
1280 {
1281
1282 /* Find out if the previous and current characters are "word" characters.
1283 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1284 be "non-word" characters. */
1285
1286#ifdef SUPPORT_UTF8
1287 if (utf8)
1288 {
1289 if (eptr == md->start_subject) prev_is_word = FALSE; else
1290 {
1291 const uschar *lastptr = eptr - 1;
1292 while((*lastptr & 0xc0) == 0x80) lastptr--;
1293 GETCHAR(c, lastptr);
1294 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1295 }
1296 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1297 {
1298 GETCHAR(c, eptr);
1299 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1300 }
1301 }
1302 else
1303#endif
1304
1305 /* More streamlined when not in UTF-8 mode */
1306
1307 {
1308 prev_is_word = (eptr != md->start_subject) &&
1309 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1310 cur_is_word = (eptr < md->end_subject) &&
1311 ((md->ctypes[*eptr] & ctype_word) != 0);
1312 }
1313
1314 /* Now see if the situation is what we want */
1315
1316 if ((*ecode++ == OP_WORD_BOUNDARY)?
1317 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1318 RRETURN(MATCH_NOMATCH);
1319 }
1320 break;
1321
1322 /* Match a single character type; inline for speed */
1323
1324 case OP_ANY:
aa41d2de
PH
1325 if ((ims & PCRE_DOTALL) == 0)
1326 {
6bf342e1 1327 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
aa41d2de 1328 }
8ac170f3 1329 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
8ac170f3
PH
1330 if (utf8)
1331 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8ac170f3
PH
1332 ecode++;
1333 break;
1334
1335 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1336 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1337
1338 case OP_ANYBYTE:
1339 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1340 ecode++;
1341 break;
1342
1343 case OP_NOT_DIGIT:
1344 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1345 GETCHARINCTEST(c, eptr);
1346 if (
1347#ifdef SUPPORT_UTF8
1348 c < 256 &&
1349#endif
1350 (md->ctypes[c] & ctype_digit) != 0
1351 )
1352 RRETURN(MATCH_NOMATCH);
1353 ecode++;
1354 break;
1355
1356 case OP_DIGIT:
1357 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1358 GETCHARINCTEST(c, eptr);
1359 if (
1360#ifdef SUPPORT_UTF8
1361 c >= 256 ||
1362#endif
1363 (md->ctypes[c] & ctype_digit) == 0
1364 )
1365 RRETURN(MATCH_NOMATCH);
1366 ecode++;
1367 break;
1368
1369 case OP_NOT_WHITESPACE:
1370 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1371 GETCHARINCTEST(c, eptr);
1372 if (
1373#ifdef SUPPORT_UTF8
1374 c < 256 &&
1375#endif
1376 (md->ctypes[c] & ctype_space) != 0
1377 )
1378 RRETURN(MATCH_NOMATCH);
1379 ecode++;
1380 break;
1381
1382 case OP_WHITESPACE:
1383 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1384 GETCHARINCTEST(c, eptr);
1385 if (
1386#ifdef SUPPORT_UTF8
1387 c >= 256 ||
1388#endif
1389 (md->ctypes[c] & ctype_space) == 0
1390 )
1391 RRETURN(MATCH_NOMATCH);
1392 ecode++;
1393 break;
1394
1395 case OP_NOT_WORDCHAR:
1396 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1397 GETCHARINCTEST(c, eptr);
1398 if (
1399#ifdef SUPPORT_UTF8
1400 c < 256 &&
1401#endif
1402 (md->ctypes[c] & ctype_word) != 0
1403 )
1404 RRETURN(MATCH_NOMATCH);
1405 ecode++;
1406 break;
1407
1408 case OP_WORDCHAR:
1409 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1410 GETCHARINCTEST(c, eptr);
1411 if (
1412#ifdef SUPPORT_UTF8
1413 c >= 256 ||
1414#endif
1415 (md->ctypes[c] & ctype_word) == 0
1416 )
1417 RRETURN(MATCH_NOMATCH);
1418 ecode++;
1419 break;
1420
6bf342e1
PH
1421 case OP_ANYNL:
1422 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1423 GETCHARINCTEST(c, eptr);
1424 switch(c)
1425 {
1426 default: RRETURN(MATCH_NOMATCH);
1427 case 0x000d:
1428 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1429 break;
1430 case 0x000a:
1431 case 0x000b:
1432 case 0x000c:
1433 case 0x0085:
1434 case 0x2028:
1435 case 0x2029:
1436 break;
1437 }
1438 ecode++;
1439 break;
1440
8ac170f3
PH
1441#ifdef SUPPORT_UCP
1442 /* Check the next character by Unicode property. We will get here only
1443 if the support is in the binary; otherwise a compile-time error occurs. */
1444
1445 case OP_PROP:
1446 case OP_NOTPROP:
1447 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1448 GETCHARINCTEST(c, eptr);
1449 {
aa41d2de
PH
1450 int chartype, script;
1451 int category = _pcre_ucp_findprop(c, &chartype, &script);
8ac170f3 1452
aa41d2de 1453 switch(ecode[1])
8ac170f3 1454 {
aa41d2de
PH
1455 case PT_ANY:
1456 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1457 break;
1458
1459 case PT_LAMP:
1460 if ((chartype == ucp_Lu ||
1461 chartype == ucp_Ll ||
1462 chartype == ucp_Lt) == (op == OP_NOTPROP))
8ac170f3 1463 RRETURN(MATCH_NOMATCH);
aa41d2de
PH
1464 break;
1465
1466 case PT_GC:
1467 if ((ecode[2] != category) == (op == OP_PROP))
1468 RRETURN(MATCH_NOMATCH);
1469 break;
1470
1471 case PT_PC:
1472 if ((ecode[2] != chartype) == (op == OP_PROP))
8ac170f3 1473 RRETURN(MATCH_NOMATCH);
aa41d2de
PH
1474 break;
1475
1476 case PT_SC:
1477 if ((ecode[2] != script) == (op == OP_PROP))
1478 RRETURN(MATCH_NOMATCH);
1479 break;
1480
1481 default:
1482 RRETURN(PCRE_ERROR_INTERNAL);
8ac170f3 1483 }
aa41d2de
PH
1484
1485 ecode += 3;
8ac170f3
PH
1486 }
1487 break;
1488
1489 /* Match an extended Unicode sequence. We will get here only if the support
1490 is in the binary; otherwise a compile-time error occurs. */
1491
1492 case OP_EXTUNI:
1493 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1494 GETCHARINCTEST(c, eptr);
1495 {
aa41d2de
PH
1496 int chartype, script;
1497 int category = _pcre_ucp_findprop(c, &chartype, &script);
8ac170f3
PH
1498 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1499 while (eptr < md->end_subject)
1500 {
1501 int len = 1;
1502 if (!utf8) c = *eptr; else
1503 {
1504 GETCHARLEN(c, eptr, len);
1505 }
aa41d2de 1506 category = _pcre_ucp_findprop(c, &chartype, &script);
8ac170f3
PH
1507 if (category != ucp_M) break;
1508 eptr += len;
1509 }
1510 }
1511 ecode++;
1512 break;
1513#endif
1514
1515
1516 /* Match a back reference, possibly repeatedly. Look past the end of the
1517 item to see if there is repeat information following. The code is similar
1518 to that for character classes, but repeated for efficiency. Then obey
1519 similar code to character type repeats - written out again for speed.
1520 However, if the referenced string is the empty string, always treat
1521 it as matched, any number of times (otherwise there could be infinite
1522 loops). */
1523
1524 case OP_REF:
1525 {
1526 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1527 ecode += 3; /* Advance past item */
1528
1529 /* If the reference is unset, set the length to be longer than the amount
1530 of subject left; this ensures that every attempt at a match fails. We
1531 can't just fail here, because of the possibility of quantifiers with zero
1532 minima. */
1533
1534 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1535 md->end_subject - eptr + 1 :
1536 md->offset_vector[offset+1] - md->offset_vector[offset];
1537
1538 /* Set up for repetition, or handle the non-repeated case */
1539
1540 switch (*ecode)
1541 {
1542 case OP_CRSTAR:
1543 case OP_CRMINSTAR:
1544 case OP_CRPLUS:
1545 case OP_CRMINPLUS:
1546 case OP_CRQUERY:
1547 case OP_CRMINQUERY:
1548 c = *ecode++ - OP_CRSTAR;
1549 minimize = (c & 1) != 0;
1550 min = rep_min[c]; /* Pick up values from tables; */
1551 max = rep_max[c]; /* zero for max => infinity */
1552 if (max == 0) max = INT_MAX;
1553 break;
1554
1555 case OP_CRRANGE:
1556 case OP_CRMINRANGE:
1557 minimize = (*ecode == OP_CRMINRANGE);
1558 min = GET2(ecode, 1);
1559 max = GET2(ecode, 3);
1560 if (max == 0) max = INT_MAX;
1561 ecode += 5;
1562 break;
1563
1564 default: /* No repeat follows */
1565 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1566 eptr += length;
1567 continue; /* With the main loop */
1568 }
1569
1570 /* If the length of the reference is zero, just continue with the
1571 main loop. */
1572
1573 if (length == 0) continue;
1574
1575 /* First, ensure the minimum number of matches are present. We get back
1576 the length of the reference string explicitly rather than passing the
1577 address of eptr, so that eptr can be a register variable. */
1578
1579 for (i = 1; i <= min; i++)
1580 {
1581 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1582 eptr += length;
1583 }
1584
1585 /* If min = max, continue at the same level without recursion.
1586 They are not both allowed to be zero. */
1587
1588 if (min == max) continue;
1589
1590 /* If minimizing, keep trying and advancing the pointer */
1591
1592 if (minimize)
1593 {
1594 for (fi = min;; fi++)
1595 {
1596 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1597 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1598 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1599 RRETURN(MATCH_NOMATCH);
1600 eptr += length;
1601 }
1602 /* Control never gets here */
1603 }
1604
1605 /* If maximizing, find the longest string and work backwards */
1606
1607 else
1608 {
1609 pp = eptr;
1610 for (i = min; i < max; i++)
1611 {
1612 if (!match_ref(offset, eptr, length, md, ims)) break;
1613 eptr += length;
1614 }
1615 while (eptr >= pp)
1616 {
1617 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1619 eptr -= length;
1620 }
1621 RRETURN(MATCH_NOMATCH);
1622 }
1623 }
1624 /* Control never gets here */
1625
1626
1627
1628 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1629 used when all the characters in the class have values in the range 0-255,
1630 and either the matching is caseful, or the characters are in the range
1631 0-127 when UTF-8 processing is enabled. The only difference between
1632 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1633 encountered.
1634
1635 First, look past the end of the item to see if there is repeat information
1636 following. Then obey similar code to character type repeats - written out
1637 again for speed. */
1638
1639 case OP_NCLASS:
1640 case OP_CLASS:
1641 {
1642 data = ecode + 1; /* Save for matching */
1643 ecode += 33; /* Advance past the item */
1644
1645 switch (*ecode)
1646 {
1647 case OP_CRSTAR:
1648 case OP_CRMINSTAR:
1649 case OP_CRPLUS:
1650 case OP_CRMINPLUS:
1651 case OP_CRQUERY:
1652 case OP_CRMINQUERY:
1653 c = *ecode++ - OP_CRSTAR;
1654 minimize = (c & 1) != 0;
1655 min = rep_min[c]; /* Pick up values from tables; */
1656 max = rep_max[c]; /* zero for max => infinity */
1657 if (max == 0) max = INT_MAX;
1658 break;
1659
1660 case OP_CRRANGE:
1661 case OP_CRMINRANGE:
1662 minimize = (*ecode == OP_CRMINRANGE);
1663 min = GET2(ecode, 1);
1664 max = GET2(ecode, 3);
1665 if (max == 0) max = INT_MAX;
1666 ecode += 5;
1667 break;
1668
1669 default: /* No repeat follows */
1670 min = max = 1;
1671 break;
1672 }
1673
1674 /* First, ensure the minimum number of matches are present. */
1675
1676#ifdef SUPPORT_UTF8
1677 /* UTF-8 mode */
1678 if (utf8)
1679 {
1680 for (i = 1; i <= min; i++)
1681 {
1682 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1683 GETCHARINC(c, eptr);
1684 if (c > 255)
1685 {
1686 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1687 }
1688 else
1689 {
1690 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1691 }
1692 }
1693 }
1694 else
1695#endif
1696 /* Not UTF-8 mode */
1697 {
1698 for (i = 1; i <= min; i++)
1699 {
1700 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1701 c = *eptr++;
1702 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1703 }
1704 }
1705
1706 /* If max == min we can continue with the main loop without the
1707 need to recurse. */
1708
1709 if (min == max) continue;
1710
1711 /* If minimizing, keep testing the rest of the expression and advancing
1712 the pointer while it matches the class. */
1713
1714 if (minimize)
1715 {
1716#ifdef SUPPORT_UTF8
1717 /* UTF-8 mode */
1718 if (utf8)
1719 {
1720 for (fi = min;; fi++)
1721 {
1722 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1724 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1725 GETCHARINC(c, eptr);
1726 if (c > 255)
1727 {
1728 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1729 }
1730 else
1731 {
1732 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1733 }
1734 }
1735 }
1736 else
1737#endif
1738 /* Not UTF-8 mode */
1739 {
1740 for (fi = min;; fi++)
1741 {
1742 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1743 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1744 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1745 c = *eptr++;
1746 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1747 }
1748 }
1749 /* Control never gets here */
1750 }
1751
1752 /* If maximizing, find the longest possible run, then work backwards. */
1753
1754 else
1755 {
1756 pp = eptr;
1757
1758#ifdef SUPPORT_UTF8
1759 /* UTF-8 mode */
1760 if (utf8)
1761 {
1762 for (i = min; i < max; i++)
1763 {
1764 int len = 1;
1765 if (eptr >= md->end_subject) break;
1766 GETCHARLEN(c, eptr, len);
1767 if (c > 255)
1768 {
1769 if (op == OP_CLASS) break;
1770 }
1771 else
1772 {
1773 if ((data[c/8] & (1 << (c&7))) == 0) break;
1774 }
1775 eptr += len;
1776 }
1777 for (;;)
1778 {
1779 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1780 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1781 if (eptr-- == pp) break; /* Stop if tried at original pos */
1782 BACKCHAR(eptr);
1783 }
1784 }
1785 else
1786#endif
1787 /* Not UTF-8 mode */
1788 {
1789 for (i = min; i < max; i++)
1790 {
1791 if (eptr >= md->end_subject) break;
1792 c = *eptr;
1793 if ((data[c/8] & (1 << (c&7))) == 0) break;
1794 eptr++;
1795 }
1796 while (eptr >= pp)
1797 {
1798 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8ac170f3 1799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
aa41d2de 1800 eptr--;
8ac170f3
PH
1801 }
1802 }
1803
1804 RRETURN(MATCH_NOMATCH);
1805 }
1806 }
1807 /* Control never gets here */
1808
1809
1810 /* Match an extended character class. This opcode is encountered only
1811 in UTF-8 mode, because that's the only time it is compiled. */
1812
1813#ifdef SUPPORT_UTF8
1814 case OP_XCLASS:
1815 {
1816 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1817 ecode += GET(ecode, 1); /* Advance past the item */
1818
1819 switch (*ecode)
1820 {
1821 case OP_CRSTAR:
1822 case OP_CRMINSTAR:
1823 case OP_CRPLUS:
1824 case OP_CRMINPLUS:
1825 case OP_CRQUERY:
1826 case OP_CRMINQUERY:
1827 c = *ecode++ - OP_CRSTAR;
1828 minimize = (c & 1) != 0;
1829 min = rep_min[c]; /* Pick up values from tables; */
1830 max = rep_max[c]; /* zero for max => infinity */
1831 if (max == 0) max = INT_MAX;
1832 break;
1833
1834 case OP_CRRANGE:
1835 case OP_CRMINRANGE:
1836 minimize = (*ecode == OP_CRMINRANGE);
1837 min = GET2(ecode, 1);
1838 max = GET2(ecode, 3);
1839 if (max == 0) max = INT_MAX;
1840 ecode += 5;
1841 break;
1842
1843 default: /* No repeat follows */
1844 min = max = 1;
1845 break;
1846 }
1847
1848 /* First, ensure the minimum number of matches are present. */
1849
1850 for (i = 1; i <= min; i++)
1851 {
1852 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1853 GETCHARINC(c, eptr);
1854 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1855 }
1856
1857 /* If max == min we can continue with the main loop without the
1858 need to recurse. */
1859
1860 if (min == max) continue;
1861
1862 /* If minimizing, keep testing the rest of the expression and advancing
1863 the pointer while it matches the class. */
1864
1865 if (minimize)
1866 {
1867 for (fi = min;; fi++)
1868 {
1869 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1870 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1871 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1872 GETCHARINC(c, eptr);
1873 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1874 }
1875 /* Control never gets here */
1876 }
1877
1878 /* If maximizing, find the longest possible run, then work backwards. */
1879
1880 else
1881 {
1882 pp = eptr;
1883 for (i = min; i < max; i++)
1884 {
1885 int len = 1;
1886 if (eptr >= md->end_subject) break;
1887 GETCHARLEN(c, eptr, len);
1888 if (!_pcre_xclass(c, data)) break;
1889 eptr += len;
1890 }
1891 for(;;)
1892 {
1893 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1895 if (eptr-- == pp) break; /* Stop if tried at original pos */
1896 BACKCHAR(eptr)
1897 }
1898 RRETURN(MATCH_NOMATCH);
1899 }
1900
1901 /* Control never gets here */
1902 }
1903#endif /* End of XCLASS */
1904
1905 /* Match a single character, casefully */
1906
1907 case OP_CHAR:
1908#ifdef SUPPORT_UTF8
1909 if (utf8)
1910 {
1911 length = 1;
1912 ecode++;
1913 GETCHARLEN(fc, ecode, length);
1914 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1915 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1916 }
1917 else
1918#endif
1919
1920 /* Non-UTF-8 mode */
1921 {
1922 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1923 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1924 ecode += 2;
1925 }
1926 break;
1927
1928 /* Match a single character, caselessly */
1929
1930 case OP_CHARNC:
1931#ifdef SUPPORT_UTF8
1932 if (utf8)
1933 {
1934 length = 1;
1935 ecode++;
1936 GETCHARLEN(fc, ecode, length);
1937
1938 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1939
1940 /* If the pattern character's value is < 128, we have only one byte, and
1941 can use the fast lookup table. */
1942
1943 if (fc < 128)
1944 {
1945 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1946 }
1947
1948 /* Otherwise we must pick up the subject character */
1949
1950 else
1951 {
6bf342e1 1952 unsigned int dc;
8ac170f3
PH
1953 GETCHARINC(dc, eptr);
1954 ecode += length;
1955
1956 /* If we have Unicode property support, we can use it to test the other
aa41d2de 1957 case of the character, if there is one. */
8ac170f3
PH
1958
1959 if (fc != dc)
1960 {
1961#ifdef SUPPORT_UCP
aa41d2de 1962 if (dc != _pcre_ucp_othercase(fc))
8ac170f3
PH
1963#endif
1964 RRETURN(MATCH_NOMATCH);
1965 }
1966 }
1967 }
1968 else
1969#endif /* SUPPORT_UTF8 */
1970
1971 /* Non-UTF-8 mode */
1972 {
1973 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1974 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1975 ecode += 2;
1976 }
1977 break;
1978
6bf342e1 1979 /* Match a single character repeatedly. */
8ac170f3
PH
1980
1981 case OP_EXACT:
1982 min = max = GET2(ecode, 1);
1983 ecode += 3;
1984 goto REPEATCHAR;
1985
6bf342e1
PH
1986 case OP_POSUPTO:
1987 possessive = TRUE;
1988 /* Fall through */
1989
8ac170f3
PH
1990 case OP_UPTO:
1991 case OP_MINUPTO:
1992 min = 0;
1993 max = GET2(ecode, 1);
1994 minimize = *ecode == OP_MINUPTO;
1995 ecode += 3;
1996 goto REPEATCHAR;
1997
6bf342e1
PH
1998 case OP_POSSTAR:
1999 possessive = TRUE;
2000 min = 0;
2001 max = INT_MAX;
2002 ecode++;
2003 goto REPEATCHAR;
2004
2005 case OP_POSPLUS:
2006 possessive = TRUE;
2007 min = 1;
2008 max = INT_MAX;
2009 ecode++;
2010 goto REPEATCHAR;
2011
2012 case OP_POSQUERY:
2013 possessive = TRUE;
2014 min = 0;
2015 max = 1;
2016 ecode++;
2017 goto REPEATCHAR;
2018
8ac170f3
PH
2019 case OP_STAR:
2020 case OP_MINSTAR:
2021 case OP_PLUS:
2022 case OP_MINPLUS:
2023 case OP_QUERY:
2024 case OP_MINQUERY:
2025 c = *ecode++ - OP_STAR;
2026 minimize = (c & 1) != 0;
2027 min = rep_min[c]; /* Pick up values from tables; */
2028 max = rep_max[c]; /* zero for max => infinity */
2029 if (max == 0) max = INT_MAX;
2030
2031 /* Common code for all repeated single-character matches. We can give
2032 up quickly if there are fewer than the minimum number of characters left in
2033 the subject. */
2034
2035 REPEATCHAR:
2036#ifdef SUPPORT_UTF8
2037 if (utf8)
2038 {
2039 length = 1;
2040 charptr = ecode;
2041 GETCHARLEN(fc, ecode, length);
2042 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2043 ecode += length;
2044
2045 /* Handle multibyte character matching specially here. There is
2046 support for caseless matching if UCP support is present. */
2047
2048 if (length > 1)
2049 {
2050 int oclength = 0;
2051 uschar occhars[8];
2052
2053#ifdef SUPPORT_UCP
6bf342e1 2054 unsigned int othercase;
8ac170f3 2055 if ((ims & PCRE_CASELESS) != 0 &&
6bf342e1 2056 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
8ac170f3
PH
2057 oclength = _pcre_ord2utf8(othercase, occhars);
2058#endif /* SUPPORT_UCP */
2059
2060 for (i = 1; i <= min; i++)
2061 {
2062 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2063 /* Need braces because of following else */
2064 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2065 else
2066 {
2067 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2068 eptr += oclength;
2069 }
2070 }
2071
2072 if (min == max) continue;
2073
2074 if (minimize)
2075 {
2076 for (fi = min;; fi++)
2077 {
2078 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2079 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2080 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2081 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2082 /* Need braces because of following else */
2083 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2084 else
2085 {
2086 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2087 eptr += oclength;
2088 }
2089 }
2090 /* Control never gets here */
2091 }
6bf342e1
PH
2092
2093 else /* Maximize */
8ac170f3
PH
2094 {
2095 pp = eptr;
2096 for (i = min; i < max; i++)
2097 {
2098 if (eptr > md->end_subject - length) break;
2099 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2100 else if (oclength == 0) break;
2101 else
2102 {
2103 if (memcmp(eptr, occhars, oclength) != 0) break;
2104 eptr += oclength;
2105 }
2106 }
6bf342e1
PH
2107
2108 if (possessive) continue;
8ac170f3
PH
2109 while (eptr >= pp)
2110 {
2111 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2112 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2113 eptr -= length;
2114 }
2115 RRETURN(MATCH_NOMATCH);
2116 }
2117 /* Control never gets here */
2118 }
2119
2120 /* If the length of a UTF-8 character is 1, we fall through here, and
2121 obey the code as for non-UTF-8 characters below, though in this case the
2122 value of fc will always be < 128. */
2123 }
2124 else
2125#endif /* SUPPORT_UTF8 */
2126
2127 /* When not in UTF-8 mode, load a single-byte character. */
2128 {
2129 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2130 fc = *ecode++;
2131 }
2132
2133 /* The value of fc at this point is always less than 256, though we may or
2134 may not be in UTF-8 mode. The code is duplicated for the caseless and
2135 caseful cases, for speed, since matching characters is likely to be quite
2136 common. First, ensure the minimum number of matches are present. If min =
2137 max, continue at the same level without recursing. Otherwise, if
2138 minimizing, keep trying the rest of the expression and advancing one
2139 matching character if failing, up to the maximum. Alternatively, if
2140 maximizing, find the maximum number of characters and work backwards. */
2141
2142 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2143 max, eptr));
2144
2145 if ((ims & PCRE_CASELESS) != 0)
2146 {
2147 fc = md->lcc[fc];
2148 for (i = 1; i <= min; i++)
2149 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2150 if (min == max) continue;
2151 if (minimize)
2152 {
2153 for (fi = min;; fi++)
2154 {
2155 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2156 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2157 if (fi >= max || eptr >= md->end_subject ||
2158 fc != md->lcc[*eptr++])
2159 RRETURN(MATCH_NOMATCH);
2160 }
2161 /* Control never gets here */
2162 }
6bf342e1 2163 else /* Maximize */
8ac170f3
PH
2164 {
2165 pp = eptr;
2166 for (i = min; i < max; i++)
2167 {
2168 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2169 eptr++;
2170 }
6bf342e1 2171 if (possessive) continue;
8ac170f3
PH
2172 while (eptr >= pp)
2173 {
2174 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2175 eptr--;
2176 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2177 }
2178 RRETURN(MATCH_NOMATCH);
2179 }
2180 /* Control never gets here */
2181 }
2182
2183 /* Caseful comparisons (single-byte characters only; multibyte ones were handled above) */
2184
2185 else
2186 {
2187 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2188 if (min == max) continue;
2189 if (minimize)
2190 {
2191 for (fi = min;; fi++)
2192 {
2193 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2195 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2196 RRETURN(MATCH_NOMATCH);
2197 }
2198 /* Control never gets here */
2199 }
6bf342e1 2200 else /* Maximize */
8ac170f3
PH
2201 {
2202 pp = eptr;
2203 for (i = min; i < max; i++)
2204 {
2205 if (eptr >= md->end_subject || fc != *eptr) break;
2206 eptr++;
2207 }
6bf342e1 2208 if (possessive) continue;
8ac170f3
PH
2209 while (eptr >= pp)
2210 {
2211 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2212 eptr--;
2213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2214 }
2215 RRETURN(MATCH_NOMATCH);
2216 }
2217 }
2218 /* Control never gets here */
2219
2220 /* Match a negated single character. The pattern character is always one
2221 byte, but the subject character being checked can be multibyte. */
2222
2223 case OP_NOT:
2224 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2225 ecode++;
2226 GETCHARINCTEST(c, eptr);
2227 if ((ims & PCRE_CASELESS) != 0)
2228 {
2229#ifdef SUPPORT_UTF8
2230 if (c < 256)
2231#endif
2232 c = md->lcc[c];
2233 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2234 }
2235 else
2236 {
2237 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2238 }
2239 break;
2240
2241 /* Match a negated single one-byte character repeatedly. This is almost a
2242 repeat of the code for a repeated single character, but I haven't found a
2243 nice way of commoning these up that doesn't require a test of the
2244 positive/negative option for each character match. Maybe that wouldn't add
2245 very much to the time taken, but character matching *is* what this is all
2246 about... */
2247
2248 case OP_NOTEXACT:
2249 min = max = GET2(ecode, 1);
2250 ecode += 3;
2251 goto REPEATNOTCHAR;
2252
2253 case OP_NOTUPTO:
2254 case OP_NOTMINUPTO:
2255 min = 0;
2256 max = GET2(ecode, 1);
2257 minimize = *ecode == OP_NOTMINUPTO;
2258 ecode += 3;
2259 goto REPEATNOTCHAR;
2260
6bf342e1
PH
2261 case OP_NOTPOSSTAR:
2262 possessive = TRUE;
2263 min = 0;
2264 max = INT_MAX;
2265 ecode++;
2266 goto REPEATNOTCHAR;
2267
2268 case OP_NOTPOSPLUS:
2269 possessive = TRUE;
2270 min = 1;
2271 max = INT_MAX;
2272 ecode++;
2273 goto REPEATNOTCHAR;
2274
2275 case OP_NOTPOSQUERY:
2276 possessive = TRUE;
2277 min = 0;
2278 max = 1;
2279 ecode++;
2280 goto REPEATNOTCHAR;
2281
2282 case OP_NOTPOSUPTO:
2283 possessive = TRUE;
2284 min = 0;
2285 max = GET2(ecode, 1);
2286 ecode += 3;
2287 goto REPEATNOTCHAR;
2288
8ac170f3
PH
2289 case OP_NOTSTAR:
2290 case OP_NOTMINSTAR:
2291 case OP_NOTPLUS:
2292 case OP_NOTMINPLUS:
2293 case OP_NOTQUERY:
2294 case OP_NOTMINQUERY:
2295 c = *ecode++ - OP_NOTSTAR;
2296 minimize = (c & 1) != 0;
2297 min = rep_min[c]; /* Pick up values from tables; */
2298 max = rep_max[c]; /* zero for max => infinity */
2299 if (max == 0) max = INT_MAX;
2300
2301 /* Common code for all repeated single-byte matches. We can give up quickly
2302 if there are fewer than the minimum number of bytes left in the
2303 subject. */
2304
2305 REPEATNOTCHAR:
2306 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2307 fc = *ecode++;
2308
2309 /* The code is duplicated for the caseless and caseful cases, for speed,
2310 since matching characters is likely to be quite common. First, ensure the
2311 minimum number of matches are present. If min = max, continue at the same
2312 level without recursing. Otherwise, if minimizing, keep trying the rest of
2313 the expression and advancing one matching character if failing, up to the
2314 maximum. Alternatively, if maximizing, find the maximum number of
2315 characters and work backwards. */
2316
2317 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2318 max, eptr));
2319
2320 if ((ims & PCRE_CASELESS) != 0)
2321 {
2322 fc = md->lcc[fc];
2323
2324#ifdef SUPPORT_UTF8
2325 /* UTF-8 mode */
2326 if (utf8)
2327 {
6bf342e1 2328 register unsigned int d;
8ac170f3
PH
2329 for (i = 1; i <= min; i++)
2330 {
2331 GETCHARINC(d, eptr);
2332 if (d < 256) d = md->lcc[d];
2333 if (fc == d) RRETURN(MATCH_NOMATCH);
2334 }
2335 }
2336 else
2337#endif
2338
2339 /* Not UTF-8 mode */
2340 {
2341 for (i = 1; i <= min; i++)
2342 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2343 }
2344
2345 if (min == max) continue;
2346
2347 if (minimize)
2348 {
2349#ifdef SUPPORT_UTF8
2350 /* UTF-8 mode */
2351 if (utf8)
2352 {
6bf342e1 2353 register unsigned int d;
8ac170f3
PH
2354 for (fi = min;; fi++)
2355 {
2356 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2357 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2358 GETCHARINC(d, eptr);
2359 if (d < 256) d = md->lcc[d];
2360 if (fi >= max || eptr >= md->end_subject || fc == d)
2361 RRETURN(MATCH_NOMATCH);
2362 }
2363 }
2364 else
2365#endif
2366 /* Not UTF-8 mode */
2367 {
2368 for (fi = min;; fi++)
2369 {
2370 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2371 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2372 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2373 RRETURN(MATCH_NOMATCH);
2374 }
2375 }
2376 /* Control never gets here */
2377 }
2378
2379 /* Maximize case */
2380
2381 else
2382 {
2383 pp = eptr;
2384
2385#ifdef SUPPORT_UTF8
2386 /* UTF-8 mode */
2387 if (utf8)
2388 {
6bf342e1 2389 register unsigned int d;
8ac170f3
PH
2390 for (i = min; i < max; i++)
2391 {
2392 int len = 1;
2393 if (eptr >= md->end_subject) break;
2394 GETCHARLEN(d, eptr, len);
2395 if (d < 256) d = md->lcc[d];
2396 if (fc == d) break;
2397 eptr += len;
2398 }
6bf342e1
PH
2399 if (possessive) continue;
2400 for(;;)
8ac170f3
PH
2401 {
2402 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2403 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2404 if (eptr-- == pp) break; /* Stop if tried at original pos */
2405 BACKCHAR(eptr);
2406 }
2407 }
2408 else
2409#endif
2410 /* Not UTF-8 mode */
2411 {
2412 for (i = min; i < max; i++)
2413 {
2414 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2415 eptr++;
2416 }
6bf342e1 2417 if (possessive) continue;
8ac170f3
PH
2418 while (eptr >= pp)
2419 {
2420 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2421 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2422 eptr--;
2423 }
2424 }
2425
2426 RRETURN(MATCH_NOMATCH);
2427 }
2428 /* Control never gets here */
2429 }
2430
2431 /* Caseful comparisons */
2432
2433 else
2434 {
2435#ifdef SUPPORT_UTF8
2436 /* UTF-8 mode */
2437 if (utf8)
2438 {
6bf342e1 2439 register unsigned int d;
8ac170f3
PH
2440 for (i = 1; i <= min; i++)
2441 {
2442 GETCHARINC(d, eptr);
2443 if (fc == d) RRETURN(MATCH_NOMATCH);
2444 }
2445 }
2446 else
2447#endif
2448 /* Not UTF-8 mode */
2449 {
2450 for (i = 1; i <= min; i++)
2451 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2452 }
2453
2454 if (min == max) continue;
2455
2456 if (minimize)
2457 {
2458#ifdef SUPPORT_UTF8
2459 /* UTF-8 mode */
2460 if (utf8)
2461 {
6bf342e1 2462 register unsigned int d;
8ac170f3
PH
2463 for (fi = min;; fi++)
2464 {
2465 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2466 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2467 GETCHARINC(d, eptr);
2468 if (fi >= max || eptr >= md->end_subject || fc == d)
2469 RRETURN(MATCH_NOMATCH);
2470 }
2471 }
2472 else
2473#endif
2474 /* Not UTF-8 mode */
2475 {
2476 for (fi = min;; fi++)
2477 {
2478 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2479 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2480 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2481 RRETURN(MATCH_NOMATCH);
2482 }
2483 }
2484 /* Control never gets here */
2485 }
2486
2487 /* Maximize case */
2488
2489 else
2490 {
2491 pp = eptr;
2492
2493#ifdef SUPPORT_UTF8
2494 /* UTF-8 mode */
2495 if (utf8)
2496 {
6bf342e1 2497 register unsigned int d;
8ac170f3
PH
2498 for (i = min; i < max; i++)
2499 {
2500 int len = 1;
2501 if (eptr >= md->end_subject) break;
2502 GETCHARLEN(d, eptr, len);
2503 if (fc == d) break;
2504 eptr += len;
2505 }
6bf342e1 2506 if (possessive) continue;
8ac170f3
PH
2507 for(;;)
2508 {
2509 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2511 if (eptr-- == pp) break; /* Stop if tried at original pos */
2512 BACKCHAR(eptr);
2513 }
2514 }
2515 else
2516#endif
2517 /* Not UTF-8 mode */
2518 {
2519 for (i = min; i < max; i++)
2520 {
2521 if (eptr >= md->end_subject || fc == *eptr) break;
2522 eptr++;
2523 }
6bf342e1 2524 if (possessive) continue;
8ac170f3
PH
2525 while (eptr >= pp)
2526 {
2527 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2528 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2529 eptr--;
2530 }
2531 }
2532
2533 RRETURN(MATCH_NOMATCH);
2534 }
2535 }
2536 /* Control never gets here */
2537
2538 /* Match a single character type repeatedly; several different opcodes
2539 share code. This is very similar to the code for single characters, but we
2540 repeat it in the interests of efficiency. */
2541
2542 case OP_TYPEEXACT:
2543 min = max = GET2(ecode, 1);
2544 minimize = TRUE;
2545 ecode += 3;
2546 goto REPEATTYPE;
2547
2548 case OP_TYPEUPTO:
2549 case OP_TYPEMINUPTO:
2550 min = 0;
2551 max = GET2(ecode, 1);
2552 minimize = *ecode == OP_TYPEMINUPTO;
2553 ecode += 3;
2554 goto REPEATTYPE;
2555
6bf342e1
PH
2556 case OP_TYPEPOSSTAR:
2557 possessive = TRUE;
2558 min = 0;
2559 max = INT_MAX;
2560 ecode++;
2561 goto REPEATTYPE;
2562
2563 case OP_TYPEPOSPLUS:
2564 possessive = TRUE;
2565 min = 1;
2566 max = INT_MAX;
2567 ecode++;
2568 goto REPEATTYPE;
2569
2570 case OP_TYPEPOSQUERY:
2571 possessive = TRUE;
2572 min = 0;
2573 max = 1;
2574 ecode++;
2575 goto REPEATTYPE;
2576
2577 case OP_TYPEPOSUPTO:
2578 possessive = TRUE;
2579 min = 0;
2580 max = GET2(ecode, 1);
2581 ecode += 3;
2582 goto REPEATTYPE;
2583
8ac170f3
PH
2584 case OP_TYPESTAR:
2585 case OP_TYPEMINSTAR:
2586 case OP_TYPEPLUS:
2587 case OP_TYPEMINPLUS:
2588 case OP_TYPEQUERY:
2589 case OP_TYPEMINQUERY:
2590 c = *ecode++ - OP_TYPESTAR;
2591 minimize = (c & 1) != 0;
2592 min = rep_min[c]; /* Pick up values from tables; */
2593 max = rep_max[c]; /* zero for max => infinity */
2594 if (max == 0) max = INT_MAX;
2595
2596 /* Common code for all repeated single character type matches. Note that
2597 in UTF-8 mode, '.' matches a character of any length, but for the other
2598 character types, the valid characters are all one-byte long. */
2599
2600 REPEATTYPE:
2601 ctype = *ecode++; /* Code for the character type */
2602
2603#ifdef SUPPORT_UCP
2604 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2605 {
2606 prop_fail_result = ctype == OP_NOTPROP;
2607 prop_type = *ecode++;
aa41d2de 2608 prop_value = *ecode++;
8ac170f3
PH
2609 }
2610 else prop_type = -1;
2611#endif
2612
2613 /* First, ensure the minimum number of matches are present. Use inline
2614 code for maximizing the speed, and do the type test once at the start
2615 (i.e. keep it out of the loop). Also we can test that there are at least
2616 the minimum number of bytes before we start. This isn't as effective in
2617 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2618 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2619 and single-bytes. */
2620
2621 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2622 if (min > 0)
2623 {
2624#ifdef SUPPORT_UCP
aa41d2de 2625 if (prop_type >= 0)
8ac170f3 2626 {
aa41d2de 2627 switch(prop_type)
8ac170f3 2628 {
aa41d2de
PH
2629 case PT_ANY:
2630 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2631 for (i = 1; i <= min; i++)
2632 {
2633 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2634 GETCHARINC(c, eptr);
2635 }
2636 break;
2637
2638 case PT_LAMP:
2639 for (i = 1; i <= min; i++)
2640 {
2641 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2642 GETCHARINC(c, eptr);
2643 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2644 if ((prop_chartype == ucp_Lu ||
2645 prop_chartype == ucp_Ll ||
2646 prop_chartype == ucp_Lt) == prop_fail_result)
2647 RRETURN(MATCH_NOMATCH);
2648 }
2649 break;
2650
2651 case PT_GC:
2652 for (i = 1; i <= min; i++)
2653 {
2654 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2655 GETCHARINC(c, eptr);
2656 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2657 if ((prop_category == prop_value) == prop_fail_result)
2658 RRETURN(MATCH_NOMATCH);
2659 }
2660 break;
2661
2662 case PT_PC:
2663 for (i = 1; i <= min; i++)
2664 {
2665 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2666 GETCHARINC(c, eptr);
2667 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2668 if ((prop_chartype == prop_value) == prop_fail_result)
2669 RRETURN(MATCH_NOMATCH);
2670 }
2671 break;
2672
2673 case PT_SC:
2674 for (i = 1; i <= min; i++)
2675 {
2676 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2677 GETCHARINC(c, eptr);
2678 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2679 if ((prop_script == prop_value) == prop_fail_result)
2680 RRETURN(MATCH_NOMATCH);
2681 }
2682 break;
2683
2684 default:
2685 RRETURN(PCRE_ERROR_INTERNAL);
8ac170f3
PH
2686 }
2687 }
2688
2689 /* Match extended Unicode sequences. We will get here only if the
2690 support is in the binary; otherwise a compile-time error occurs. */
2691
2692 else if (ctype == OP_EXTUNI)
2693 {
2694 for (i = 1; i <= min; i++)
2695 {
2696 GETCHARINCTEST(c, eptr);
aa41d2de 2697 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
8ac170f3
PH
2698 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2699 while (eptr < md->end_subject)
2700 {
2701 int len = 1;
2702 if (!utf8) c = *eptr; else
2703 {
2704 GETCHARLEN(c, eptr, len);
2705 }
aa41d2de 2706 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
8ac170f3
PH
2707 if (prop_category != ucp_M) break;
2708 eptr += len;
2709 }
2710 }
2711 }
2712
2713 else
2714#endif /* SUPPORT_UCP */
2715
2716/* Handle all other cases when the coding is UTF-8 */
2717
2718#ifdef SUPPORT_UTF8
2719 if (utf8) switch(ctype)
2720 {
2721 case OP_ANY:
2722 for (i = 1; i <= min; i++)
2723 {
2724 if (eptr >= md->end_subject ||
6bf342e1 2725 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
8ac170f3 2726 RRETURN(MATCH_NOMATCH);
aa41d2de 2727 eptr++;
8ac170f3
PH
2728 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2729 }
2730 break;
2731
2732 case OP_ANYBYTE:
2733 eptr += min;
2734 break;
2735
6bf342e1
PH
2736 case OP_ANYNL:
2737 for (i = 1; i <= min; i++)
2738 {
2739 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2740 GETCHARINC(c, eptr);
2741 switch(c)
2742 {
2743 default: RRETURN(MATCH_NOMATCH);
2744 case 0x000d:
2745 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2746 break;
2747 case 0x000a:
2748 case 0x000b:
2749 case 0x000c:
2750 case 0x0085:
2751 case 0x2028:
2752 case 0x2029:
2753 break;
2754 }
2755 }
2756 break;
2757
8ac170f3
PH
2758 case OP_NOT_DIGIT:
2759 for (i = 1; i <= min; i++)
2760 {
2761 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2762 GETCHARINC(c, eptr);
2763 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2764 RRETURN(MATCH_NOMATCH);
2765 }
2766 break;
2767
2768 case OP_DIGIT:
2769 for (i = 1; i <= min; i++)
2770 {
2771 if (eptr >= md->end_subject ||
2772 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2773 RRETURN(MATCH_NOMATCH);
2774 /* No need to skip more bytes - we know it's a 1-byte character */
2775 }
2776 break;
2777
2778 case OP_NOT_WHITESPACE:
2779 for (i = 1; i <= min; i++)
2780 {
2781 if (eptr >= md->end_subject ||
2782 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2783 RRETURN(MATCH_NOMATCH);
2784 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2785 }
2786 break;
2787
2788 case OP_WHITESPACE:
2789 for (i = 1; i <= min; i++)
2790 {
2791 if (eptr >= md->end_subject ||
2792 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2793 RRETURN(MATCH_NOMATCH);
2794 /* No need to skip more bytes - we know it's a 1-byte character */
2795 }
2796 break;
2797
2798 case OP_NOT_WORDCHAR:
2799 for (i = 1; i <= min; i++)
2800 {
2801 if (eptr >= md->end_subject ||
2802 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2803 RRETURN(MATCH_NOMATCH);
2804 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2805 }
2806 break;
2807
2808 case OP_WORDCHAR:
2809 for (i = 1; i <= min; i++)
2810 {
2811 if (eptr >= md->end_subject ||
2812 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2813 RRETURN(MATCH_NOMATCH);
2814 /* No need to skip more bytes - we know it's a 1-byte character */
2815 }
2816 break;
2817
2818 default:
2819 RRETURN(PCRE_ERROR_INTERNAL);
2820 } /* End switch(ctype) */
2821
2822 else
2823#endif /* SUPPORT_UTF8 */
2824
2825 /* Code for the non-UTF-8 case for minimum matching of operators other
6bf342e1
PH
2826 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2827 number of bytes present, as this was tested above. */
8ac170f3
PH
2828
2829 switch(ctype)
2830 {
2831 case OP_ANY:
2832 if ((ims & PCRE_DOTALL) == 0)
2833 {
2834 for (i = 1; i <= min; i++)
aa41d2de 2835 {
6bf342e1 2836 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
aa41d2de
PH
2837 eptr++;
2838 }
8ac170f3
PH
2839 }
2840 else eptr += min;
2841 break;
2842
2843 case OP_ANYBYTE:
2844 eptr += min;
2845 break;
2846
6bf342e1
PH
2847 /* Because of the CRLF case, we can't assume the minimum number of
2848 bytes are present in this case. */
2849
2850 case OP_ANYNL:
2851 for (i = 1; i <= min; i++)
2852 {
2853 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2854 switch(*eptr++)
2855 {
2856 default: RRETURN(MATCH_NOMATCH);
2857 case 0x000d:
2858 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2859 break;
2860 case 0x000a:
2861 case 0x000b:
2862 case 0x000c:
2863 case 0x0085:
2864 break;
2865 }
2866 }
2867 break;
2868
8ac170f3
PH
2869 case OP_NOT_DIGIT:
2870 for (i = 1; i <= min; i++)
2871 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2872 break;
2873
2874 case OP_DIGIT:
2875 for (i = 1; i <= min; i++)
2876 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2877 break;
2878
2879 case OP_NOT_WHITESPACE:
2880 for (i = 1; i <= min; i++)
2881 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2882 break;
2883
2884 case OP_WHITESPACE:
2885 for (i = 1; i <= min; i++)
2886 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2887 break;
2888
2889 case OP_NOT_WORDCHAR:
2890 for (i = 1; i <= min; i++)
2891 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2892 RRETURN(MATCH_NOMATCH);
2893 break;
2894
2895 case OP_WORDCHAR:
2896 for (i = 1; i <= min; i++)
2897 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2898 RRETURN(MATCH_NOMATCH);
2899 break;
2900
2901 default:
2902 RRETURN(PCRE_ERROR_INTERNAL);
2903 }
2904 }
2905
2906 /* If min = max, continue at the same level without recursing */
2907
2908 if (min == max) continue;
2909
2910 /* If minimizing, we have to test the rest of the pattern before each
2911 subsequent match. Again, separate the UTF-8 case for speed, and also
2912 separate the UCP cases. */
2913
2914 if (minimize)
2915 {
2916#ifdef SUPPORT_UCP
aa41d2de 2917 if (prop_type >= 0)
8ac170f3 2918 {
aa41d2de 2919 switch(prop_type)
8ac170f3 2920 {
aa41d2de
PH
2921 case PT_ANY:
2922 for (fi = min;; fi++)
2923 {
2924 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2925 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2926 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2927 GETCHARINC(c, eptr);
2928 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2929 }
6bf342e1 2930 /* Control never gets here */
aa41d2de
PH
2931
2932 case PT_LAMP:
2933 for (fi = min;; fi++)
2934 {
2935 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2937 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2938 GETCHARINC(c, eptr);
2939 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2940 if ((prop_chartype == ucp_Lu ||
2941 prop_chartype == ucp_Ll ||
2942 prop_chartype == ucp_Lt) == prop_fail_result)
2943 RRETURN(MATCH_NOMATCH);
2944 }
6bf342e1 2945 /* Control never gets here */
aa41d2de
PH
2946
2947 case PT_GC:
2948 for (fi = min;; fi++)
2949 {
2950 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2952 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2953 GETCHARINC(c, eptr);
2954 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2955 if ((prop_category == prop_value) == prop_fail_result)
2956 RRETURN(MATCH_NOMATCH);
2957 }
6bf342e1 2958 /* Control never gets here */
aa41d2de
PH
2959
2960 case PT_PC:
2961 for (fi = min;; fi++)
2962 {
2963 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2965 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2966 GETCHARINC(c, eptr);
2967 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2968 if ((prop_chartype == prop_value) == prop_fail_result)
2969 RRETURN(MATCH_NOMATCH);
2970 }
6bf342e1 2971 /* Control never gets here */
aa41d2de
PH
2972
2973 case PT_SC:
2974 for (fi = min;; fi++)
2975 {
2976 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2978 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2979 GETCHARINC(c, eptr);
2980 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2981 if ((prop_script == prop_value) == prop_fail_result)
2982 RRETURN(MATCH_NOMATCH);
2983 }
6bf342e1 2984 /* Control never gets here */
aa41d2de
PH
2985
2986 default:
2987 RRETURN(PCRE_ERROR_INTERNAL);
8ac170f3
PH
2988 }
2989 }
2990
2991 /* Match extended Unicode sequences. We will get here only if the
2992 support is in the binary; otherwise a compile-time error occurs. */
2993
2994 else if (ctype == OP_EXTUNI)
2995 {
2996 for (fi = min;; fi++)
2997 {
2998 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2999 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3000 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3001 GETCHARINCTEST(c, eptr);
aa41d2de 3002 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
8ac170f3
PH
3003 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3004 while (eptr < md->end_subject)
3005 {
3006 int len = 1;
3007 if (!utf8) c = *eptr; else
3008 {
3009 GETCHARLEN(c, eptr, len);
3010 }
aa41d2de 3011 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
8ac170f3
PH
3012 if (prop_category != ucp_M) break;
3013 eptr += len;
3014 }
3015 }
3016 }
3017
3018 else
3019#endif /* SUPPORT_UCP */
3020
3021#ifdef SUPPORT_UTF8
3022 /* UTF-8 mode */
3023 if (utf8)
3024 {
3025 for (fi = min;; fi++)
3026 {
3027 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
aa41d2de
PH
3029 if (fi >= max || eptr >= md->end_subject ||
3030 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
6bf342e1 3031 IS_NEWLINE(eptr)))
aa41d2de 3032 RRETURN(MATCH_NOMATCH);
8ac170f3
PH
3033
3034 GETCHARINC(c, eptr);
3035 switch(ctype)
3036 {
aa41d2de 3037 case OP_ANY: /* This is the DOTALL case */
8ac170f3
PH
3038 break;
3039
3040 case OP_ANYBYTE:
3041 break;
3042
6bf342e1
PH
3043 case OP_ANYNL:
3044 switch(c)
3045 {
3046 default: RRETURN(MATCH_NOMATCH);
3047 case 0x000d:
3048 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3049 break;
3050 case 0x000a:
3051 case 0x000b:
3052 case 0x000c:
3053 case 0x0085:
3054 case 0x2028:
3055 case 0x2029:
3056 break;
3057 }
3058 break;
3059
8ac170f3
PH
3060 case OP_NOT_DIGIT:
3061 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3062 RRETURN(MATCH_NOMATCH);
3063 break;
3064
3065 case OP_DIGIT:
3066 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3067 RRETURN(MATCH_NOMATCH);
3068 break;
3069
3070 case OP_NOT_WHITESPACE:
3071 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3072 RRETURN(MATCH_NOMATCH);
3073 break;
3074
3075 case OP_WHITESPACE:
3076 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3077 RRETURN(MATCH_NOMATCH);
3078 break;
3079
3080 case OP_NOT_WORDCHAR:
3081 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3082 RRETURN(MATCH_NOMATCH);
3083 break;
3084
3085 case OP_WORDCHAR:
3086 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3087 RRETURN(MATCH_NOMATCH);
3088 break;
3089
3090 default:
3091 RRETURN(PCRE_ERROR_INTERNAL);
3092 }
3093 }
3094 }
3095 else
3096#endif
3097 /* Not UTF-8 mode */
3098 {
3099 for (fi = min;; fi++)
3100 {
3101 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3102 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
aa41d2de 3103 if (fi >= max || eptr >= md->end_subject ||
6bf342e1 3104 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
aa41d2de
PH
3105 RRETURN(MATCH_NOMATCH);
3106
8ac170f3
PH
3107 c = *eptr++;
3108 switch(ctype)
3109 {
aa41d2de 3110 case OP_ANY: /* This is the DOTALL case */
8ac170f3
PH
3111 break;
3112
3113 case OP_ANYBYTE:
3114 break;
3115
6bf342e1
PH
3116 case OP_ANYNL:
3117 switch(c)
3118 {
3119 default: RRETURN(MATCH_NOMATCH);
3120 case 0x000d:
3121 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3122 break;
3123 case 0x000a:
3124 case 0x000b:
3125 case 0x000c:
3126 case 0x0085:
3127 break;
3128 }
3129 break;
3130
8ac170f3
PH
3131 case OP_NOT_DIGIT:
3132 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3133 break;
3134
3135 case OP_DIGIT:
3136 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3137 break;
3138
3139 case OP_NOT_WHITESPACE:
3140 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3141 break;
3142
3143 case OP_WHITESPACE:
3144 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3145 break;
3146
3147 case OP_NOT_WORDCHAR:
3148 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3149 break;
3150
3151 case OP_WORDCHAR:
3152 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3153 break;
3154
3155 default:
3156 RRETURN(PCRE_ERROR_INTERNAL);
3157 }
3158 }
3159 }
3160 /* Control never gets here */
3161 }
3162
6bf342e1 3163 /* If maximizing, it is worth using inline code for speed, doing the type
8ac170f3
PH
3164 test once at the start (i.e. keep it out of the loop). Again, keep the
3165 UTF-8 and UCP stuff separate. */
3166
3167 else
3168 {
3169 pp = eptr; /* Remember where we started */
3170
3171#ifdef SUPPORT_UCP
aa41d2de 3172 if (prop_type >= 0)
8ac170f3 3173 {
aa41d2de 3174 switch(prop_type)
8ac170f3 3175 {
aa41d2de
PH
3176 case PT_ANY:
3177 for (i = min; i < max; i++)
3178 {
3179 int len = 1;
3180 if (eptr >= md->end_subject) break;
3181 GETCHARLEN(c, eptr, len);
3182 if (prop_fail_result) break;
3183 eptr+= len;
3184 }
3185 break;
3186
3187 case PT_LAMP:
3188 for (i = min; i < max; i++)
3189 {
3190 int len = 1;
3191 if (eptr >= md->end_subject) break;
3192 GETCHARLEN(c, eptr, len);
3193 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3194 if ((prop_chartype == ucp_Lu ||
3195 prop_chartype == ucp_Ll ||
3196 prop_chartype == ucp_Lt) == prop_fail_result)
3197 break;
3198 eptr+= len;
3199 }
3200 break;
3201
3202 case PT_GC:
3203 for (i = min; i < max; i++)
3204 {
3205 int len = 1;
3206 if (eptr >= md->end_subject) break;
3207 GETCHARLEN(c, eptr, len);
3208 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3209 if ((prop_category == prop_value) == prop_fail_result)
3210 break;
3211 eptr+= len;
3212 }
3213 break;
3214
3215 case PT_PC:
3216 for (i = min; i < max; i++)
3217 {
3218 int len = 1;
3219 if (eptr >= md->end_subject) break;
3220 GETCHARLEN(c, eptr, len);
3221 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3222 if ((prop_chartype == prop_value) == prop_fail_result)
3223 break;
3224 eptr+= len;
3225 }
3226 break;
3227
3228 case PT_SC:
3229 for (i = min; i < max; i++)
3230 {
3231 int len = 1;
3232 if (eptr >= md->end_subject) break;
3233 GETCHARLEN(c, eptr, len);
3234 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3235 if ((prop_script == prop_value) == prop_fail_result)
3236 break;
3237 eptr+= len;
3238 }
3239 break;
8ac170f3
PH
3240 }
3241
3242 /* eptr is now past the end of the maximum run */
3243
6bf342e1 3244 if (possessive) continue;
8ac170f3
PH
3245 for(;;)
3246 {
3247 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3248 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3249 if (eptr-- == pp) break; /* Stop if tried at original pos */
3250 BACKCHAR(eptr);
3251 }
3252 }
3253
3254 /* Match extended Unicode sequences. We will get here only if the
3255 support is in the binary; otherwise a compile-time error occurs. */
3256
3257 else if (ctype == OP_EXTUNI)
3258 {
3259 for (i = min; i < max; i++)
3260 {
3261 if (eptr >= md->end_subject) break;
3262 GETCHARINCTEST(c, eptr);
aa41d2de 3263 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
8ac170f3
PH
3264 if (prop_category == ucp_M) break;
3265 while (eptr < md->end_subject)
3266 {
3267 int len = 1;
3268 if (!utf8) c = *eptr; else
3269 {
3270 GETCHARLEN(c, eptr, len);
3271 }
aa41d2de 3272 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
8ac170f3
PH
3273 if (prop_category != ucp_M) break;
3274 eptr += len;
3275 }
3276 }
3277
3278 /* eptr is now past the end of the maximum run */
3279
6bf342e1 3280 if (possessive) continue;
8ac170f3
PH
3281 for(;;)
3282 {
3283 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3284 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3285 if (eptr-- == pp) break; /* Stop if tried at original pos */
3286 for (;;) /* Move back over one extended */
3287 {
3288 int len = 1;
3289 BACKCHAR(eptr);
3290 if (!utf8) c = *eptr; else
3291 {
3292 GETCHARLEN(c, eptr, len);
3293 }
aa41d2de 3294 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
8ac170f3
PH
3295 if (prop_category != ucp_M) break;
3296 eptr--;
3297 }
3298 }
3299 }
3300
3301 else
3302#endif /* SUPPORT_UCP */
3303
3304#ifdef SUPPORT_UTF8
3305 /* UTF-8 mode */
3306
3307 if (utf8)
3308 {
3309 switch(ctype)
3310 {
3311 case OP_ANY:
3312
aa41d2de
PH
3313 /* Special code is required for UTF8, but when the maximum is
3314 unlimited we don't need it, so we repeat the non-UTF8 code. This is
3315 probably worth it, because .* is quite a common idiom. */
8ac170f3
PH
3316
3317 if (max < INT_MAX)
3318 {
3319 if ((ims & PCRE_DOTALL) == 0)
3320 {
3321 for (i = min; i < max; i++)
3322 {
6bf342e1 3323 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
8ac170f3
PH
3324 eptr++;
3325 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3326 }
3327 }
3328 else
3329 {
3330 for (i = min; i < max; i++)
3331 {
aa41d2de 3332 if (eptr >= md->end_subject) break;
8ac170f3
PH
3333 eptr++;
3334 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3335 }
3336 }
3337 }
3338
3339 /* Handle unlimited UTF-8 repeat */
3340
3341 else
3342 {
3343 if ((ims & PCRE_DOTALL) == 0)
3344 {
3345 for (i = min; i < max; i++)
3346 {
6bf342e1 3347 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
8ac170f3
PH
3348 eptr++;
3349 }
3350 break;
3351 }
3352 else
3353 {
3354 c = max - min;
6bf342e1
PH
3355 if (c > (unsigned int)(md->end_subject - eptr))
3356 c = md->end_subject - eptr;
8ac170f3
PH
3357 eptr += c;
3358 }
3359 }
3360 break;
3361
3362 /* The byte case is the same as non-UTF8 */
3363
3364 case OP_ANYBYTE:
3365 c = max - min;
6bf342e1
PH
3366 if (c > (unsigned int)(md->end_subject - eptr))
3367 c = md->end_subject - eptr;
8ac170f3
PH
3368 eptr += c;
3369 break;
3370
6bf342e1
PH
3371 case OP_ANYNL:
3372 for (i = min; i < max; i++)
3373 {
3374 int len = 1;
3375 if (eptr >= md->end_subject) break;
3376 GETCHARLEN(c, eptr, len);
3377 if (c == 0x000d)
3378 {
3379 if (++eptr >= md->end_subject) break;
3380 if (*eptr == 0x000a) eptr++;
3381 }
3382 else
3383 {
3384 if (c != 0x000a && c != 0x000b && c != 0x000c &&
3385 c != 0x0085 && c != 0x2028 && c != 0x2029)
3386 break;
3387 eptr += len;
3388 }
3389 }
3390 break;
3391
8ac170f3
PH
3392 case OP_NOT_DIGIT:
3393 for (i = min; i < max; i++)
3394 {
3395 int len = 1;
3396 if (eptr >= md->end_subject) break;
3397 GETCHARLEN(c, eptr, len);
3398 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3399 eptr+= len;
3400 }
3401 break;
3402
3403 case OP_DIGIT:
3404 for (i = min; i < max; i++)
3405 {
3406 int len = 1;
3407 if (eptr >= md->end_subject) break;
3408 GETCHARLEN(c, eptr, len);
3409 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3410 eptr+= len;
3411 }
3412 break;
3413
3414 case OP_NOT_WHITESPACE:
3415 for (i = min; i < max; i++)
3416 {
3417 int len = 1;
3418 if (eptr >= md->end_subject) break;
3419 GETCHARLEN(c, eptr, len);
3420 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3421 eptr+= len;
3422 }
3423 break;
3424
3425 case OP_WHITESPACE:
3426 for (i = min; i < max; i++)
3427 {
3428 int len = 1;
3429 if (eptr >= md->end_subject) break;
3430 GETCHARLEN(c, eptr, len);
3431 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3432 eptr+= len;
3433 }
3434 break;
3435
3436 case OP_NOT_WORDCHAR:
3437 for (i = min; i < max; i++)
3438 {
3439 int len = 1;
3440 if (eptr >= md->end_subject) break;
3441 GETCHARLEN(c, eptr, len);
3442 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3443 eptr+= len;
3444 }
3445 break;
3446
3447 case OP_WORDCHAR:
3448 for (i = min; i < max; i++)
3449 {
3450 int len = 1;
3451 if (eptr >= md->end_subject) break;
3452 GETCHARLEN(c, eptr, len);
3453 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3454 eptr+= len;
3455 }
3456 break;
3457
3458 default:
3459 RRETURN(PCRE_ERROR_INTERNAL);
3460 }
3461
3462 /* eptr is now past the end of the maximum run */
3463
6bf342e1 3464 if (possessive) continue;
8ac170f3
PH
3465 for(;;)
3466 {
3467 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3468 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3469 if (eptr-- == pp) break; /* Stop if tried at original pos */
3470 BACKCHAR(eptr);
3471 }
3472 }
3473 else
3474#endif
3475
3476 /* Not UTF-8 mode */
3477 {
3478 switch(ctype)
3479 {
3480 case OP_ANY:
3481 if ((ims & PCRE_DOTALL) == 0)
3482 {
3483 for (i = min; i < max; i++)
3484 {
6bf342e1 3485 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
8ac170f3
PH
3486 eptr++;
3487 }
3488 break;
3489 }
3490 /* For DOTALL case, fall through and treat as \C */
3491
3492 case OP_ANYBYTE:
3493 c = max - min;
6bf342e1
PH
3494 if (c > (unsigned int)(md->end_subject - eptr))
3495 c = md->end_subject - eptr;
8ac170f3
PH
3496 eptr += c;
3497 break;
3498
6bf342e1
PH
3499 case OP_ANYNL:
3500 for (i = min; i < max; i++)
3501 {
3502 if (eptr >= md->end_subject) break;
3503 c = *eptr;
3504 if (c == 0x000d)
3505 {
3506 if (++eptr >= md->end_subject) break;
3507 if (*eptr == 0x000a) eptr++;
3508 }
3509 else
3510 {
3511 if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3512 break;
3513 eptr++;
3514 }
3515 }
3516 break;
3517
8ac170f3
PH
3518 case OP_NOT_DIGIT:
3519 for (i = min; i < max; i++)
3520 {
3521 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3522 break;
3523 eptr++;
3524 }
3525 break;
3526
3527 case OP_DIGIT:
3528 for (i = min; i < max; i++)
3529 {
3530 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3531 break;
3532 eptr++;
3533 }
3534 break;
3535
3536 case OP_NOT_WHITESPACE:
3537 for (i = min; i < max; i++)
3538 {
3539 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3540 break;
3541 eptr++;
3542 }
3543 break;
3544
3545 case OP_WHITESPACE:
3546 for (i = min; i < max; i++)
3547 {
3548 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3549 break;
3550 eptr++;
3551 }
3552 break;
3553
3554 case OP_NOT_WORDCHAR:
3555 for (i = min; i < max; i++)
3556 {
3557 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3558 break;
3559 eptr++;
3560 }
3561 break;
3562
3563 case OP_WORDCHAR:
3564 for (i = min; i < max; i++)
3565 {
3566 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3567 break;
3568 eptr++;
3569 }
3570 break;
3571
3572 default:
3573 RRETURN(PCRE_ERROR_INTERNAL);
3574 }
3575
3576 /* eptr is now past the end of the maximum run */
3577
6bf342e1 3578 if (possessive) continue;
8ac170f3
PH
3579 while (eptr >= pp)
3580 {
3581 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3582 eptr--;
3583 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3584 }
3585 }
3586
3587 /* Get here if we can't make it match with any permitted repetitions */
3588
3589 RRETURN(MATCH_NOMATCH);
3590 }
3591 /* Control never gets here */
3592
6bf342e1
PH
3593 /* There's been some horrible disaster. Arrival here can only mean there is
3594 something seriously wrong in the code above or the OP_xxx definitions. */
8ac170f3
PH
3595
3596 default:
3597 DPRINTF(("Unknown opcode %d\n", *ecode));
6bf342e1 3598 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
8ac170f3
PH
3599 }
3600
3601 /* Do not stick any code in here without much thought; it is assumed
3602 that "continue" in the code above comes out to here to repeat the main
3603 loop. */
3604
3605 } /* End of main loop */
3606/* Control never reaches here */
3607}
3608
3609
3610/***************************************************************************
3611****************************************************************************
3612 RECURSION IN THE match() FUNCTION
3613
3614Undefine all the macros that were defined above to handle this. */
3615
#ifdef NO_RECURSE

/* Names that appear as arguments in the RMATCH() calls above, plus "flags". */

#undef eptr
#undef ecode
#undef offset_top
#undef ims
#undef eptrb
#undef flags

/* Pointer-like working names (presumably match()'s local pointers - confirm
against the corresponding #define list earlier in the file). */

#undef callpat
#undef charptr
#undef data
#undef next
#undef pp
#undef prev
#undef saved_eptr
#undef newptrb

/* Recursion bookkeeping */

#undef new_recursive

/* Flag-like working names */

#undef cur_is_word
#undef condition
#undef prev_is_word
#undef original_ims

/* Counters, saved offsets and other integer working names */

#undef ctype
#undef length
#undef max
#undef min
#undef number
#undef offset
#undef op
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef stacksave

#endif  /* NO_RECURSE */

/* fc and fi are macros whether or not NO_RECURSE is defined, so they are
removed unconditionally. */

#undef fc
#undef fi
3662/***************************************************************************
3663***************************************************************************/
3664
3665
3666
3667/*************************************************
3668* Execute a Regular Expression *
3669*************************************************/
3670
3671/* This function applies a compiled re to a subject string and picks out
3672portions of the string if it matches. Two elements in the vector are set for
3673each substring: the offsets to the start and end of the substring.
3674
3675Arguments:
3676 argument_re points to the compiled expression
3677 extra_data points to extra data or is NULL
3678 subject points to the subject string
3679 length length of subject string (may contain binary zeros)
3680 start_offset where to start in the subject string
3681 options option bits
3682 offsets points to a vector of ints to be filled in with offsets
3683 offsetcount the number of elements in the vector
3684
3685Returns: > 0 => success; value is the number of elements filled in
3686 = 0 => success, but offsets is not big enough
3687 -1 => failed to match
3688 < -1 => some kind of unexpected problem
3689*/
3690
aa41d2de 3691PCRE_DATA_SCOPE int
8ac170f3 3692pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
aa41d2de 3693 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
8ac170f3
PH
3694 int offsetcount)
3695{
3696int rc, resetcount, ocount;
3697int first_byte = -1;
3698int req_byte = -1;
3699int req_byte2 = -1;
aa41d2de
PH
3700int newline;
3701unsigned long int ims;
8ac170f3
PH
3702BOOL using_temporary_offsets = FALSE;
3703BOOL anchored;
3704BOOL startline;
3705BOOL firstline;
3706BOOL first_byte_caseless = FALSE;
3707BOOL req_byte_caseless = FALSE;
6bf342e1 3708BOOL utf8;
8ac170f3 3709match_data match_block;
aa41d2de 3710match_data *md = &match_block;
8ac170f3
PH
3711const uschar *tables;
3712const uschar *start_bits = NULL;
aa41d2de
PH
3713USPTR start_match = (USPTR)subject + start_offset;
3714USPTR end_subject;
3715USPTR req_byte_ptr = start_match - 1;
6bf342e1 3716eptrblock eptrchain[EPTR_WORK_SIZE];
8ac170f3
PH
3717
3718pcre_study_data internal_study;
3719const pcre_study_data *study;
3720
3721real_pcre internal_re;
3722const real_pcre *external_re = (const real_pcre *)argument_re;
3723const real_pcre *re = external_re;
3724
3725/* Plausibility checks */
3726
3727if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3728if (re == NULL || subject == NULL ||
3729 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3730if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3731
3732/* Fish out the optional data from the extra_data structure, first setting
3733the default values. */
3734
3735study = NULL;
aa41d2de
PH
3736md->match_limit = MATCH_LIMIT;
3737md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3738md->callout_data = NULL;
8ac170f3
PH
3739
3740/* The table pointer is always in native byte order. */
3741
3742tables = external_re->tables;
3743
3744if (extra_data != NULL)
3745 {
3746 register unsigned int flags = extra_data->flags;
3747 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3748 study = (const pcre_study_data *)extra_data->study_data;
3749 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
aa41d2de
PH
3750 md->match_limit = extra_data->match_limit;
3751 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3752 md->match_limit_recursion = extra_data->match_limit_recursion;
8ac170f3 3753 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
aa41d2de 3754 md->callout_data = extra_data->callout_data;
8ac170f3
PH
3755 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3756 }
3757
3758/* If the exec call supplied NULL for tables, use the inbuilt ones. This
3759is a feature that makes it possible to save compiled regex and re-use them
3760in other programs later. */
3761
3762if (tables == NULL) tables = _pcre_default_tables;
3763
3764/* Check that the first field in the block is the magic number. If it is not,
3765test for a regex that was compiled on a host of opposite endianness. If this is
3766the case, flipped values are put in internal_re and internal_study if there was
3767study data too. */
3768
3769if (re->magic_number != MAGIC_NUMBER)
3770 {
3771 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3772 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3773 if (study != NULL) study = &internal_study;
3774 }
3775
3776/* Set up other data */
3777
3778anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3779startline = (re->options & PCRE_STARTLINE) != 0;
3780firstline = (re->options & PCRE_FIRSTLINE) != 0;
3781
3782/* The code starts after the real_pcre block and the capture name table. */
3783
aa41d2de 3784md->start_code = (const uschar *)external_re + re->name_table_offset +
8ac170f3
PH
3785 re->name_count * re->name_entry_size;
3786
aa41d2de
PH
3787md->start_subject = (USPTR)subject;
3788md->start_offset = start_offset;
3789md->end_subject = md->start_subject + length;
3790end_subject = md->end_subject;
3791
3792md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6bf342e1 3793utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
8ac170f3 3794
aa41d2de
PH
3795md->notbol = (options & PCRE_NOTBOL) != 0;
3796md->noteol = (options & PCRE_NOTEOL) != 0;
3797md->notempty = (options & PCRE_NOTEMPTY) != 0;
3798md->partial = (options & PCRE_PARTIAL) != 0;
3799md->hitend = FALSE;
8ac170f3 3800
aa41d2de 3801md->recursive = NULL; /* No recursion at top level */
6bf342e1 3802md->eptrchain = eptrchain; /* Make workspace generally available */
8ac170f3 3803
aa41d2de
PH
3804md->lcc = tables + lcc_offset;
3805md->ctypes = tables + ctypes_offset;
8ac170f3 3806
aa41d2de
PH
3807/* Handle different types of newline. The two bits give four cases. If nothing
3808is set at run time, whatever was used at compile time applies. */
3809
6bf342e1
PH
3810switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
3811 PCRE_NEWLINE_BITS)
aa41d2de 3812 {
6bf342e1 3813 case 0: newline = NEWLINE; break; /* Compile-time default */
aa41d2de
PH
3814 case PCRE_NEWLINE_CR: newline = '\r'; break;
3815 case PCRE_NEWLINE_LF: newline = '\n'; break;
3816 case PCRE_NEWLINE_CR+
3817 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6bf342e1
PH
3818 case PCRE_NEWLINE_ANY: newline = -1; break;
3819 default: return PCRE_ERROR_BADNEWLINE;
aa41d2de
PH
3820 }
3821
6bf342e1 3822if (newline < 0)
aa41d2de 3823 {
6bf342e1 3824 md->nltype = NLTYPE_ANY;
aa41d2de
PH
3825 }
3826else
3827 {
6bf342e1
PH
3828 md->nltype = NLTYPE_FIXED;
3829 if (newline > 255)
3830 {
3831 md->nllen = 2;
3832 md->nl[0] = (newline >> 8) & 255;
3833 md->nl[1] = newline & 255;
3834 }
3835 else
3836 {
3837 md->nllen = 1;
3838 md->nl[0] = newline;
3839 }
aa41d2de 3840 }
8ac170f3
PH
3841
3842/* Partial matching is supported only for a restricted set of regexes at the
3843moment. */
3844
aa41d2de 3845if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
8ac170f3
PH
3846 return PCRE_ERROR_BADPARTIAL;
3847
3848/* Check a UTF-8 string if required. Unfortunately there's no way of passing
3849back the character offset. */
3850
3851#ifdef SUPPORT_UTF8
6bf342e1 3852if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8ac170f3
PH
3853 {
3854 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3855 return PCRE_ERROR_BADUTF8;
3856 if (start_offset > 0 && start_offset < length)
3857 {
3858 int tb = ((uschar *)subject)[start_offset];
3859 if (tb > 127)
3860 {
3861 tb &= 0xc0;
3862 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3863 }
3864 }
3865 }
3866#endif
3867
3868/* The ims options can vary during the matching as a result of the presence
3869of (?ims) items in the pattern. They are kept in a local variable so that
3870restoring at the exit of a group is easy. */
3871
3872ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3873
3874/* If the expression has got more back references than the offsets supplied can
3875hold, we get a temporary chunk of working store to use during the matching.
3876Otherwise, we can use the vector supplied, rounding down its size to a multiple
3877of 3. */
3878
3879ocount = offsetcount - (offsetcount % 3);
3880
3881if (re->top_backref > 0 && re->top_backref >= ocount/3)
3882 {
3883 ocount = re->top_backref * 3 + 3;
aa41d2de
PH
3884 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3885 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8ac170f3
PH
3886 using_temporary_offsets = TRUE;
3887 DPRINTF(("Got memory to hold back references\n"));
3888 }
aa41d2de 3889else md->offset_vector = offsets;
8ac170f3 3890
aa41d2de
PH
3891md->offset_end = ocount;
3892md->offset_max = (2*ocount)/3;
3893md->offset_overflow = FALSE;
3894md->capture_last = -1;
8ac170f3
PH
3895
3896/* Compute the minimum number of offsets that we need to reset each time. Doing
3897this makes a huge difference to execution time when there aren't many brackets
3898in the pattern. */
3899
3900resetcount = 2 + re->top_bracket * 2;
3901if (resetcount > offsetcount) resetcount = ocount;
3902
3903/* Reset the working variable associated with each extraction. These should
3904never be used unless previously set, but they get saved and restored, and so we
3905initialize them to avoid reading uninitialized locations. */
3906
aa41d2de 3907if (md->offset_vector != NULL)
8ac170f3 3908 {
aa41d2de 3909 register int *iptr = md->offset_vector + ocount;
8ac170f3
PH
3910 register int *iend = iptr - resetcount/2 + 1;
3911 while (--iptr >= iend) *iptr = -1;
3912 }
3913
3914/* Set up the first character to match, if available. The first_byte value is
3915never set for an anchored regular expression, but the anchoring may be forced
3916at run time, so we have to test for anchoring. The first char may be unset for
3917an unanchored pattern, of course. If there's no first char and the pattern was
3918studied, there may be a bitmap of possible first characters. */
3919
3920if (!anchored)
3921 {
3922 if ((re->options & PCRE_FIRSTSET) != 0)
3923 {
3924 first_byte = re->first_byte & 255;
3925 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
aa41d2de 3926 first_byte = md->lcc[first_byte];
8ac170f3
PH
3927 }
3928 else
3929 if (!startline && study != NULL &&
3930 (study->options & PCRE_STUDY_MAPPED) != 0)
3931 start_bits = study->start_bits;
3932 }
3933
3934/* For anchored or unanchored matches, there may be a "last known required
3935character" set. */
3936
3937if ((re->options & PCRE_REQCHSET) != 0)
3938 {
3939 req_byte = re->req_byte & 255;
3940 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3941 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3942 }
3943
6bf342e1
PH
3944
3945/* ==========================================================================*/
3946
8ac170f3
PH
3947/* Loop for handling unanchored repeated matching attempts; for anchored regexs
3948the loop runs just once. */
3949
6bf342e1 3950for(;;)
8ac170f3 3951 {
aa41d2de 3952 USPTR save_end_subject = end_subject;
8ac170f3
PH
3953
3954 /* Reset the maximum number of extractions we might see. */
3955
aa41d2de 3956 if (md->offset_vector != NULL)
8ac170f3 3957 {
aa41d2de 3958 register int *iptr = md->offset_vector;
8ac170f3
PH
3959 register int *iend = iptr + resetcount;
3960 while (iptr < iend) *iptr++ = -1;
3961 }
3962
3963 /* Advance to a unique first char if possible. If firstline is TRUE, the
3964 start of the match is constrained to the first line of a multiline string.
6bf342e1
PH
3965 That is, the match must be before or at the first newline. Implement this by
3966 temporarily adjusting end_subject so that we stop scanning at a newline. If
3967 the match fails at the newline, later code breaks this loop. */
8ac170f3
PH
3968
3969 if (firstline)
3970 {
aa41d2de 3971 USPTR t = start_match;
6bf342e1 3972 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
8ac170f3
PH
3973 end_subject = t;
3974 }
3975
3976 /* Now test for a unique first byte */
3977
3978 if (first_byte >= 0)
3979 {
3980 if (first_byte_caseless)
3981 while (start_match < end_subject &&
aa41d2de 3982 md->lcc[*start_match] != first_byte)
8ac170f3
PH
3983 start_match++;
3984 else
3985 while (start_match < end_subject && *start_match != first_byte)
3986 start_match++;
3987 }
3988
aa41d2de 3989 /* Or to just after a linebreak for a multiline match if possible */
8ac170f3
PH
3990
3991 else if (startline)
3992 {
6bf342e1 3993 if (start_match > md->start_subject + start_offset)
8ac170f3 3994 {
6bf342e1 3995 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
8ac170f3
PH
3996 start_match++;
3997 }
3998 }
3999
4000 /* Or to a non-unique first char after study */
4001
4002 else if (start_bits != NULL)
4003 {
4004 while (start_match < end_subject)
4005 {
4006 register unsigned int c = *start_match;
4007 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4008 }
4009 }
4010
4011 /* Restore fudged end_subject */
4012
4013 end_subject = save_end_subject;
4014
4015#ifdef DEBUG /* Sigh. Some compilers never learn. */
4016 printf(">>>> Match against: ");
aa41d2de 4017 pchars(start_match, end_subject - start_match, TRUE, md);
8ac170f3
PH
4018 printf("\n");
4019#endif
4020
4021 /* If req_byte is set, we know that that character must appear in the subject
4022 for the match to succeed. If the first character is set, req_byte must be
4023 later in the subject; otherwise the test starts at the match point. This
4024 optimization can save a huge amount of backtracking in patterns with nested
4025 unlimited repeats that aren't going to match. Writing separate code for
4026 cased/caseless versions makes it go faster, as does using an autoincrement
4027 and backing off on a match.
4028
4029 HOWEVER: when the subject string is very, very long, searching to its end can
4030 take a long time, and give bad performance on quite ordinary patterns. This
6bf342e1
PH
4031 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4032 string... so we don't do this when the string is sufficiently long.
8ac170f3
PH
4033
4034 ALSO: this processing is disabled when partial matching is requested.
4035 */
4036
4037 if (req_byte >= 0 &&
4038 end_subject - start_match < REQ_BYTE_MAX &&
aa41d2de 4039 !md->partial)
8ac170f3 4040 {
aa41d2de 4041 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
8ac170f3
PH
4042
4043 /* We don't need to repeat the search if we haven't yet reached the
4044 place we found it at last time. */
4045
4046 if (p > req_byte_ptr)
4047 {
4048 if (req_byte_caseless)
4049 {
4050 while (p < end_subject)
4051 {
4052 register int pp = *p++;
4053 if (pp == req_byte || pp == req_byte2) { p--; break; }
4054 }
4055 }
4056 else
4057 {
4058 while (p < end_subject)
4059 {
4060 if (*p++ == req_byte) { p--; break; }
4061 }
4062 }
4063
6bf342e1
PH
4064 /* If we can't find the required character, break the matching loop,
4065 forcing a match failure. */
8ac170f3 4066
6bf342e1
PH
4067 if (p >= end_subject)
4068 {
4069 rc = MATCH_NOMATCH;
4070 break;
4071 }
8ac170f3
PH
4072
4073 /* If we have found the required character, save the point where we
4074 found it, so that we don't search again next time round the loop if
4075 the start hasn't passed this character yet. */
4076
4077 req_byte_ptr = p;
4078 }
4079 }
4080
6bf342e1 4081 /* OK, we can now run the match. */
8ac170f3 4082
aa41d2de
PH
4083 md->start_match = start_match;
4084 md->match_call_count = 0;
6bf342e1
PH
4085 md->eptrn = 0; /* Next free eptrchain slot */
4086 rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
8ac170f3 4087
6bf342e1 4088 /* Any return other than MATCH_NOMATCH breaks the loop. */
8ac170f3 4089
6bf342e1 4090 if (rc != MATCH_NOMATCH) break;
8ac170f3 4091
6bf342e1
PH
4092 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4093 newline in the subject (though it may continue over the newline). Therefore,
4094 if we have just failed to match, starting at a newline, do not continue. */
4095
4096 if (firstline && IS_NEWLINE(start_match)) break;
4097
4098 /* Advance the match position by one character. */
4099
4100 start_match++;
8ac170f3 4101#ifdef SUPPORT_UTF8
6bf342e1
PH
4102 if (utf8)
4103 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4104 start_match++;
8ac170f3 4105#endif
8ac170f3 4106
6bf342e1
PH
4107 /* Break the loop if the pattern is anchored or if we have passed the end of
4108 the subject. */
4109
4110 if (anchored || start_match > end_subject) break;
4111
4112 /* If we have just passed a CR and the newline option is CRLF or ANY, and we
4113 are now at a LF, advance the match position by one more character. */
4114
4115 if (start_match[-1] == '\r' &&
4116 (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
4117 start_match < end_subject &&
4118 *start_match == '\n')
4119 start_match++;
4120
4121 } /* End of for(;;) "bumpalong" loop */
4122
4123/* ==========================================================================*/
4124
4125/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4126conditions is true:
8ac170f3 4127
6bf342e1 4128(1) The pattern is anchored;
8ac170f3 4129
6bf342e1
PH
4130(2) We are past the end of the subject;
4131
4132(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4133 this option requests that a match occur at or before the first newline in
4134 the subject.
4135
4136When we have a match and the offset vector is big enough to deal with any
4137backreferences, captured substring offsets will already be set up. In the case
4138where we had to get some local store to hold offsets for backreference
4139processing, copy those that we can. In this case there need not be overflow if
4140certain parts of the pattern were not used, even though there are more
4141capturing parentheses than vector slots. */
4142
4143if (rc == MATCH_MATCH)
4144 {
8ac170f3
PH
4145 if (using_temporary_offsets)
4146 {
4147 if (offsetcount >= 4)
4148 {
aa41d2de 4149 memcpy(offsets + 2, md->offset_vector + 2,
8ac170f3
PH
4150 (offsetcount - 2) * sizeof(int));
4151 DPRINTF(("Copied offsets from temporary memory\n"));
4152 }
6bf342e1 4153 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
8ac170f3 4154 DPRINTF(("Freeing temporary memory\n"));
aa41d2de 4155 (pcre_free)(md->offset_vector);
8ac170f3
PH
4156 }
4157
6bf342e1
PH
4158 /* Set the return code to the number of captured strings, or 0 if there are
4159 too many to fit into the vector. */
4160
aa41d2de 4161 rc = md->offset_overflow? 0 : md->end_offset_top/2;
8ac170f3 4162
6bf342e1
PH
4163 /* If there is space, set up the whole thing as substring 0. */
4164
8ac170f3
PH
4165 if (offsetcount < 2) rc = 0; else
4166 {
aa41d2de
PH
4167 offsets[0] = start_match - md->start_subject;
4168 offsets[1] = md->end_match_ptr - md->start_subject;
8ac170f3
PH
4169 }
4170
4171 DPRINTF((">>>> returning %d\n", rc));
4172 return rc;
4173 }
4174
6bf342e1
PH
4175/* Control gets here if there has been an error, or if the overall match
4176attempt has failed at all permitted starting positions. */
8ac170f3
PH
4177
4178if (using_temporary_offsets)
4179 {
4180 DPRINTF(("Freeing temporary memory\n"));
aa41d2de 4181 (pcre_free)(md->offset_vector);
8ac170f3
PH
4182 }
4183
6bf342e1
PH
4184if (rc != MATCH_NOMATCH)
4185 {
4186 DPRINTF((">>>> error: returning %d\n", rc));
4187 return rc;
4188 }
4189else if (md->partial && md->hitend)
8ac170f3
PH
4190 {
4191 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4192 return PCRE_ERROR_PARTIAL;
4193 }
4194else
4195 {
4196 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4197 return PCRE_ERROR_NOMATCH;
4198 }
4199}
4200
4201/* End of pcre_exec.c */