Commit | Line | Data |
---|---|---|
c86f6258 PH |
1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * | |
3 | *************************************************/ | |
4 | ||
5 | /* | |
6 | This is a library of functions to support regular expressions whose syntax | |
7 | and semantics are as close as possible to those of the Perl 5 language. See | |
8 | the file Tech.Notes for some information on the internals. | |
9 | ||
10 | Written by: Philip Hazel <ph10@cam.ac.uk> | |
11 | ||
12 | Copyright (c) 1997-2004 University of Cambridge | |
13 | ||
14 | ----------------------------------------------------------------------------- | |
15 | Redistribution and use in source and binary forms, with or without | |
16 | modification, are permitted provided that the following conditions are met: | |
17 | ||
18 | * Redistributions of source code must retain the above copyright notice, | |
19 | this list of conditions and the following disclaimer. | |
20 | ||
21 | * Redistributions in binary form must reproduce the above copyright | |
22 | notice, this list of conditions and the following disclaimer in the | |
23 | documentation and/or other materials provided with the distribution. | |
24 | ||
25 | * Neither the name of the University of Cambridge nor the names of its | |
26 | contributors may be used to endorse or promote products derived from | |
27 | this software without specific prior written permission. | |
28 | ||
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
39 | POSSIBILITY OF SUCH DAMAGE. | |
40 | ----------------------------------------------------------------------------- | |
41 | */ | |
42 | ||
43 | ||
44 | /* This module contains a debugging function for printing out the internal form | |
45 | of a compiled regular expression. It is kept in a separate file so that it can | |
46 | be #included both in the pcretest program, and in the library itself when | |
47 | compiled with the debugging switch. */ | |
48 | ||
49 | ||
50 | static const char *OP_names[] = { OP_NAME_LIST }; | |
51 | ||
52 | ||
53 | /************************************************* | |
54 | * Print single- or multi-byte character * | |
55 | *************************************************/ | |
56 | ||
57 | /* These tables are actually copies of ones in pcre.c. If we compile the | |
58 | library with debugging, they are included twice, but that isn't really a | |
59 | problem - compiling with debugging is pretty rare and these are very small. */ | |
60 | ||
61 | static const int utf8_t3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; | |
62 | ||
63 | static const uschar utf8_t4[] = { | |
64 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
65 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
66 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |
67 | 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; | |
68 | ||
69 | static int | |
70 | print_char(FILE *f, uschar *ptr, BOOL utf8) | |
71 | { | |
72 | int c = *ptr; | |
73 | ||
74 | if (!utf8 || (c & 0xc0) != 0xc0) | |
75 | { | |
76 | if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); | |
77 | return 0; | |
78 | } | |
79 | else | |
80 | { | |
81 | int i; | |
82 | int a = utf8_t4[c & 0x3f]; /* Number of additional bytes */ | |
83 | int s = 6*a; | |
84 | c = (c & utf8_t3[a]) << s; | |
85 | for (i = 1; i <= a; i++) | |
86 | { | |
87 | /* This is a check for malformed UTF-8; it should only occur if the sanity | |
88 | check has been turned off. Rather than swallow random bytes, just stop if | |
89 | we hit a bad one. Print it with \X instead of \x as an indication. */ | |
90 | ||
91 | if ((ptr[i] & 0xc0) != 0x80) | |
92 | { | |
93 | fprintf(f, "\\X{%x}", c); | |
94 | return i - 1; | |
95 | } | |
96 | ||
97 | /* The byte is OK */ | |
98 | ||
99 | s -= 6; | |
100 | c |= (ptr[i] & 0x3f) << s; | |
101 | } | |
102 | if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c); | |
103 | return a; | |
104 | } | |
105 | } | |
106 | ||
107 | ||
108 | ||
109 | ||
110 | /************************************************* | |
111 | * Find Unicode property name * | |
112 | *************************************************/ | |
113 | ||
/* Find the printable name of a Unicode property.

Argument:
  property   the property value, as stored in the compiled pattern

Returns:     the name from the utt table whose value matches, or "??" if
             there is no match or the code was built without SUPPORT_UCP

The table is scanned from its last entry down to its first. The scan must
start at index (number of entries - 1); starting at the entry count itself
would read one element past the end of the table. */

static const char *
get_ucpname(int property)
{
#ifdef SUPPORT_UCP
  int i;
  for (i = (int)(sizeof(utt)/sizeof(ucp_type_table)) - 1; i >= 0; i--)
  {
    if (property == utt[i].value) return utt[i].name;
  }
  return "??";
#else
  (void)property;   /* Unused when property support is not compiled in */
  return "??";
#endif
}
128 | ||
129 | ||
130 | ||
131 | /************************************************* | |
132 | * Print compiled regex * | |
133 | *************************************************/ | |
134 | ||
135 | /* Make this function work for a regex with integers either byte order. | |
136 | However, we assume that what we are passed is a compiled regex. */ | |
137 | ||
138 | static void | |
139 | print_internals(pcre *external_re, FILE *f) | |
140 | { | |
141 | real_pcre *re = (real_pcre *)external_re; | |
142 | uschar *codestart, *code; | |
143 | BOOL utf8; | |
144 | ||
145 | unsigned int options = re->options; | |
146 | int offset = re->name_table_offset; | |
147 | int count = re->name_count; | |
148 | int size = re->name_entry_size; | |
149 | ||
150 | if (re->magic_number != MAGIC_NUMBER) | |
151 | { | |
152 | offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff); | |
153 | count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff); | |
154 | size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff); | |
155 | options = ((options << 24) & 0xff000000) | | |
156 | ((options << 8) & 0x00ff0000) | | |
157 | ((options >> 8) & 0x0000ff00) | | |
158 | ((options >> 24) & 0x000000ff); | |
159 | } | |
160 | ||
161 | code = codestart = (uschar *)re + offset + count * size; | |
162 | utf8 = (options & PCRE_UTF8) != 0; | |
163 | ||
164 | for(;;) | |
165 | { | |
166 | uschar *ccode; | |
167 | int c; | |
168 | int extra = 0; | |
169 | ||
170 | fprintf(f, "%3d ", (int)(code - codestart)); | |
171 | ||
172 | if (*code >= OP_BRA) | |
173 | { | |
174 | if (*code - OP_BRA > EXTRACT_BASIC_MAX) | |
175 | fprintf(f, "%3d Bra extra\n", GET(code, 1)); | |
176 | else | |
177 | fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA); | |
178 | code += OP_lengths[OP_BRA]; | |
179 | continue; | |
180 | } | |
181 | ||
182 | switch(*code) | |
183 | { | |
184 | case OP_END: | |
185 | fprintf(f, " %s\n", OP_names[*code]); | |
186 | fprintf(f, "------------------------------------------------------------------\n"); | |
187 | return; | |
188 | ||
189 | case OP_OPT: | |
190 | fprintf(f, " %.2x %s", code[1], OP_names[*code]); | |
191 | break; | |
192 | ||
193 | case OP_CHAR: | |
194 | { | |
195 | fprintf(f, " "); | |
196 | do | |
197 | { | |
198 | code++; | |
199 | code += 1 + print_char(f, code, utf8); | |
200 | } | |
201 | while (*code == OP_CHAR); | |
202 | fprintf(f, "\n"); | |
203 | continue; | |
204 | } | |
205 | break; | |
206 | ||
207 | case OP_CHARNC: | |
208 | { | |
209 | fprintf(f, " NC "); | |
210 | do | |
211 | { | |
212 | code++; | |
213 | code += 1 + print_char(f, code, utf8); | |
214 | } | |
215 | while (*code == OP_CHARNC); | |
216 | fprintf(f, "\n"); | |
217 | continue; | |
218 | } | |
219 | break; | |
220 | ||
221 | case OP_KETRMAX: | |
222 | case OP_KETRMIN: | |
223 | case OP_ALT: | |
224 | case OP_KET: | |
225 | case OP_ASSERT: | |
226 | case OP_ASSERT_NOT: | |
227 | case OP_ASSERTBACK: | |
228 | case OP_ASSERTBACK_NOT: | |
229 | case OP_ONCE: | |
230 | case OP_COND: | |
231 | case OP_REVERSE: | |
232 | fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]); | |
233 | break; | |
234 | ||
235 | case OP_BRANUMBER: | |
236 | printf("%3d %s", GET2(code, 1), OP_names[*code]); | |
237 | break; | |
238 | ||
239 | case OP_CREF: | |
240 | if (GET2(code, 1) == CREF_RECURSE) | |
241 | fprintf(f, " Cond recurse"); | |
242 | else | |
243 | fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); | |
244 | break; | |
245 | ||
246 | case OP_STAR: | |
247 | case OP_MINSTAR: | |
248 | case OP_PLUS: | |
249 | case OP_MINPLUS: | |
250 | case OP_QUERY: | |
251 | case OP_MINQUERY: | |
252 | case OP_TYPESTAR: | |
253 | case OP_TYPEMINSTAR: | |
254 | case OP_TYPEPLUS: | |
255 | case OP_TYPEMINPLUS: | |
256 | case OP_TYPEQUERY: | |
257 | case OP_TYPEMINQUERY: | |
258 | fprintf(f, " "); | |
259 | if (*code >= OP_TYPESTAR) | |
260 | { | |
261 | fprintf(f, "%s", OP_names[code[1]]); | |
262 | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) | |
263 | { | |
264 | fprintf(f, " %s ", get_ucpname(code[2])); | |
265 | extra = 1; | |
266 | } | |
267 | } | |
268 | else extra = print_char(f, code+1, utf8); | |
269 | fprintf(f, "%s", OP_names[*code]); | |
270 | break; | |
271 | ||
272 | case OP_EXACT: | |
273 | case OP_UPTO: | |
274 | case OP_MINUPTO: | |
275 | fprintf(f, " "); | |
276 | extra = print_char(f, code+3, utf8); | |
277 | fprintf(f, "{"); | |
278 | if (*code != OP_EXACT) fprintf(f, ","); | |
279 | fprintf(f, "%d}", GET2(code,1)); | |
280 | if (*code == OP_MINUPTO) fprintf(f, "?"); | |
281 | break; | |
282 | ||
283 | case OP_TYPEEXACT: | |
284 | case OP_TYPEUPTO: | |
285 | case OP_TYPEMINUPTO: | |
286 | fprintf(f, " %s", OP_names[code[3]]); | |
287 | if (code[3] == OP_PROP || code[3] == OP_NOTPROP) | |
288 | { | |
289 | fprintf(f, " %s ", get_ucpname(code[4])); | |
290 | extra = 1; | |
291 | } | |
292 | fprintf(f, "{"); | |
293 | if (*code != OP_TYPEEXACT) fprintf(f, "0,"); | |
294 | fprintf(f, "%d}", GET2(code,1)); | |
295 | if (*code == OP_TYPEMINUPTO) fprintf(f, "?"); | |
296 | break; | |
297 | ||
298 | case OP_NOT: | |
299 | if (isprint(c = code[1])) fprintf(f, " [^%c]", c); | |
300 | else fprintf(f, " [^\\x%02x]", c); | |
301 | break; | |
302 | ||
303 | case OP_NOTSTAR: | |
304 | case OP_NOTMINSTAR: | |
305 | case OP_NOTPLUS: | |
306 | case OP_NOTMINPLUS: | |
307 | case OP_NOTQUERY: | |
308 | case OP_NOTMINQUERY: | |
309 | if (isprint(c = code[1])) fprintf(f, " [^%c]", c); | |
310 | else fprintf(f, " [^\\x%02x]", c); | |
311 | fprintf(f, "%s", OP_names[*code]); | |
312 | break; | |
313 | ||
314 | case OP_NOTEXACT: | |
315 | case OP_NOTUPTO: | |
316 | case OP_NOTMINUPTO: | |
317 | if (isprint(c = code[3])) fprintf(f, " [^%c]{", c); | |
318 | else fprintf(f, " [^\\x%02x]{", c); | |
319 | if (*code != OP_NOTEXACT) fprintf(f, ","); | |
320 | fprintf(f, "%d}", GET2(code,1)); | |
321 | if (*code == OP_NOTMINUPTO) fprintf(f, "?"); | |
322 | break; | |
323 | ||
324 | case OP_RECURSE: | |
325 | fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]); | |
326 | break; | |
327 | ||
328 | case OP_REF: | |
329 | fprintf(f, " \\%d", GET2(code,1)); | |
330 | ccode = code + OP_lengths[*code]; | |
331 | goto CLASS_REF_REPEAT; | |
332 | ||
333 | case OP_CALLOUT: | |
334 | fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2), | |
335 | GET(code, 2 + LINK_SIZE)); | |
336 | break; | |
337 | ||
338 | case OP_PROP: | |
339 | case OP_NOTPROP: | |
340 | fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1])); | |
341 | break; | |
342 | ||
343 | /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in | |
344 | having this code always here, and it makes it less messy without all those | |
345 | #ifdefs. */ | |
346 | ||
347 | case OP_CLASS: | |
348 | case OP_NCLASS: | |
349 | case OP_XCLASS: | |
350 | { | |
351 | int i, min, max; | |
352 | BOOL printmap; | |
353 | ||
354 | fprintf(f, " ["); | |
355 | ||
356 | if (*code == OP_XCLASS) | |
357 | { | |
358 | extra = GET(code, 1); | |
359 | ccode = code + LINK_SIZE + 1; | |
360 | printmap = (*ccode & XCL_MAP) != 0; | |
361 | if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^"); | |
362 | } | |
363 | else | |
364 | { | |
365 | printmap = TRUE; | |
366 | ccode = code + 1; | |
367 | } | |
368 | ||
369 | /* Print a bit map */ | |
370 | ||
371 | if (printmap) | |
372 | { | |
373 | for (i = 0; i < 256; i++) | |
374 | { | |
375 | if ((ccode[i/8] & (1 << (i&7))) != 0) | |
376 | { | |
377 | int j; | |
378 | for (j = i+1; j < 256; j++) | |
379 | if ((ccode[j/8] & (1 << (j&7))) == 0) break; | |
380 | if (i == '-' || i == ']') fprintf(f, "\\"); | |
381 | if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i); | |
382 | if (--j > i) | |
383 | { | |
384 | if (j != i + 1) fprintf(f, "-"); | |
385 | if (j == '-' || j == ']') fprintf(f, "\\"); | |
386 | if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j); | |
387 | } | |
388 | i = j; | |
389 | } | |
390 | } | |
391 | ccode += 32; | |
392 | } | |
393 | ||
394 | /* For an XCLASS there is always some additional data */ | |
395 | ||
396 | if (*code == OP_XCLASS) | |
397 | { | |
398 | int ch; | |
399 | while ((ch = *ccode++) != XCL_END) | |
400 | { | |
401 | if (ch == XCL_PROP) | |
402 | { | |
403 | fprintf(f, "\\p{%s}", get_ucpname(*ccode++)); | |
404 | } | |
405 | else if (ch == XCL_NOTPROP) | |
406 | { | |
407 | fprintf(f, "\\P{%s}", get_ucpname(*ccode++)); | |
408 | } | |
409 | else | |
410 | { | |
411 | ccode += 1 + print_char(f, ccode, TRUE); | |
412 | if (ch == XCL_RANGE) | |
413 | { | |
414 | fprintf(f, "-"); | |
415 | ccode += 1 + print_char(f, ccode, TRUE); | |
416 | } | |
417 | } | |
418 | } | |
419 | } | |
420 | ||
421 | /* Indicate a non-UTF8 class which was created by negation */ | |
422 | ||
423 | fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : ""); | |
424 | ||
425 | /* Handle repeats after a class or a back reference */ | |
426 | ||
427 | CLASS_REF_REPEAT: | |
428 | switch(*ccode) | |
429 | { | |
430 | case OP_CRSTAR: | |
431 | case OP_CRMINSTAR: | |
432 | case OP_CRPLUS: | |
433 | case OP_CRMINPLUS: | |
434 | case OP_CRQUERY: | |
435 | case OP_CRMINQUERY: | |
436 | fprintf(f, "%s", OP_names[*ccode]); | |
437 | extra += OP_lengths[*ccode]; | |
438 | break; | |
439 | ||
440 | case OP_CRRANGE: | |
441 | case OP_CRMINRANGE: | |
442 | min = GET2(ccode,1); | |
443 | max = GET2(ccode,3); | |
444 | if (max == 0) fprintf(f, "{%d,}", min); | |
445 | else fprintf(f, "{%d,%d}", min, max); | |
446 | if (*ccode == OP_CRMINRANGE) fprintf(f, "?"); | |
447 | extra += OP_lengths[*ccode]; | |
448 | break; | |
449 | } | |
450 | } | |
451 | break; | |
452 | ||
453 | /* Anything else is just an item with no data*/ | |
454 | ||
455 | default: | |
456 | fprintf(f, " %s", OP_names[*code]); | |
457 | break; | |
458 | } | |
459 | ||
460 | code += OP_lengths[*code] + extra; | |
461 | fprintf(f, "\n"); | |
462 | } | |
463 | } | |
464 | ||
465 | /* End of printint.c */ |