Commit | Line | Data |
---|---|---|
8ac170f3 PH |
1 | /* $Cambridge: exim/src/src/pcre/printint.c,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */ |
2 | ||
c86f6258 PH |
3 | /************************************************* |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | /* | |
8 | This is a library of functions to support regular expressions whose syntax | |
9 | and semantics are as close as possible to those of the Perl 5 language. See | |
10 | the file Tech.Notes for some information on the internals. | |
11 | ||
12 | Written by: Philip Hazel <ph10@cam.ac.uk> | |
13 | ||
14 | Copyright (c) 1997-2004 University of Cambridge | |
15 | ||
16 | ----------------------------------------------------------------------------- | |
17 | Redistribution and use in source and binary forms, with or without | |
18 | modification, are permitted provided that the following conditions are met: | |
19 | ||
20 | * Redistributions of source code must retain the above copyright notice, | |
21 | this list of conditions and the following disclaimer. | |
22 | ||
23 | * Redistributions in binary form must reproduce the above copyright | |
24 | notice, this list of conditions and the following disclaimer in the | |
25 | documentation and/or other materials provided with the distribution. | |
26 | ||
27 | * Neither the name of the University of Cambridge nor the names of its | |
28 | contributors may be used to endorse or promote products derived from | |
29 | this software without specific prior written permission. | |
30 | ||
31 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
32 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
33 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
34 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
35 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
36 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
37 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
38 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
39 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
40 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
41 | POSSIBILITY OF SUCH DAMAGE. | |
42 | ----------------------------------------------------------------------------- | |
43 | */ | |
44 | ||
45 | ||
46 | /* This module contains a debugging function for printing out the internal form | |
47 | of a compiled regular expression. It is kept in a separate file so that it can | |
48 | be #included both in the pcretest program, and in the library itself when | |
49 | compiled with the debugging switch. */ | |
50 | ||
51 | ||
/* Table of printable opcode names, initialised from OP_NAME_LIST (which is
defined outside this file). The code below indexes it with an opcode byte
taken from the compiled pattern. */

static const char *OP_names[] = { OP_NAME_LIST };
53 | ||
54 | ||
55 | /************************************************* | |
56 | * Print single- or multi-byte character * | |
57 | *************************************************/ | |
58 | ||
59 | /* These tables are actually copies of ones in pcre.c. If we compile the | |
60 | library with debugging, they are included twice, but that isn't really a | |
61 | problem - compiling with debugging is pretty rare and these are very small. */ | |
62 | ||
/* Masks for the data bits in the first byte of a character. print_char()
indexes this table by the number of additional bytes that follow. */

static const int utf8_t3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* Number of additional bytes that follow a first byte. print_char() indexes
this table by the bottom 6 bits of a first byte whose top two bits are set. */

static const uschar utf8_t4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
70 | ||
71 | static int | |
72 | print_char(FILE *f, uschar *ptr, BOOL utf8) | |
73 | { | |
74 | int c = *ptr; | |
75 | ||
76 | if (!utf8 || (c & 0xc0) != 0xc0) | |
77 | { | |
78 | if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); | |
79 | return 0; | |
80 | } | |
81 | else | |
82 | { | |
83 | int i; | |
84 | int a = utf8_t4[c & 0x3f]; /* Number of additional bytes */ | |
85 | int s = 6*a; | |
86 | c = (c & utf8_t3[a]) << s; | |
87 | for (i = 1; i <= a; i++) | |
88 | { | |
89 | /* This is a check for malformed UTF-8; it should only occur if the sanity | |
90 | check has been turned off. Rather than swallow random bytes, just stop if | |
91 | we hit a bad one. Print it with \X instead of \x as an indication. */ | |
92 | ||
93 | if ((ptr[i] & 0xc0) != 0x80) | |
94 | { | |
95 | fprintf(f, "\\X{%x}", c); | |
96 | return i - 1; | |
97 | } | |
98 | ||
99 | /* The byte is OK */ | |
100 | ||
101 | s -= 6; | |
102 | c |= (ptr[i] & 0x3f) << s; | |
103 | } | |
104 | if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c); | |
105 | return a; | |
106 | } | |
107 | } | |
108 | ||
109 | ||
110 | ||
111 | ||
112 | /************************************************* | |
113 | * Find Unicode property name * | |
114 | *************************************************/ | |
115 | ||
/* Find the printable name of a Unicode property.

Argument:
  property   the property value to look up in the utt table

Returns:     the name from the matching table entry, or "??" if no entry has
             that value or if SUPPORT_UCP is not defined
*/

static const char *
get_ucpname(int property)
{
#ifdef SUPPORT_UCP
int i;

/* Scan backwards from the LAST entry. The element count is one past the end
of the table, so starting the scan at the count itself would read beyond the
array on the first iteration. */

for (i = (int)(sizeof(utt)/sizeof(ucp_type_table)) - 1; i >= 0; i--)
  {
  if (property == utt[i].value) break;
  }

/* i is -1 here if the loop ran off the front without finding a match */

return (i >= 0)? utt[i].name : "??";
#else
return "??";
#endif
}
130 | ||
131 | ||
132 | ||
133 | /************************************************* | |
134 | * Print compiled regex * | |
135 | *************************************************/ | |
136 | ||
137 | /* Make this function work for a regex with integers either byte order. | |
138 | However, we assume that what we are passed is a compiled regex. */ | |
139 | ||
140 | static void | |
141 | print_internals(pcre *external_re, FILE *f) | |
142 | { | |
143 | real_pcre *re = (real_pcre *)external_re; | |
144 | uschar *codestart, *code; | |
145 | BOOL utf8; | |
146 | ||
147 | unsigned int options = re->options; | |
148 | int offset = re->name_table_offset; | |
149 | int count = re->name_count; | |
150 | int size = re->name_entry_size; | |
151 | ||
152 | if (re->magic_number != MAGIC_NUMBER) | |
153 | { | |
154 | offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff); | |
155 | count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff); | |
156 | size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff); | |
157 | options = ((options << 24) & 0xff000000) | | |
158 | ((options << 8) & 0x00ff0000) | | |
159 | ((options >> 8) & 0x0000ff00) | | |
160 | ((options >> 24) & 0x000000ff); | |
161 | } | |
162 | ||
163 | code = codestart = (uschar *)re + offset + count * size; | |
164 | utf8 = (options & PCRE_UTF8) != 0; | |
165 | ||
166 | for(;;) | |
167 | { | |
168 | uschar *ccode; | |
169 | int c; | |
170 | int extra = 0; | |
171 | ||
172 | fprintf(f, "%3d ", (int)(code - codestart)); | |
173 | ||
174 | if (*code >= OP_BRA) | |
175 | { | |
176 | if (*code - OP_BRA > EXTRACT_BASIC_MAX) | |
177 | fprintf(f, "%3d Bra extra\n", GET(code, 1)); | |
178 | else | |
179 | fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA); | |
180 | code += OP_lengths[OP_BRA]; | |
181 | continue; | |
182 | } | |
183 | ||
184 | switch(*code) | |
185 | { | |
186 | case OP_END: | |
187 | fprintf(f, " %s\n", OP_names[*code]); | |
188 | fprintf(f, "------------------------------------------------------------------\n"); | |
189 | return; | |
190 | ||
191 | case OP_OPT: | |
192 | fprintf(f, " %.2x %s", code[1], OP_names[*code]); | |
193 | break; | |
194 | ||
195 | case OP_CHAR: | |
196 | { | |
197 | fprintf(f, " "); | |
198 | do | |
199 | { | |
200 | code++; | |
201 | code += 1 + print_char(f, code, utf8); | |
202 | } | |
203 | while (*code == OP_CHAR); | |
204 | fprintf(f, "\n"); | |
205 | continue; | |
206 | } | |
207 | break; | |
208 | ||
209 | case OP_CHARNC: | |
210 | { | |
211 | fprintf(f, " NC "); | |
212 | do | |
213 | { | |
214 | code++; | |
215 | code += 1 + print_char(f, code, utf8); | |
216 | } | |
217 | while (*code == OP_CHARNC); | |
218 | fprintf(f, "\n"); | |
219 | continue; | |
220 | } | |
221 | break; | |
222 | ||
223 | case OP_KETRMAX: | |
224 | case OP_KETRMIN: | |
225 | case OP_ALT: | |
226 | case OP_KET: | |
227 | case OP_ASSERT: | |
228 | case OP_ASSERT_NOT: | |
229 | case OP_ASSERTBACK: | |
230 | case OP_ASSERTBACK_NOT: | |
231 | case OP_ONCE: | |
232 | case OP_COND: | |
233 | case OP_REVERSE: | |
234 | fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]); | |
235 | break; | |
236 | ||
237 | case OP_BRANUMBER: | |
238 | printf("%3d %s", GET2(code, 1), OP_names[*code]); | |
239 | break; | |
240 | ||
241 | case OP_CREF: | |
242 | if (GET2(code, 1) == CREF_RECURSE) | |
243 | fprintf(f, " Cond recurse"); | |
244 | else | |
245 | fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); | |
246 | break; | |
247 | ||
248 | case OP_STAR: | |
249 | case OP_MINSTAR: | |
250 | case OP_PLUS: | |
251 | case OP_MINPLUS: | |
252 | case OP_QUERY: | |
253 | case OP_MINQUERY: | |
254 | case OP_TYPESTAR: | |
255 | case OP_TYPEMINSTAR: | |
256 | case OP_TYPEPLUS: | |
257 | case OP_TYPEMINPLUS: | |
258 | case OP_TYPEQUERY: | |
259 | case OP_TYPEMINQUERY: | |
260 | fprintf(f, " "); | |
261 | if (*code >= OP_TYPESTAR) | |
262 | { | |
263 | fprintf(f, "%s", OP_names[code[1]]); | |
264 | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) | |
265 | { | |
266 | fprintf(f, " %s ", get_ucpname(code[2])); | |
267 | extra = 1; | |
268 | } | |
269 | } | |
270 | else extra = print_char(f, code+1, utf8); | |
271 | fprintf(f, "%s", OP_names[*code]); | |
272 | break; | |
273 | ||
274 | case OP_EXACT: | |
275 | case OP_UPTO: | |
276 | case OP_MINUPTO: | |
277 | fprintf(f, " "); | |
278 | extra = print_char(f, code+3, utf8); | |
279 | fprintf(f, "{"); | |
280 | if (*code != OP_EXACT) fprintf(f, ","); | |
281 | fprintf(f, "%d}", GET2(code,1)); | |
282 | if (*code == OP_MINUPTO) fprintf(f, "?"); | |
283 | break; | |
284 | ||
285 | case OP_TYPEEXACT: | |
286 | case OP_TYPEUPTO: | |
287 | case OP_TYPEMINUPTO: | |
288 | fprintf(f, " %s", OP_names[code[3]]); | |
289 | if (code[3] == OP_PROP || code[3] == OP_NOTPROP) | |
290 | { | |
291 | fprintf(f, " %s ", get_ucpname(code[4])); | |
292 | extra = 1; | |
293 | } | |
294 | fprintf(f, "{"); | |
295 | if (*code != OP_TYPEEXACT) fprintf(f, "0,"); | |
296 | fprintf(f, "%d}", GET2(code,1)); | |
297 | if (*code == OP_TYPEMINUPTO) fprintf(f, "?"); | |
298 | break; | |
299 | ||
300 | case OP_NOT: | |
301 | if (isprint(c = code[1])) fprintf(f, " [^%c]", c); | |
302 | else fprintf(f, " [^\\x%02x]", c); | |
303 | break; | |
304 | ||
305 | case OP_NOTSTAR: | |
306 | case OP_NOTMINSTAR: | |
307 | case OP_NOTPLUS: | |
308 | case OP_NOTMINPLUS: | |
309 | case OP_NOTQUERY: | |
310 | case OP_NOTMINQUERY: | |
311 | if (isprint(c = code[1])) fprintf(f, " [^%c]", c); | |
312 | else fprintf(f, " [^\\x%02x]", c); | |
313 | fprintf(f, "%s", OP_names[*code]); | |
314 | break; | |
315 | ||
316 | case OP_NOTEXACT: | |
317 | case OP_NOTUPTO: | |
318 | case OP_NOTMINUPTO: | |
319 | if (isprint(c = code[3])) fprintf(f, " [^%c]{", c); | |
320 | else fprintf(f, " [^\\x%02x]{", c); | |
321 | if (*code != OP_NOTEXACT) fprintf(f, ","); | |
322 | fprintf(f, "%d}", GET2(code,1)); | |
323 | if (*code == OP_NOTMINUPTO) fprintf(f, "?"); | |
324 | break; | |
325 | ||
326 | case OP_RECURSE: | |
327 | fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]); | |
328 | break; | |
329 | ||
330 | case OP_REF: | |
331 | fprintf(f, " \\%d", GET2(code,1)); | |
332 | ccode = code + OP_lengths[*code]; | |
333 | goto CLASS_REF_REPEAT; | |
334 | ||
335 | case OP_CALLOUT: | |
336 | fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2), | |
337 | GET(code, 2 + LINK_SIZE)); | |
338 | break; | |
339 | ||
340 | case OP_PROP: | |
341 | case OP_NOTPROP: | |
342 | fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1])); | |
343 | break; | |
344 | ||
345 | /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in | |
346 | having this code always here, and it makes it less messy without all those | |
347 | #ifdefs. */ | |
348 | ||
349 | case OP_CLASS: | |
350 | case OP_NCLASS: | |
351 | case OP_XCLASS: | |
352 | { | |
353 | int i, min, max; | |
354 | BOOL printmap; | |
355 | ||
356 | fprintf(f, " ["); | |
357 | ||
358 | if (*code == OP_XCLASS) | |
359 | { | |
360 | extra = GET(code, 1); | |
361 | ccode = code + LINK_SIZE + 1; | |
362 | printmap = (*ccode & XCL_MAP) != 0; | |
363 | if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^"); | |
364 | } | |
365 | else | |
366 | { | |
367 | printmap = TRUE; | |
368 | ccode = code + 1; | |
369 | } | |
370 | ||
371 | /* Print a bit map */ | |
372 | ||
373 | if (printmap) | |
374 | { | |
375 | for (i = 0; i < 256; i++) | |
376 | { | |
377 | if ((ccode[i/8] & (1 << (i&7))) != 0) | |
378 | { | |
379 | int j; | |
380 | for (j = i+1; j < 256; j++) | |
381 | if ((ccode[j/8] & (1 << (j&7))) == 0) break; | |
382 | if (i == '-' || i == ']') fprintf(f, "\\"); | |
383 | if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i); | |
384 | if (--j > i) | |
385 | { | |
386 | if (j != i + 1) fprintf(f, "-"); | |
387 | if (j == '-' || j == ']') fprintf(f, "\\"); | |
388 | if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j); | |
389 | } | |
390 | i = j; | |
391 | } | |
392 | } | |
393 | ccode += 32; | |
394 | } | |
395 | ||
396 | /* For an XCLASS there is always some additional data */ | |
397 | ||
398 | if (*code == OP_XCLASS) | |
399 | { | |
400 | int ch; | |
401 | while ((ch = *ccode++) != XCL_END) | |
402 | { | |
403 | if (ch == XCL_PROP) | |
404 | { | |
405 | fprintf(f, "\\p{%s}", get_ucpname(*ccode++)); | |
406 | } | |
407 | else if (ch == XCL_NOTPROP) | |
408 | { | |
409 | fprintf(f, "\\P{%s}", get_ucpname(*ccode++)); | |
410 | } | |
411 | else | |
412 | { | |
413 | ccode += 1 + print_char(f, ccode, TRUE); | |
414 | if (ch == XCL_RANGE) | |
415 | { | |
416 | fprintf(f, "-"); | |
417 | ccode += 1 + print_char(f, ccode, TRUE); | |
418 | } | |
419 | } | |
420 | } | |
421 | } | |
422 | ||
423 | /* Indicate a non-UTF8 class which was created by negation */ | |
424 | ||
425 | fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : ""); | |
426 | ||
427 | /* Handle repeats after a class or a back reference */ | |
428 | ||
429 | CLASS_REF_REPEAT: | |
430 | switch(*ccode) | |
431 | { | |
432 | case OP_CRSTAR: | |
433 | case OP_CRMINSTAR: | |
434 | case OP_CRPLUS: | |
435 | case OP_CRMINPLUS: | |
436 | case OP_CRQUERY: | |
437 | case OP_CRMINQUERY: | |
438 | fprintf(f, "%s", OP_names[*ccode]); | |
439 | extra += OP_lengths[*ccode]; | |
440 | break; | |
441 | ||
442 | case OP_CRRANGE: | |
443 | case OP_CRMINRANGE: | |
444 | min = GET2(ccode,1); | |
445 | max = GET2(ccode,3); | |
446 | if (max == 0) fprintf(f, "{%d,}", min); | |
447 | else fprintf(f, "{%d,%d}", min, max); | |
448 | if (*ccode == OP_CRMINRANGE) fprintf(f, "?"); | |
449 | extra += OP_lengths[*ccode]; | |
450 | break; | |
451 | } | |
452 | } | |
453 | break; | |
454 | ||
455 | /* Anything else is just an item with no data*/ | |
456 | ||
457 | default: | |
458 | fprintf(f, " %s", OP_names[*code]); | |
459 | break; | |
460 | } | |
461 | ||
462 | code += OP_lengths[*code] + extra; | |
463 | fprintf(f, "\n"); | |
464 | } | |
465 | } | |
466 | ||
467 | /* End of printint.c */ |