Start
[exim.git] / src / src / pcre / printint.c
CommitLineData
c86f6258
PH
1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/*
6This is a library of functions to support regular expressions whose syntax
7and semantics are as close as possible to those of the Perl 5 language. See
8the file Tech.Notes for some information on the internals.
9
10Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2004 University of Cambridge
13
14-----------------------------------------------------------------------------
15Redistribution and use in source and binary forms, with or without
16modification, are permitted provided that the following conditions are met:
17
18 * Redistributions of source code must retain the above copyright notice,
19 this list of conditions and the following disclaimer.
20
21 * Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
24
25 * Neither the name of the University of Cambridge nor the names of its
26 contributors may be used to endorse or promote products derived from
27 this software without specific prior written permission.
28
29THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39POSSIBILITY OF SUCH DAMAGE.
40-----------------------------------------------------------------------------
41*/
42
43
/* This module contains a debugging function for printing out the internal form
of a compiled regular expression. It is kept in a separate file so that it can
be #included both in the pcretest program, and in the library itself when
compiled with the debugging switch. */
48
49
50static const char *OP_names[] = { OP_NAME_LIST };
51
52
53/*************************************************
54* Print single- or multi-byte character *
55*************************************************/
56
57/* These tables are actually copies of ones in pcre.c. If we compile the
58library with debugging, they are included twice, but that isn't really a
59problem - compiling with debugging is pretty rare and these are very small. */
60
61static const int utf8_t3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
62
63static const uschar utf8_t4[] = {
64 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
65 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
66 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
67 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
68
69static int
70print_char(FILE *f, uschar *ptr, BOOL utf8)
71{
72int c = *ptr;
73
74if (!utf8 || (c & 0xc0) != 0xc0)
75 {
76 if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
77 return 0;
78 }
79else
80 {
81 int i;
82 int a = utf8_t4[c & 0x3f]; /* Number of additional bytes */
83 int s = 6*a;
84 c = (c & utf8_t3[a]) << s;
85 for (i = 1; i <= a; i++)
86 {
87 /* This is a check for malformed UTF-8; it should only occur if the sanity
88 check has been turned off. Rather than swallow random bytes, just stop if
89 we hit a bad one. Print it with \X instead of \x as an indication. */
90
91 if ((ptr[i] & 0xc0) != 0x80)
92 {
93 fprintf(f, "\\X{%x}", c);
94 return i - 1;
95 }
96
97 /* The byte is OK */
98
99 s -= 6;
100 c |= (ptr[i] & 0x3f) << s;
101 }
102 if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
103 return a;
104 }
105}
106
107
108
109
110/*************************************************
111* Find Unicode property name *
112*************************************************/
113
/* Map a Unicode property value to its name by searching the utt table.

Argument:  property   the property value to look up
Returns:   the matching name from utt, or "??" if there is no match or the
           code was compiled without SUPPORT_UCP
*/

static const char *
get_ucpname(int property)
{
#ifdef SUPPORT_UCP
int i;

/* Scan backwards from the LAST valid entry. The index must start at
(number of entries - 1); starting at the element count itself would read one
slot past the end of the table. */

for (i = (int)(sizeof(utt)/sizeof(ucp_type_table)) - 1; i >= 0; i--)
  {
  if (property == utt[i].value) break;
  }
return (i >= 0)? utt[i].name : "??";
#else
(void)property;    /* Unused without UCP support; avoid a compiler warning */
return "??";
#endif
}
128
129
130
131/*************************************************
132* Print compiled regex *
133*************************************************/
134
135/* Make this function work for a regex with integers either byte order.
136However, we assume that what we are passed is a compiled regex. */
137
138static void
139print_internals(pcre *external_re, FILE *f)
140{
141real_pcre *re = (real_pcre *)external_re;
142uschar *codestart, *code;
143BOOL utf8;
144
145unsigned int options = re->options;
146int offset = re->name_table_offset;
147int count = re->name_count;
148int size = re->name_entry_size;
149
150if (re->magic_number != MAGIC_NUMBER)
151 {
152 offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff);
153 count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff);
154 size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff);
155 options = ((options << 24) & 0xff000000) |
156 ((options << 8) & 0x00ff0000) |
157 ((options >> 8) & 0x0000ff00) |
158 ((options >> 24) & 0x000000ff);
159 }
160
161code = codestart = (uschar *)re + offset + count * size;
162utf8 = (options & PCRE_UTF8) != 0;
163
164for(;;)
165 {
166 uschar *ccode;
167 int c;
168 int extra = 0;
169
170 fprintf(f, "%3d ", (int)(code - codestart));
171
172 if (*code >= OP_BRA)
173 {
174 if (*code - OP_BRA > EXTRACT_BASIC_MAX)
175 fprintf(f, "%3d Bra extra\n", GET(code, 1));
176 else
177 fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA);
178 code += OP_lengths[OP_BRA];
179 continue;
180 }
181
182 switch(*code)
183 {
184 case OP_END:
185 fprintf(f, " %s\n", OP_names[*code]);
186 fprintf(f, "------------------------------------------------------------------\n");
187 return;
188
189 case OP_OPT:
190 fprintf(f, " %.2x %s", code[1], OP_names[*code]);
191 break;
192
193 case OP_CHAR:
194 {
195 fprintf(f, " ");
196 do
197 {
198 code++;
199 code += 1 + print_char(f, code, utf8);
200 }
201 while (*code == OP_CHAR);
202 fprintf(f, "\n");
203 continue;
204 }
205 break;
206
207 case OP_CHARNC:
208 {
209 fprintf(f, " NC ");
210 do
211 {
212 code++;
213 code += 1 + print_char(f, code, utf8);
214 }
215 while (*code == OP_CHARNC);
216 fprintf(f, "\n");
217 continue;
218 }
219 break;
220
221 case OP_KETRMAX:
222 case OP_KETRMIN:
223 case OP_ALT:
224 case OP_KET:
225 case OP_ASSERT:
226 case OP_ASSERT_NOT:
227 case OP_ASSERTBACK:
228 case OP_ASSERTBACK_NOT:
229 case OP_ONCE:
230 case OP_COND:
231 case OP_REVERSE:
232 fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
233 break;
234
235 case OP_BRANUMBER:
236 printf("%3d %s", GET2(code, 1), OP_names[*code]);
237 break;
238
239 case OP_CREF:
240 if (GET2(code, 1) == CREF_RECURSE)
241 fprintf(f, " Cond recurse");
242 else
243 fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
244 break;
245
246 case OP_STAR:
247 case OP_MINSTAR:
248 case OP_PLUS:
249 case OP_MINPLUS:
250 case OP_QUERY:
251 case OP_MINQUERY:
252 case OP_TYPESTAR:
253 case OP_TYPEMINSTAR:
254 case OP_TYPEPLUS:
255 case OP_TYPEMINPLUS:
256 case OP_TYPEQUERY:
257 case OP_TYPEMINQUERY:
258 fprintf(f, " ");
259 if (*code >= OP_TYPESTAR)
260 {
261 fprintf(f, "%s", OP_names[code[1]]);
262 if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
263 {
264 fprintf(f, " %s ", get_ucpname(code[2]));
265 extra = 1;
266 }
267 }
268 else extra = print_char(f, code+1, utf8);
269 fprintf(f, "%s", OP_names[*code]);
270 break;
271
272 case OP_EXACT:
273 case OP_UPTO:
274 case OP_MINUPTO:
275 fprintf(f, " ");
276 extra = print_char(f, code+3, utf8);
277 fprintf(f, "{");
278 if (*code != OP_EXACT) fprintf(f, ",");
279 fprintf(f, "%d}", GET2(code,1));
280 if (*code == OP_MINUPTO) fprintf(f, "?");
281 break;
282
283 case OP_TYPEEXACT:
284 case OP_TYPEUPTO:
285 case OP_TYPEMINUPTO:
286 fprintf(f, " %s", OP_names[code[3]]);
287 if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
288 {
289 fprintf(f, " %s ", get_ucpname(code[4]));
290 extra = 1;
291 }
292 fprintf(f, "{");
293 if (*code != OP_TYPEEXACT) fprintf(f, "0,");
294 fprintf(f, "%d}", GET2(code,1));
295 if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
296 break;
297
298 case OP_NOT:
299 if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
300 else fprintf(f, " [^\\x%02x]", c);
301 break;
302
303 case OP_NOTSTAR:
304 case OP_NOTMINSTAR:
305 case OP_NOTPLUS:
306 case OP_NOTMINPLUS:
307 case OP_NOTQUERY:
308 case OP_NOTMINQUERY:
309 if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
310 else fprintf(f, " [^\\x%02x]", c);
311 fprintf(f, "%s", OP_names[*code]);
312 break;
313
314 case OP_NOTEXACT:
315 case OP_NOTUPTO:
316 case OP_NOTMINUPTO:
317 if (isprint(c = code[3])) fprintf(f, " [^%c]{", c);
318 else fprintf(f, " [^\\x%02x]{", c);
319 if (*code != OP_NOTEXACT) fprintf(f, ",");
320 fprintf(f, "%d}", GET2(code,1));
321 if (*code == OP_NOTMINUPTO) fprintf(f, "?");
322 break;
323
324 case OP_RECURSE:
325 fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
326 break;
327
328 case OP_REF:
329 fprintf(f, " \\%d", GET2(code,1));
330 ccode = code + OP_lengths[*code];
331 goto CLASS_REF_REPEAT;
332
333 case OP_CALLOUT:
334 fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
335 GET(code, 2 + LINK_SIZE));
336 break;
337
338 case OP_PROP:
339 case OP_NOTPROP:
340 fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1]));
341 break;
342
343 /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in
344 having this code always here, and it makes it less messy without all those
345 #ifdefs. */
346
347 case OP_CLASS:
348 case OP_NCLASS:
349 case OP_XCLASS:
350 {
351 int i, min, max;
352 BOOL printmap;
353
354 fprintf(f, " [");
355
356 if (*code == OP_XCLASS)
357 {
358 extra = GET(code, 1);
359 ccode = code + LINK_SIZE + 1;
360 printmap = (*ccode & XCL_MAP) != 0;
361 if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^");
362 }
363 else
364 {
365 printmap = TRUE;
366 ccode = code + 1;
367 }
368
369 /* Print a bit map */
370
371 if (printmap)
372 {
373 for (i = 0; i < 256; i++)
374 {
375 if ((ccode[i/8] & (1 << (i&7))) != 0)
376 {
377 int j;
378 for (j = i+1; j < 256; j++)
379 if ((ccode[j/8] & (1 << (j&7))) == 0) break;
380 if (i == '-' || i == ']') fprintf(f, "\\");
381 if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i);
382 if (--j > i)
383 {
384 if (j != i + 1) fprintf(f, "-");
385 if (j == '-' || j == ']') fprintf(f, "\\");
386 if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j);
387 }
388 i = j;
389 }
390 }
391 ccode += 32;
392 }
393
394 /* For an XCLASS there is always some additional data */
395
396 if (*code == OP_XCLASS)
397 {
398 int ch;
399 while ((ch = *ccode++) != XCL_END)
400 {
401 if (ch == XCL_PROP)
402 {
403 fprintf(f, "\\p{%s}", get_ucpname(*ccode++));
404 }
405 else if (ch == XCL_NOTPROP)
406 {
407 fprintf(f, "\\P{%s}", get_ucpname(*ccode++));
408 }
409 else
410 {
411 ccode += 1 + print_char(f, ccode, TRUE);
412 if (ch == XCL_RANGE)
413 {
414 fprintf(f, "-");
415 ccode += 1 + print_char(f, ccode, TRUE);
416 }
417 }
418 }
419 }
420
421 /* Indicate a non-UTF8 class which was created by negation */
422
423 fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
424
425 /* Handle repeats after a class or a back reference */
426
427 CLASS_REF_REPEAT:
428 switch(*ccode)
429 {
430 case OP_CRSTAR:
431 case OP_CRMINSTAR:
432 case OP_CRPLUS:
433 case OP_CRMINPLUS:
434 case OP_CRQUERY:
435 case OP_CRMINQUERY:
436 fprintf(f, "%s", OP_names[*ccode]);
437 extra += OP_lengths[*ccode];
438 break;
439
440 case OP_CRRANGE:
441 case OP_CRMINRANGE:
442 min = GET2(ccode,1);
443 max = GET2(ccode,3);
444 if (max == 0) fprintf(f, "{%d,}", min);
445 else fprintf(f, "{%d,%d}", min, max);
446 if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
447 extra += OP_lengths[*ccode];
448 break;
449 }
450 }
451 break;
452
453 /* Anything else is just an item with no data*/
454
455 default:
456 fprintf(f, " %s", OP_names[*code]);
457 break;
458 }
459
460 code += OP_lengths[*code] + extra;
461 fprintf(f, "\n");
462 }
463}
464
465/* End of printint.c */