| 1 | /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */ |
| 2 | |
| 3 | /************************************************* |
| 4 | * Perl-Compatible Regular Expressions * |
| 5 | *************************************************/ |
| 6 | |
| 7 | /* PCRE is a library of functions to support regular expressions whose syntax |
| 8 | and semantics are as close as possible to those of the Perl 5 language. |
| 9 | |
| 10 | Written by Philip Hazel |
| 11 | Copyright (c) 1997-2005 University of Cambridge |
| 12 | |
| 13 | ----------------------------------------------------------------------------- |
| 14 | Redistribution and use in source and binary forms, with or without |
| 15 | modification, are permitted provided that the following conditions are met: |
| 16 | |
| 17 | * Redistributions of source code must retain the above copyright notice, |
| 18 | this list of conditions and the following disclaimer. |
| 19 | |
| 20 | * Redistributions in binary form must reproduce the above copyright |
| 21 | notice, this list of conditions and the following disclaimer in the |
| 22 | documentation and/or other materials provided with the distribution. |
| 23 | |
| 24 | * Neither the name of the University of Cambridge nor the names of its |
| 25 | contributors may be used to endorse or promote products derived from |
| 26 | this software without specific prior written permission. |
| 27 | |
| 28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 29 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 30 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 31 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 32 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 33 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 35 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 36 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 37 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 38 | POSSIBILITY OF SUCH DAMAGE. |
| 39 | ----------------------------------------------------------------------------- |
| 40 | */ |
| 41 | |
| 42 | |
| 43 | /* This module contains the external function pcre_compile(), along with |
| 44 | supporting internal functions that are not used by other modules. */ |
| 45 | |
| 46 | |
| 47 | #include "pcre_internal.h" |
| 48 | |
| 49 | |
| 50 | /************************************************* |
| 51 | * Code parameters and static tables * |
| 52 | *************************************************/ |
| 53 | |
| 54 | /* Maximum number of items on the nested bracket stacks at compile time. This |
| 55 | applies to the nesting of all kinds of parentheses. It does not limit |
| 56 | un-nested, non-capturing parentheses. This number can be made bigger if |
| 57 | necessary - it is used to dimension one int and one unsigned char vector at |
| 58 | compile time. */ |
| 59 | |
| 60 | #define BRASTACK_SIZE 200 |
| 61 | |
| 62 | |
| 63 | /* Table for handling escaped characters in the range '0'-'z'. Positive returns |
| 64 | are simple data values; negative values are for special things like \d and so |
| 65 | on. Zero means further processing is needed (for things like \x), or the escape |
| 66 | is invalid. */ |
| 67 | |
| 68 | #if !EBCDIC /* This is the "normal" table for ASCII systems */ |
| 69 | static const short int escapes[] = { |
| 70 | 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ |
| 71 | 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ |
| 72 | '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ |
| 73 | 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ |
| 74 | -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ |
| 75 | -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ |
| 76 | '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ |
| 77 | 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */ |
| 78 | -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ |
| 79 | 0, 0, -ESC_z /* x - z */ |
| 80 | }; |
| 81 | |
| 82 | #else /* This is the "abnormal" table for EBCDIC systems */ |
| 83 | static const short int escapes[] = { |
| 84 | /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', |
| 85 | /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, |
| 86 | /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', |
| 87 | /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, |
| 88 | /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', |
| 89 | /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| 90 | /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', |
| 91 | /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, |
| 92 | /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, |
| 93 | /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p, |
| 94 | /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, |
| 95 | /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, |
| 96 | /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, |
| 97 | /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| 98 | /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', |
| 99 | /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, |
| 100 | /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| 101 | /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, |
| 102 | /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0, |
| 103 | /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, |
| 104 | /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, |
| 105 | /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| 106 | /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 |
| 107 | }; |
| 108 | #endif |
| 109 | |
| 110 | |
| 111 | /* Tables of names of POSIX character classes and their lengths. The list is |
| 112 | terminated by a zero length entry. The first three must be alpha, upper, lower, |
| 113 | as this is assumed for handling case independence. */ |
| 114 | |
| 115 | static const char *const posix_names[] = { |
| 116 | "alpha", "lower", "upper", |
| 117 | "alnum", "ascii", "blank", "cntrl", "digit", "graph", |
| 118 | "print", "punct", "space", "word", "xdigit" }; |
| 119 | |
| 120 | static const uschar posix_name_lengths[] = { |
| 121 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; |
| 122 | |
| 123 | /* Table of class bit maps for each POSIX class; up to three may be combined |
| 124 | to form the class. The table for [:blank:] is dynamically modified to remove |
| 125 | the vertical space characters. */ |
| 126 | |
| 127 | static const int posix_class_maps[] = { |
| 128 | cbit_lower, cbit_upper, -1, /* alpha */ |
| 129 | cbit_lower, -1, -1, /* lower */ |
| 130 | cbit_upper, -1, -1, /* upper */ |
| 131 | cbit_digit, cbit_lower, cbit_upper, /* alnum */ |
| 132 | cbit_print, cbit_cntrl, -1, /* ascii */ |
| 133 | cbit_space, -1, -1, /* blank - a GNU extension */ |
| 134 | cbit_cntrl, -1, -1, /* cntrl */ |
| 135 | cbit_digit, -1, -1, /* digit */ |
| 136 | cbit_graph, -1, -1, /* graph */ |
| 137 | cbit_print, -1, -1, /* print */ |
| 138 | cbit_punct, -1, -1, /* punct */ |
| 139 | cbit_space, -1, -1, /* space */ |
| 140 | cbit_word, -1, -1, /* word - a Perl extension */ |
| 141 | cbit_xdigit,-1, -1 /* xdigit */ |
| 142 | }; |
| 143 | |
| 144 | |
| 145 | /* The texts of compile-time error messages. These are "char *" because they |
| 146 | are passed to the outside world. */ |
| 147 | |
| 148 | static const char *error_texts[] = { |
| 149 | "no error", |
| 150 | "\\ at end of pattern", |
| 151 | "\\c at end of pattern", |
| 152 | "unrecognized character follows \\", |
| 153 | "numbers out of order in {} quantifier", |
| 154 | /* 5 */ |
| 155 | "number too big in {} quantifier", |
| 156 | "missing terminating ] for character class", |
| 157 | "invalid escape sequence in character class", |
| 158 | "range out of order in character class", |
| 159 | "nothing to repeat", |
| 160 | /* 10 */ |
| 161 | "operand of unlimited repeat could match the empty string", |
| 162 | "internal error: unexpected repeat", |
| 163 | "unrecognized character after (?", |
| 164 | "POSIX named classes are supported only within a class", |
| 165 | "missing )", |
| 166 | /* 15 */ |
| 167 | "reference to non-existent subpattern", |
| 168 | "erroffset passed as NULL", |
| 169 | "unknown option bit(s) set", |
| 170 | "missing ) after comment", |
| 171 | "parentheses nested too deeply", |
| 172 | /* 20 */ |
| 173 | "regular expression too large", |
| 174 | "failed to get memory", |
| 175 | "unmatched parentheses", |
| 176 | "internal error: code overflow", |
| 177 | "unrecognized character after (?<", |
| 178 | /* 25 */ |
| 179 | "lookbehind assertion is not fixed length", |
| 180 | "malformed number after (?(", |
| 181 | "conditional group contains more than two branches", |
| 182 | "assertion expected after (?(", |
| 183 | "(?R or (?digits must be followed by )", |
| 184 | /* 30 */ |
| 185 | "unknown POSIX class name", |
| 186 | "POSIX collating elements are not supported", |
| 187 | "this version of PCRE is not compiled with PCRE_UTF8 support", |
| 188 | "spare error", |
| 189 | "character value in \\x{...} sequence is too large", |
| 190 | /* 35 */ |
| 191 | "invalid condition (?(0)", |
| 192 | "\\C not allowed in lookbehind assertion", |
| 193 | "PCRE does not support \\L, \\l, \\N, \\U, or \\u", |
| 194 | "number after (?C is > 255", |
| 195 | "closing ) for (?C expected", |
| 196 | /* 40 */ |
| 197 | "recursive call could loop indefinitely", |
| 198 | "unrecognized character after (?P", |
| 199 | "syntax error after (?P", |
| 200 | "two named groups have the same name", |
| 201 | "invalid UTF-8 string", |
| 202 | /* 45 */ |
| 203 | "support for \\P, \\p, and \\X has not been compiled", |
| 204 | "malformed \\P or \\p sequence", |
| 205 | "unknown property name after \\P or \\p" |
| 206 | }; |
| 207 | |
| 208 | |
| 209 | /* Table to identify digits and hex digits. This is used when compiling |
| 210 | patterns. Note that the tables in chartables are dependent on the locale, and |
| 211 | may mark arbitrary characters as digits - but the PCRE compiling code expects |
| 212 | to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have |
| 213 | a private table here. It costs 256 bytes, but it is a lot faster than doing |
| 214 | character value tests (at least in some simple cases I timed), and in some |
| 215 | applications one wants PCRE to compile efficiently as well as match |
| 216 | efficiently. |
| 217 | |
| 218 | For convenience, we use the same bit definitions as in chartables: |
| 219 | |
| 220 | 0x04 decimal digit |
| 221 | 0x08 hexadecimal digit |
| 222 | |
| 223 | Then we can use ctype_digit and ctype_xdigit in the code. */ |
| 224 | |
| 225 | #if !EBCDIC /* This is the "normal" case, for ASCII systems */ |
| 226 | static const unsigned char digitab[] = |
| 227 | { |
| 228 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ |
| 229 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ |
| 230 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ |
| 231 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
| 232 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ |
| 233 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ |
| 234 | 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ |
| 235 | 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ |
| 236 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ |
| 237 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ |
| 238 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ |
| 239 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ |
| 240 | 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ |
| 241 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ |
| 242 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ |
| 243 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ |
| 244 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ |
| 245 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ |
| 246 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ |
| 247 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ |
| 248 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ |
| 249 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ |
| 250 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ |
| 251 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ |
| 252 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ |
| 253 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ |
| 254 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ |
| 255 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ |
| 256 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ |
| 257 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ |
| 258 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ |
| 259 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ |
| 260 | |
| 261 | #else /* This is the "abnormal" case, for EBCDIC systems */ |
| 262 | static const unsigned char digitab[] = |
| 263 | { |
| 264 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ |
| 265 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ |
| 266 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ |
| 267 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
| 268 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ |
| 269 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ |
| 270 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ |
| 271 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ |
| 272 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ |
| 273 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ |
| 274 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ |
| 275 |