Commit | Line | Data |
---|---|---|
47db1125 | 1 | /* $Cambridge: exim/src/src/pcre/pcre_tables.c,v 1.6 2007/11/12 13:02:20 nm4 Exp $ */ |
8ac170f3 PH |
2 | |
3 | /************************************************* | |
4 | * Perl-Compatible Regular Expressions * | |
5 | *************************************************/ | |
6 | ||
7 | /* PCRE is a library of functions to support regular expressions whose syntax | |
8 | and semantics are as close as possible to those of the Perl 5 language. | |
9 | ||
10 | Written by Philip Hazel | |
64f2600a | 11 | Copyright (c) 1997-2007 University of Cambridge |
8ac170f3 PH |
12 | |
13 | ----------------------------------------------------------------------------- | |
14 | Redistribution and use in source and binary forms, with or without | |
15 | modification, are permitted provided that the following conditions are met: | |
16 | ||
17 | * Redistributions of source code must retain the above copyright notice, | |
18 | this list of conditions and the following disclaimer. | |
19 | ||
20 | * Redistributions in binary form must reproduce the above copyright | |
21 | notice, this list of conditions and the following disclaimer in the | |
22 | documentation and/or other materials provided with the distribution. | |
23 | ||
24 | * Neither the name of the University of Cambridge nor the names of its | |
25 | contributors may be used to endorse or promote products derived from | |
26 | this software without specific prior written permission. | |
27 | ||
28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
29 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
30 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
31 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
32 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
33 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
35 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
36 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
37 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
38 | POSSIBILITY OF SUCH DAMAGE. | |
39 | ----------------------------------------------------------------------------- | |
40 | */ | |
41 | ||
42 | ||
43 | /* This module contains some fixed tables that are used by more than one of the | |
aa41d2de PH |
44 | PCRE code modules. The tables are also #included by the pcretest program, which |
45 | uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name | |
46 | clashes with the library. */ | |
8ac170f3 PH |
47 | |
48 | ||
47db1125 NM |
49 | #ifdef HAVE_CONFIG_H |
50 | #include "config.h" | |
51 | #endif | |
52 | ||
8ac170f3 PH |
53 | #include "pcre_internal.h" |
54 | ||
55 | ||
56 | /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that | |
aa41d2de | 57 | the definition is next to the definition of the opcodes in pcre_internal.h. */ |
8ac170f3 PH |
58 | |
59 | const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; /* the whole initializer list is the OP_LENGTHS macro, kept beside the opcode definitions in pcre_internal.h */ | |
60 | ||
61 | ||
62 | ||
63 | /************************************************* | |
64 | * Tables for UTF-8 support * | |
65 | *************************************************/ | |
66 | ||
67 | /* These are the breakpoints for different numbers of bytes in a UTF-8 | |
68 | character. */ | |
69 | ||
64f2600a PH |
70 | #ifdef SUPPORT_UTF8 |
71 | ||
8ac170f3 PH |
72 | const int _pcre_utf8_table1[] = /* entry i is the largest character value that fits in a UTF-8 sequence of i+1 bytes */ |
73 | { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; | |
74 | ||
75 | const int _pcre_utf8_table1_size = sizeof(_pcre_utf8_table1)/sizeof(int); /* element count of _pcre_utf8_table1 (6 with the initializer as written) */ | |
76 | ||
77 | /* These are the indicator bits and the mask for the data bits to set in the | |
78 | first byte of a character, indexed by the number of additional bytes. */ | |
79 | ||
80 | const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; /* indicator bits for the first byte; index = number of additional bytes (0..5) */ | |
81 | const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; /* mask for the data bits of the first byte; index = number of additional bytes (0..5) */ | |
82 | ||
6bf342e1 PH |
83 | /* Table of the number of extra bytes, indexed by the first byte masked with |
84 | 0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ | |
8ac170f3 PH |
85 | |
86 | const uschar _pcre_utf8_table4[] = { /* 64 entries: one for each possible value of (first byte & 0x3f) */ | |
87 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* index 0x00-0x0f: 1 extra byte */ | |
88 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* index 0x10-0x1f: 1 extra byte */ | |
89 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* index 0x20-0x2f: 2 extra bytes */ | |
90 | 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; /* index 0x30-0x37: 3, 0x38-0x3b: 4, 0x3c-0x3f: 5 extra bytes */ | |
91 | ||
47db1125 NM |
92 | /* The pcre_utt[] table below translates Unicode property names into type and |
93 | code values. It is searched by binary chop, so must be in collating sequence of | |
94 | name. Originally, the table contained pointers to the name strings in the first | |
95 | field of each entry. However, that leads to a large number of relocations when | |
96 | a shared library is dynamically loaded. A significant reduction is made by | |
97 | putting all the names into a single, large string and then using offsets in the | |
98 | table itself. Maintenance is more error-prone, but frequent changes to this | |
99 | data are unlikely. */ | |
100 | ||
101 | const char _pcre_utt_names[] = /* all property names in one string, each terminated by \0; _pcre_utt[] refers to a name by its byte offset here, so any edit to a name shifts every later offset */ | |
102 | "Any\0" | |
103 | "Arabic\0" | |
104 | "Armenian\0" | |
105 | "Balinese\0" | |
106 | "Bengali\0" | |
107 | "Bopomofo\0" | |
108 | "Braille\0" | |
109 | "Buginese\0" | |
110 | "Buhid\0" | |
111 | "C\0" | |
112 | "Canadian_Aboriginal\0" | |
113 | "Cc\0" | |
114 | "Cf\0" | |
115 | "Cherokee\0" | |
116 | "Cn\0" | |
117 | "Co\0" | |
118 | "Common\0" | |
119 | "Coptic\0" | |
120 | "Cs\0" | |
121 | "Cuneiform\0" | |
122 | "Cypriot\0" | |
123 | "Cyrillic\0" | |
124 | "Deseret\0" | |
125 | "Devanagari\0" | |
126 | "Ethiopic\0" | |
127 | "Georgian\0" | |
128 | "Glagolitic\0" | |
129 | "Gothic\0" | |
130 | "Greek\0" | |
131 | "Gujarati\0" | |
132 | "Gurmukhi\0" | |
133 | "Han\0" | |
134 | "Hangul\0" | |
135 | "Hanunoo\0" | |
136 | "Hebrew\0" | |
137 | "Hiragana\0" | |
138 | "Inherited\0" | |
139 | "Kannada\0" | |
140 | "Katakana\0" | |
141 | "Kharoshthi\0" | |
142 | "Khmer\0" | |
143 | "L\0" | |
144 | "L&\0" | |
145 | "Lao\0" | |
146 | "Latin\0" | |
147 | "Limbu\0" | |
148 | "Linear_B\0" | |
149 | "Ll\0" | |
150 | "Lm\0" | |
151 | "Lo\0" | |
152 | "Lt\0" | |
153 | "Lu\0" | |
154 | "M\0" | |
155 | "Malayalam\0" | |
156 | "Mc\0" | |
157 | "Me\0" | |
158 | "Mn\0" | |
159 | "Mongolian\0" | |
160 | "Myanmar\0" | |
161 | "N\0" | |
162 | "Nd\0" | |
163 | "New_Tai_Lue\0" | |
164 | "Nko\0" | |
165 | "Nl\0" | |
166 | "No\0" | |
167 | "Ogham\0" | |
168 | "Old_Italic\0" | |
169 | "Old_Persian\0" | |
170 | "Oriya\0" | |
171 | "Osmanya\0" | |
172 | "P\0" | |
173 | "Pc\0" | |
174 | "Pd\0" | |
175 | "Pe\0" | |
176 | "Pf\0" | |
177 | "Phags_Pa\0" | |
178 | "Phoenician\0" | |
179 | "Pi\0" | |
180 | "Po\0" | |
181 | "Ps\0" | |
182 | "Runic\0" | |
183 | "S\0" | |
184 | "Sc\0" | |
185 | "Shavian\0" | |
186 | "Sinhala\0" | |
187 | "Sk\0" | |
188 | "Sm\0" | |
189 | "So\0" | |
190 | "Syloti_Nagri\0" | |
191 | "Syriac\0" | |
192 | "Tagalog\0" | |
193 | "Tagbanwa\0" | |
194 | "Tai_Le\0" | |
195 | "Tamil\0" | |
196 | "Telugu\0" | |
197 | "Thaana\0" | |
198 | "Thai\0" | |
199 | "Tibetan\0" | |
200 | "Tifinagh\0" | |
201 | "Ugaritic\0" | |
202 | "Yi\0" | |
203 | "Z\0" | |
204 | "Zl\0" | |
205 | "Zp\0" | |
206 | "Zs\0"; | |
8ac170f3 PH |
207 | |
208 | const ucp_type_table _pcre_utt[] = { /* each entry, in order: byte offset of the name within _pcre_utt_names, property type (PT_xxx), code value; entries must stay in collating order of name because the table is searched by binary chop */ | |
47db1125 NM |
209 | { 0, PT_ANY, 0 }, |
210 | { 4, PT_SC, ucp_Arabic }, | |
211 | { 11, PT_SC, ucp_Armenian }, | |
212 | { 20, PT_SC, ucp_Balinese }, | |
213 | { 29, PT_SC, ucp_Bengali }, | |
214 | { 37, PT_SC, ucp_Bopomofo }, | |
215 | { 46, PT_SC, ucp_Braille }, | |
216 | { 54, PT_SC, ucp_Buginese }, | |
217 | { 63, PT_SC, ucp_Buhid }, | |
218 | { 69, PT_GC, ucp_C }, | |
219 | { 71, PT_SC, ucp_Canadian_Aboriginal }, | |
220 | { 91, PT_PC, ucp_Cc }, | |
221 | { 94, PT_PC, ucp_Cf }, | |
222 | { 97, PT_SC, ucp_Cherokee }, | |
223 | { 106, PT_PC, ucp_Cn }, | |
224 | { 109, PT_PC, ucp_Co }, | |
225 | { 112, PT_SC, ucp_Common }, | |
226 | { 119, PT_SC, ucp_Coptic }, | |
227 | { 126, PT_PC, ucp_Cs }, | |
228 | { 129, PT_SC, ucp_Cuneiform }, | |
229 | { 139, PT_SC, ucp_Cypriot }, | |
230 | { 147, PT_SC, ucp_Cyrillic }, | |
231 | { 156, PT_SC, ucp_Deseret }, | |
232 | { 164, PT_SC, ucp_Devanagari }, | |
233 | { 175, PT_SC, ucp_Ethiopic }, | |
234 | { 184, PT_SC, ucp_Georgian }, | |
235 | { 193, PT_SC, ucp_Glagolitic }, | |
236 | { 204, PT_SC, ucp_Gothic }, | |
237 | { 211, PT_SC, ucp_Greek }, | |
238 | { 217, PT_SC, ucp_Gujarati }, | |
239 | { 226, PT_SC, ucp_Gurmukhi }, | |
240 | { 235, PT_SC, ucp_Han }, | |
241 | { 239, PT_SC, ucp_Hangul }, | |
242 | { 246, PT_SC, ucp_Hanunoo }, | |
243 | { 254, PT_SC, ucp_Hebrew }, | |
244 | { 261, PT_SC, ucp_Hiragana }, | |
245 | { 270, PT_SC, ucp_Inherited }, | |
246 | { 280, PT_SC, ucp_Kannada }, | |
247 | { 288, PT_SC, ucp_Katakana }, | |
248 | { 297, PT_SC, ucp_Kharoshthi }, | |
249 | { 308, PT_SC, ucp_Khmer }, | |
250 | { 314, PT_GC, ucp_L }, | |
251 | { 316, PT_LAMP, 0 }, /* offset 316 is the name "L&"; this entry carries no ucp_ constant, its value field is 0 */ | |
252 | { 319, PT_SC, ucp_Lao }, | |
253 | { 323, PT_SC, ucp_Latin }, | |
254 | { 329, PT_SC, ucp_Limbu }, | |
255 | { 335, PT_SC, ucp_Linear_B }, | |
256 | { 344, PT_PC, ucp_Ll }, | |
257 | { 347, PT_PC, ucp_Lm }, | |
258 | { 350, PT_PC, ucp_Lo }, | |
259 | { 353, PT_PC, ucp_Lt }, | |
260 | { 356, PT_PC, ucp_Lu }, | |
261 | { 359, PT_GC, ucp_M }, | |
262 | { 361, PT_SC, ucp_Malayalam }, | |
263 | { 371, PT_PC, ucp_Mc }, | |
264 | { 374, PT_PC, ucp_Me }, | |
265 | { 377, PT_PC, ucp_Mn }, | |
266 | { 380, PT_SC, ucp_Mongolian }, | |
267 | { 390, PT_SC, ucp_Myanmar }, | |
268 | { 398, PT_GC, ucp_N }, | |
269 | { 400, PT_PC, ucp_Nd }, | |
270 | { 403, PT_SC, ucp_New_Tai_Lue }, | |
271 | { 415, PT_SC, ucp_Nko }, | |
272 | { 419, PT_PC, ucp_Nl }, | |
273 | { 422, PT_PC, ucp_No }, | |
274 | { 425, PT_SC, ucp_Ogham }, | |
275 | { 431, PT_SC, ucp_Old_Italic }, | |
276 | { 442, PT_SC, ucp_Old_Persian }, | |
277 | { 454, PT_SC, ucp_Oriya }, | |
278 | { 460, PT_SC, ucp_Osmanya }, | |
279 | { 468, PT_GC, ucp_P }, | |
280 | { 470, PT_PC, ucp_Pc }, | |
281 | { 473, PT_PC, ucp_Pd }, | |
282 | { 476, PT_PC, ucp_Pe }, | |
283 | { 479, PT_PC, ucp_Pf }, | |
284 | { 482, PT_SC, ucp_Phags_Pa }, | |
285 | { 491, PT_SC, ucp_Phoenician }, | |
286 | { 502, PT_PC, ucp_Pi }, | |
287 | { 505, PT_PC, ucp_Po }, | |
288 | { 508, PT_PC, ucp_Ps }, | |
289 | { 511, PT_SC, ucp_Runic }, | |
290 | { 517, PT_GC, ucp_S }, | |
291 | { 519, PT_PC, ucp_Sc }, | |
292 | { 522, PT_SC, ucp_Shavian }, | |
293 | { 530, PT_SC, ucp_Sinhala }, | |
294 | { 538, PT_PC, ucp_Sk }, | |
295 | { 541, PT_PC, ucp_Sm }, | |
296 | { 544, PT_PC, ucp_So }, | |
297 | { 547, PT_SC, ucp_Syloti_Nagri }, | |
298 | { 560, PT_SC, ucp_Syriac }, | |
299 | { 567, PT_SC, ucp_Tagalog }, | |
300 | { 575, PT_SC, ucp_Tagbanwa }, | |
301 | { 584, PT_SC, ucp_Tai_Le }, | |
302 | { 591, PT_SC, ucp_Tamil }, | |
303 | { 597, PT_SC, ucp_Telugu }, | |
304 | { 604, PT_SC, ucp_Thaana }, | |
305 | { 611, PT_SC, ucp_Thai }, | |
306 | { 616, PT_SC, ucp_Tibetan }, | |
307 | { 624, PT_SC, ucp_Tifinagh }, | |
308 | { 633, PT_SC, ucp_Ugaritic }, | |
309 | { 642, PT_SC, ucp_Yi }, | |
310 | { 645, PT_GC, ucp_Z }, | |
311 | { 647, PT_PC, ucp_Zl }, | |
312 | { 650, PT_PC, ucp_Zp }, | |
313 | { 653, PT_PC, ucp_Zs } | |
8ac170f3 PH |
314 | }; |
315 | ||
316 | const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); /* number of entries in _pcre_utt */ | |
317 | ||
64f2600a PH |
318 | #endif /* SUPPORT_UTF8 */ |
319 | ||
8ac170f3 | 320 | /* End of pcre_tables.c */ |