ee37ee9b |
1 | <?php |
d6c32258 |
2 | /** |
a5ab6455 |
3 | * functions/decode/utf-8.php - utf-8 decoding functions |
ee37ee9b |
4 | * |
6c84ba1e |
5 | * Copyright (c) 2003-2005 The SquirrelMail Project Team |
ee37ee9b |
6 | * Licensed under the GNU GPL. For full terms see the file COPYING. |
7 | * |
8 | * This file contains the utf-8 decoding function that is needed to read |
9 | * utf-8 encoded mails in a non-utf-8 locale. |
10 | * |
11 | * Every encoded multibyte character consists of n bytes. The first byte is |
12 | * octal 300-375; all other bytes are always octal 200-277. |
a5ab6455 |
13 | *<pre> |
14 | * Ranges (first byte): |
15 | * oct dec hex |
16 | * Two byte - 300-337 192-223 C0-DF |
17 | * Three byte - 340-357 224-239 E0-EF |
18 | * Four byte - 360-367 240-247 F0-F7 |
19 | * Five byte - 370-373 248-251 F8-FB |
20 | * Six byte - 374-375 252-253 FC-FD |
ee37ee9b |
21 | * |
a5ab6455 |
22 | * \a\b characters are decoded to html code calculated with formula: |
23 | * octdec(a-300)*64 + octdec(b-200) |
ee37ee9b |
24 | * |
a5ab6455 |
25 | * \a\b\c characters are decoded to html code calculated with formula: |
26 | * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200) |
91e0dccc |
27 | * |
a5ab6455 |
28 | * \a\b\c\d characters are decoded to html code calculated with formula: |
e50f5ac2 |
29 | * octdec(a-360)*64^3 + octdec(b-200)*64^2 + |
a5ab6455 |
30 | * + octdec(c-200)*64 + octdec(d-200) |
31 | * |
32 | * \a\b\c\d\e characters are decoded to html code calculated with formula: |
e50f5ac2 |
33 | * octdec(a-370)*64^4 + octdec(b-200)*64^3 + |
a5ab6455 |
34 | * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200) |
35 | * |
36 | * \a\b\c\d\e\f characters are decoded to html code calculated with formula: |
e50f5ac2 |
37 | * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3 + |
a5ab6455 |
38 | * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200) |
39 | *</pre> |
31841a9e |
40 | * @version $Id$ |
d6c32258 |
41 | * @package squirrelmail |
42 | * @subpackage decode |
43 | */ |
44 | |
/**
 * Decode utf-8 strings
 *
 * Converts every two, three and four byte utf-8 sequence found in
 * $string into a decimal html numeric character reference (&#NNN;).
 * Any byte with the high bit set that is left over afterwards (stray
 * continuation bytes, truncated sequences, five and six byte forms)
 * is replaced with '?'.
 *
 * The string is returned untouched when the interface language is
 * ja_JP or when sq_is8bit() reports that it contains no 8bit symbols.
 *
 * @param string $string Encoded string
 * @return string Decoded string
 */
function charset_decode_utf_8 ($string) {
    global $squirrelmail_language;

    // Japanese translation uses mbstring function to read utf-8
    if ($squirrelmail_language === 'ja_JP')
        return $string;

    // don't do decoding when there are no 8bit symbols
    if (! sq_is8bit($string,'utf-8'))
        return $string;

    // The decoding used to rely on the /e (eval) pattern modifier, which
    // was deprecated in PHP 5.5 and removed in PHP 7.0 (preg_replace()
    // then returns NULL and the whole string is lost). A callback gives
    // the same result on every PHP version.
    //
    // Longest sequences are handled first. Five and six byte sequences
    // are not decoded on purpose: they are not valid utf-8 any more and
    // are turned into '?' by the cleanup step below.

    // decode four byte unicode characters
    $string = preg_replace_callback("/[\360-\367][\200-\277][\200-\277][\200-\277]/",
                                    '_charset_decode_utf_8_entity',
                                    $string);

    // decode three byte unicode characters
    $string = preg_replace_callback("/[\340-\357][\200-\277][\200-\277]/",
                                    '_charset_decode_utf_8_entity',
                                    $string);

    // decode two byte unicode characters
    $string = preg_replace_callback("/[\300-\337][\200-\277]/",
                                    '_charset_decode_utf_8_entity',
                                    $string);

    // remove broken unicode
    $string = preg_replace("/[\200-\237]|\240|[\241-\377]/",'?',$string);

    return $string;
}

/**
 * Callback for charset_decode_utf_8()
 *
 * Turns one matched utf-8 byte sequence into a decimal html numeric
 * character reference. The caller's patterns guarantee that the match
 * is a lead byte followed by continuation bytes (octal 200-277).
 *
 * @param array $matches preg_replace_callback() match array; element 0
 *                       holds the complete byte sequence
 * @return string html numeric character reference (&#NNN;)
 * @access private
 */
function _charset_decode_utf_8_entity ($matches) {
    $sequence = $matches[0];
    $length = strlen($sequence);

    // The lead byte of an n byte sequence consists of n one bits, a zero
    // bit and 7-n payload bits. Masking is the same as subtracting
    // 192, 224 or 240 for two, three and four byte sequences.
    $code = ord($sequence[0]) & (0xFF >> ($length + 1));

    // Every continuation byte contributes its low six bits.
    for ($i = 1; $i < $length; $i++) {
        $code = $code * 64 + (ord($sequence[$i]) & 0x3F);
    }

    return '&#' . $code . ';';
}
e53c9681 |
95 | ?> |