fsf changes, meant to be rebased on upstream
[squirrelmail.git] / functions / decode / utf_8.php
1 <?php
2
3 /**
4 * functions/decode/utf_8.php - utf-8 decoding functions
5 *
6 * This file contains the utf-8 decoding function that is needed to read
7 * utf-8 encoded mails in a non-utf-8 locale.
8 *
9 * Every encoded multi-byte character consists of n bytes. The first byte
10 * is octal 300-375; all following bytes are always octal 200-277.
11 *<pre>
12 * Ranges (first byte):
13 * oct dec hex
14 * Two byte - 300-337 192-223 C0-DF
15 * Three byte - 340-357 224-239 E0-EF
16 * Four byte - 360-367 240-247 F0-F7
17 * Five byte - 370-373 248-251 F8-FB
18 * Six byte - 374-375 252-253 FC-FD
19 *
20 * \a\b characters are decoded to html code calculated with formula:
21 * octdec(a-300)*64 + octdec(b-200)
22 *
23 * \a\b\c characters are decoded to html code calculated with formula:
24 * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200)
25 *
26 * \a\b\c\d characters are decoded to html code calculated with formula:
27 * octdec(a-360)*64^3 + octdec(b-200)*64^2 +
28 * + octdec(c-200)*64 + octdec(d-200)
29 *
30 * \a\b\c\d\e characters are decoded to html code calculated with formula:
31 * octdec(a-370)*64^4 + octdec(b-200)*64^3 +
32 * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200)
33 *
34 * \a\b\c\d\e\f characters are decoded to html code calculated with formula:
35 * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3 +
36 * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200)
37 *</pre>
38 * @copyright 2003-2021 The SquirrelMail Project Team
39 * @license http://opensource.org/licenses/gpl-license.php GNU Public License
40 * @version $Id$
41 * @package squirrelmail
42 * @subpackage decode
43 */
44
/**
 * Decode utf-8 strings
 *
 * Converts every valid two, three and four byte utf-8 sequence into a
 * decimal html numeric character reference (&#NNN;) and then replaces
 * every remaining 8bit byte with '?'.
 *
 * Sequences that have the right byte layout but are not valid utf-8 are
 * treated as broken input and end up as '?' characters (one per byte):
 *  - overlong forms (a code point stored in more bytes than needed),
 *  - utf-16 surrogates (U+D800 - U+DFFF),
 *  - code points above U+10FFFF.
 * Five and six byte forms (first byte octal 370-375) are never decoded and
 * are replaced with '?' as well.
 *
 * The string is returned unchanged when the interface language is ja_JP
 * or when sq_is8bit() reports that it has no 8bit symbols.
 *
 * @param string $string Encoded string
 * @return string Decoded string
 */
function charset_decode_utf_8 ($string) {
    global $squirrelmail_language;

    // Japanese translation uses mbstring function to read utf-8
    if ($squirrelmail_language === 'ja_JP') {
        return $string;
    }

    // don't do decoding when there are no 8bit symbols
    if (! sq_is8bit($string, 'utf-8')) {
        return $string;
    }

    // Decode two, three and four byte sequences in a single pass. Every
    // alternative starts with a lead byte that cannot be a continuation
    // byte, so matches never overlap.
    //
    // Anonymous function syntax is a parse error before PHP 5.3, so this
    // file already requires PHP 5.3 and no create_function() fallback is
    // kept.
    $string = preg_replace_callback(
        "/[\300-\337][\200-\277]|[\340-\357][\200-\277]{2}|[\360-\367][\200-\277]{3}/",
        function ($matches) {
            $sequence = $matches[0];
            $length = strlen($sequence);

            // Payload bits of the lead byte and the smallest code point
            // that legitimately needs a sequence of this length.
            if ($length === 2) {
                $code = ord($sequence[0]) - 192;
                $minimum = 0x80;
            } elseif ($length === 3) {
                $code = ord($sequence[0]) - 224;
                $minimum = 0x800;
            } else {
                $code = ord($sequence[0]) - 240;
                $minimum = 0x10000;
            }

            // Every continuation byte contributes six more bits.
            for ($i = 1; $i < $length; $i++) {
                $code = $code * 64 + (ord($sequence[$i]) - 128);
            }

            // Leave invalid sequences untouched; the cleanup step below
            // turns their bytes into '?'.
            if ($code < $minimum
                || $code > 0x10FFFF
                || ($code >= 0xD800 && $code <= 0xDFFF)) {
                return $sequence;
            }

            return '&#' . $code . ';';
        },
        $string
    );

    // remove broken unicode (any 8bit byte that was not decoded above)
    $string = preg_replace("/[\200-\377]/", '?', $string);

    return $string;
}