c6b1e61584d1d0c95ce3cca8a9449ced2a1a6e91
[squirrelmail.git] / functions / decode / utf_8.php
1 <?php
2 /**
3 * functions/decode/utf_8.php - utf-8 decoding functions
4 *
5 * Copyright (c) 2003-2004 The SquirrelMail Project Team
6 * Licensed under the GNU GPL. For full terms see the file COPYING.
7 *
8 * This file contains the utf-8 decoding function that is needed to read
9 * utf-8 encoded mail in a non-utf-8 locale.
10 *
11 * Every encoded multi-byte character consists of n bytes. The first byte
12 * is octal 300-375; the remaining bytes are always octal 200-277.
13 *<pre>
14 * Ranges (first byte):
15 * oct dec hex
16 * Two byte - 300-337 192-223 C0-DF
17 * Three byte - 340-357 224-239 E0-EF
18 * Four byte - 360-367 240-247 F0-F7
19 * Five byte - 370-373 248-251 F8-FB
20 * Six byte - 374-375 252-253 FC-FD
21 *
22 * \a\b characters are decoded to html code calculated with formula:
23 * octdec(a-300)*64 + octdec(b-200)
24 *
25 * \a\b\c characters are decoded to html code calculated with formula:
26 * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200)
27 *
28 * \a\b\c\d characters are decoded to html code calculated with formula:
29 * octdec(a-360)*64^3 + octdec(b-200)*64^2 +
30 * + octdec(c-200)*64 + octdec(d-200)
31 *
32 * \a\b\c\d\e characters are decoded to html code calculated with formula:
33 * octdec(a-370)*64^4 + octdec(b-200)*64^3 +
34 * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200)
35 *
36 * \a\b\c\d\e\f characters are decoded to html code calculated with formula:
37 * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3 +
38 * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200)
39 *</pre>
40 * @version $Id$
41 * @package squirrelmail
42 * @subpackage decode
43 */
44
/**
 * Decode utf-8 strings
 *
 * Converts every well-formed two, three and four byte utf-8 sequence in
 * the string to a decimal html numeric character reference (&#NNN;).
 * Any 8bit byte that is left over afterwards (stray continuation bytes,
 * truncated sequences, five and six byte lead bytes) is replaced with '?'.
 * 7bit (us-ascii) bytes are never modified.
 *
 * The string is returned untouched when the interface language is ja_JP
 * or when sq_is8bit() reports that it contains no 8bit symbols.
 *
 * @param string $string Encoded string
 * @return string Decoded string
 */
function charset_decode_utf_8 ($string) {
    global $squirrelmail_language;

    // Japanese translation uses mbstring function to read utf-8
    if ($squirrelmail_language === 'ja_JP')
        return $string;

    // don't do decoding when there are no 8bit symbols
    if (! sq_is8bit($string,'utf-8'))
        return $string;

    // Turns one matched multi-byte sequence into a numeric entity.
    // The /e (eval) modifier that was used here before is not available
    // in PHP 7 and newer, so the code point is assembled in a callback.
    $sequence_to_entity = function ($match) {
        $bytes  = $match[0];
        $length = strlen($bytes);

        // The lead byte carries ($length + 1) marker bits; the rest is data:
        // two byte -> 0x1F, three byte -> 0x0F, four byte -> 0x07.
        $code = ord($bytes[0]) & (0xFF >> ($length + 1));

        // Every continuation byte adds its six low bits.
        for ($i = 1; $i < $length; $i++) {
            $code = ($code << 6) | (ord($bytes[$i]) & 0x3F);
        }

        return '&#' . $code . ';';
    };

    // Longest sequences first, same order as the original passes.
    // Five and six byte sequences (lead bytes 0xF8-0xFD) are deliberately
    // not decoded; their bytes end up as '?' in the cleanup step below.
    $patterns = array(
        '/[\xF0-\xF7][\x80-\xBF]{3}/', // four byte unicode characters
        '/[\xE0-\xEF][\x80-\xBF]{2}/', // three byte unicode characters
        '/[\xC0-\xDF][\x80-\xBF]/',    // two byte unicode characters
    );
    foreach ($patterns as $pattern) {
        $string = preg_replace_callback($pattern, $sequence_to_entity, $string);
    }

    // remove broken unicode: whatever 8bit byte is still present was not
    // part of a sequence decoded above
    $string = preg_replace('/[\x80-\xFF]/', '?', $string);

    return $string;
}
95 ?>