<?php

/**
 * functions/decode/utf_8.php - utf-8 decoding functions
 *
 * This file contains the utf-8 decoding function that is needed to read
 * utf-8 encoded mails in a non-utf-8 locale.
 *
 * Every decoded character consists of n bytes. The first byte is octal
 * 300-375, all other bytes are always octal 200-277.
 *<pre>
 * Ranges (first byte):
 *                oct      dec      hex
 * Two byte   - 300-337  192-223  C0-DF
 * Three byte - 340-357  224-239  E0-EF
 * Four byte  - 360-367  240-247  F0-F7
 * Five byte  - 370-373  248-251  F8-FB
 * Six byte   - 374-375  252-253  FC-FD
 *
 * \a\b characters are decoded to html code calculated with formula:
 * octdec(a-300)*64 + octdec(b-200)
 *
 * \a\b\c characters are decoded to html code calculated with formula:
 * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200)
 *
 * \a\b\c\d characters are decoded to html code calculated with formula:
 * octdec(a-360)*64^3 + octdec(b-200)*64^2
 * + octdec(c-200)*64 + octdec(d-200)
 *
 * \a\b\c\d\e characters are decoded to html code calculated with formula:
 * octdec(a-370)*64^4 + octdec(b-200)*64^3
 * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200)
 *
 * \a\b\c\d\e\f characters are decoded to html code calculated with formula:
 * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3
 * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200)
 *</pre>
 * @copyright 2003-2015 The SquirrelMail Project Team
 * @license http://opensource.org/licenses/gpl-license.php GNU Public License
 * @version $Id$
 * @package squirrelmail
 * @subpackage decode
 */

/**
 * Decode utf-8 strings
 *
 * Every two-, three- and four-byte utf-8 sequence found in the string is
 * replaced with a decimal html numeric character reference (&#NNN;).
 * Any 8bit byte that is still present afterwards (stray continuation
 * bytes, truncated sequences, five- and six-byte forms) is replaced
 * with a question mark.
 *
 * The string is returned untouched when the interface language is ja_JP
 * or when sq_is8bit() reports that it holds no 8bit symbols.
 *
 * @param string $string Encoded string
 * @return string Decoded string
 */
function charset_decode_utf_8 ($string) {
    global $squirrelmail_language;

    // Japanese translation uses mbstring function to read utf-8
    if ($squirrelmail_language === 'ja_JP')
        return $string;

    // don't do decoding when there are no 8bit symbols
    if (! sq_is8bit($string,'utf-8'))
        return $string;

    // Decode four, three and two byte unicode characters in one pass.
    // A lead byte (\300-\367) can never be a continuation byte
    // (\200-\277), so the alternatives cannot overlap and the result is
    // the same as running one pass per sequence length.
    //
    // Five and six byte forms (lead bytes \370-\375) are not decoded;
    // their bytes are handled by the broken unicode cleanup below.
    //
    // A named callback is used because create_function() was removed
    // in PHP 8.0.
    $string = preg_replace_callback(
        "/[\360-\367][\200-\277]{3}|[\340-\357][\200-\277]{2}|[\300-\337][\200-\277]/",
        'charset_decode_utf_8_entity',
        $string);

    // remove broken unicode (any 8bit byte that was not decoded above)
    $string = preg_replace("/[\200-\377]/",'?',$string);

    return $string;
}

/**
 * Convert one matched utf-8 byte sequence to a html numeric entity
 *
 * Callback for preg_replace_callback() in charset_decode_utf_8().
 * The sequence length (2, 3 or 4 bytes) selects the value that is
 * subtracted from the lead byte; every following byte contributes
 * six more bits (its value minus 128).
 *
 * @param array $matches preg match data, $matches[0] is the full sequence
 * @return string Decimal numeric character reference
 * @access private
 */
function charset_decode_utf_8_entity ($matches) {
    $sequence = $matches[0];
    $length = strlen($sequence);

    // lead byte markers: 110xxxxx (192), 1110xxxx (224), 11110xxx (240)
    $lead_markers = array(2 => 192, 3 => 224, 4 => 240);

    $codepoint = ord($sequence[0]) - $lead_markers[$length];
    for ($i = 1; $i < $length; $i++) {
        $codepoint = $codepoint * 64 + (ord($sequence[$i]) - 128);
    }

    return '&#' . $codepoint . ';';
}