removed local directory name used for testing.
[squirrelmail.git] / functions / decode / utf_8.php
CommitLineData
ee37ee9b 1<?php
d6c32258 2/**
a5ab6455 3 * functions/decode/utf_8.php - utf-8 decoding functions
ee37ee9b 4 *
6c84ba1e 5 * Copyright (c) 2003-2005 The SquirrelMail Project Team
ee37ee9b 6 * Licensed under the GNU GPL. For full terms see the file COPYING.
7 *
8 * This file contains the utf-8 decoding function that is needed to read
9 * utf-8 encoded mails in a non-utf-8 locale.
10 *
11 * Every multi-byte encoded character consists of n bytes. The first byte
12 * is octal 300-375; the remaining bytes are always octal 200-277.
a5ab6455 13 *<pre>
14 * Ranges (first byte):
15 * oct dec hex
16 * Two byte - 300-337 192-223 C0-DF
17 * Three byte - 340-357 224-239 E0-EF
18 * Four byte - 360-367 240-247 F0-F7
19 * Five byte - 370-373 248-251 F8-FB
20 * Six byte - 374-375 252-253 FC-FD
ee37ee9b 21 *
a5ab6455 22 * \a\b characters are decoded to html code calculated with formula:
23 * octdec(a-300)*64 + octdec(b-200)
ee37ee9b 24 *
a5ab6455 25 * \a\b\c characters are decoded to html code calculated with formula:
26 * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200)
91e0dccc 27 *
a5ab6455 28 * \a\b\c\d characters are decoded to html code calculated with formula:
e50f5ac2 29 * octdec(a-360)*64^3 + octdec(b-200)*64^2 +
a5ab6455 30 * + octdec(c-200)*64 + octdec(d-200)
31 *
32 * \a\b\c\d\e characters are decoded to html code calculated with formula:
e50f5ac2 33 * octdec(a-370)*64^4 + octdec(b-200)*64^3 +
a5ab6455 34 * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200)
35 *
36 * \a\b\c\d\e\f characters are decoded to html code calculated with formula:
e50f5ac2 37 * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3 +
a5ab6455 38 * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200)
39 *</pre>
31841a9e 40 * @version $Id$
d6c32258 41 * @package squirrelmail
42 * @subpackage decode
43 */
44
/**
 * Convert one matched utf-8 byte sequence into a numeric html entity.
 *
 * Used as the preg_replace_callback() callback of charset_decode_utf_8().
 * The match is expected to be a complete two, three or four byte sequence
 * (one lead byte followed by continuation bytes).
 *
 * @param array $matches regexp matches; $matches[0] is the whole sequence
 * @return string numeric html entity ('&#NNN;')
 * @access private
 */
function charset_decode_utf_8_entity($matches) {
    $sequence = $matches[0];
    $length = strlen($sequence);

    // value subtracted from the lead byte to strip its length marker bits
    // (octal 300, 340 and 360)
    $lead_offsets = array(2 => 192, 3 => 224, 4 => 240);

    $codepoint = ord($sequence[0]) - $lead_offsets[$length];

    // every continuation byte carries six payload bits (byte - octal 200)
    for ($i = 1; $i < $length; $i++) {
        $codepoint = $codepoint * 64 + (ord($sequence[$i]) - 128);
    }

    return '&#' . $codepoint . ';';
}

/**
 * Decode utf-8 strings
 *
 * Two, three and four byte utf-8 sequences are replaced with numeric html
 * entities. Any 8bit byte that is left after that (stray continuation
 * bytes, incomplete sequences, five and six byte lead bytes) is replaced
 * with '?'.
 *
 * The string is returned untouched when the interface language is ja_JP or
 * when sq_is8bit() reports that it contains no 8bit symbols.
 *
 * @param string $string Encoded string
 * @return string Decoded string
 */
function charset_decode_utf_8 ($string) {
    global $squirrelmail_language;

    // Japanese translation uses mbstring function to read utf-8
    if ($squirrelmail_language == 'ja_JP') {
        return $string;
    }

    // don't do decoding when there are no 8bit symbols
    if (! sq_is8bit($string,'utf-8')) {
        return $string;
    }

    // The /e regexp modifier is not available in PHP 7 and newer, so the
    // entity is calculated in a callback instead of in evaluated code.
    // Longest sequences are decoded first, same order as before.
    // Five and six byte sequences are not decoded (they are not valid
    // utf-8 according to RFC 3629); their bytes end up as '?' below.

    // decode four byte unicode characters
    $string = preg_replace_callback("/[\360-\367][\200-\277]{3}/",
                                    'charset_decode_utf_8_entity',
                                    $string);

    // decode three byte unicode characters
    $string = preg_replace_callback("/[\340-\357][\200-\277]{2}/",
                                    'charset_decode_utf_8_entity',
                                    $string);

    // decode two byte unicode characters
    $string = preg_replace_callback("/[\300-\337][\200-\277]/",
                                    'charset_decode_utf_8_entity',
                                    $string);

    // remove broken unicode
    $string = preg_replace("/[\200-\377]/",'?',$string);

    return $string;
}
e53c9681 95?>