From: tokul Date: Sun, 14 Nov 2004 17:30:06 +0000 (+0000) Subject: extending utf-8 decoding function. X-Git-Url: https://vcs.fsf.org/?a=commitdiff_plain;h=a5ab6455a827b1cca6deb3e0fd095cea4b29d9d6;p=squirrelmail.git extending utf-8 decoding function. default_charset test removed because it is better to do in main decoding function and ldap needs decoding without default_charset test git-svn-id: https://svn.code.sf.net/p/squirrelmail/code/trunk/squirrelmail@8371 7612ce4b-ef26-0410-bec9-ea0150e637f0 --- diff --git a/functions/decode/utf_8.php b/functions/decode/utf_8.php index 03153615..c6b1e615 100644 --- a/functions/decode/utf_8.php +++ b/functions/decode/utf_8.php @@ -1,6 +1,6 @@ + * Ranges (first byte): + * oct dec hex + * Two byte - 300-337 192-223 C0-DF + * Three byte - 340-357 224-239 E0-EF + * Four byte - 360-367 240-247 F0-F7 + * Five byte - 370-373 248-251 F8-FB + * Six byte - 374-375 252-253 FC-FD * - * \a\b characters are decoded to html code octdec(a-300)*64 + octdec(b-200) - * \a\b\c characters are decoded to html code octdec(a-340)*64*64 + octdec(b-200)*64 + octdec(c-200) + * \a\b characters are decoded to html code calculated with formula: + * octdec(a-300)*64 + octdec(b-200) * - * decoding cycle is unfinished. 
please test and report problems to tokul@users.sourceforge.net + * \a\b\c characters are decoded to html code calculated with formula: + * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200) * + * \a\b\c\d characters are decoded to html code calculated with formula: + * octdec(a-360)*64^3 + octdec(b-200)*64^2 + + * + octdec(c-200)*64 + octdec(d-200) + * + * \a\b\c\d\e characters are decoded to html code calculated with formula: + * octdec(a-370)*64^4 + octdec(b-200)*64^3 + + * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200) + * + * \a\b\c\d\e\f characters are decoded to html code calculated with formula: + * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3 + + * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200) + * * @version $Id$ * @package squirrelmail * @subpackage decode @@ -27,11 +48,9 @@ * @return string Decoded string */ function charset_decode_utf_8 ($string) { - global $squirrelmail_language, $default_charset; - - if (strtolower($default_charset) == 'utf-8') - return $string; + global $squirrelmail_language; + // Japanese translation uses mbstring function to read utf-8 if ($squirrelmail_language == 'ja_JP') return $string; @@ -39,6 +58,25 @@ function charset_decode_utf_8 ($string) { if (! 
sq_is8bit($string,'utf-8')) return $string; + // decode six byte unicode characters + /* (i think currently there is no such symbol) + $string = preg_replace("/([\374-\375])([\200-\277])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/e", + "'&#'.((ord('\\1')-252)*1073741824+(ord('\\2')-128)*16777216+(ord('\\3')-128)*262144+(ord('\\4')-128)*4096+(ord('\\5')-128)*64+(ord('\\6')-128)).';'", + $string); + */ + + // decode five byte unicode characters + /* (i think currently there is no such symbol) + $string = preg_replace("/([\370-\373])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/e", + "'&#'.((ord('\\1')-248)*16777216+(ord('\\2')-128)*262144+(ord('\\3')-128)*4096+(ord('\\4')-128)*64+(ord('\\5')-128)).';'", + $string); + */ + + // decode four byte unicode characters + $string = preg_replace("/([\360-\367])([\200-\277])([\200-\277])([\200-\277])/e", + "'&#'.((ord('\\1')-240)*262144+(ord('\\2')-128)*4096+(ord('\\3')-128)*64+(ord('\\4')-128)).';'", + $string); + // decode three byte unicode characters $string = preg_replace("/([\340-\357])([\200-\277])([\200-\277])/e", "'&#'.((ord('\\1')-224)*4096+(ord('\\2')-128)*64+(ord('\\3')-128)).';'", @@ -49,6 +87,9 @@ function charset_decode_utf_8 ($string) { "'&#'.((ord('\\1')-192)*64+(ord('\\2')-128)).';'", $string); + // remove broken unicode + $string = preg_replace("/[\200-\237]|\240|[\241-\377]/",'?',$string); + return $string; } ?> \ No newline at end of file