X-Git-Url: https://vcs.fsf.org/?a=blobdiff_plain;ds=sidebyside;f=functions%2Fdecode%2Futf_8.php;h=263fb6e2bfcf9e79e8f4c32ee9297fb76d48d8ea;hb=8ed1923822b383ddb338e9eef75bb7f110cc47b4;hp=ca2e73b154280ae4ff1dae20bc1f655052aff52f;hpb=82d304a0501324b276cabab1870755d5352bd21c;p=squirrelmail.git diff --git a/functions/decode/utf_8.php b/functions/decode/utf_8.php index ca2e73b1..263fb6e2 100644 --- a/functions/decode/utf_8.php +++ b/functions/decode/utf_8.php @@ -1,22 +1,43 @@ + * Ranges (first byte): + * oct dec hex + * Two byte - 300-337 192-223 C0-DF + * Three byte - 340-357 224-239 E0-EF + * Four byte - 360-367 240-247 F0-F7 + * Five byte - 370-373 248-251 F8-FB + * Six byte - 374-375 252-253 FC-FD + * + * \a\b characters are decoded to html code calculated with formula: + * octdec(a-300)*64 + octdec(b-200) * - * \a\b characters are decoded to html code octdec(a-300)*64 + octdec(b-200) - * \a\b\c characters are decoded to html code octdec(a-340)*64*64 + octdec(b-200)*64 + octdec(c-200) + * \a\b\c characters are decoded to html code calculated with formula: + * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200) * - * decoding cycle is unfinished. please test and report problems to tokul@users.sourceforge.net - * + * \a\b\c\d characters are decoded to html code calculated with formula: + * octdec(a-360)*64^3 + octdec(b-200)*64^2 + + * + octdec(c-200)*64 + octdec(d-200) + * + * \a\b\c\d\e characters are decoded to html code calculated with formula: + * octdec(a-370)*64^4 + octdec(b-200)*64^3 + + * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200) + * + * \a\b\c\d\e\f characters are decoded to html code calculated with formula: + * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3 + + * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200) + * + * @copyright 2003-2019 The SquirrelMail Project Team + * @license http://opensource.org/licenses/gpl-license.php GNU Public License + * @version $Id$ * @package squirrelmail * @subpackage decode */ @@ -27,27 +48,47 @@ * @return string Decoded string */ function charset_decode_utf_8 ($string) { - global $default_charset,$squirrelmail_language; + global $squirrelmail_language; + // Japanese translation uses mbstring function to read utf-8 if ($squirrelmail_language == 'ja_JP') return $string; - /* Only do the slow convert if there are 8-bit characters */ - /* avoid using 0xA0 (\240) in ereg ranges. RH73 does not like that */ - if (! ereg("[\200-\237]", $string) and ! ereg("[\241-\377]", $string)) + // don't do decoding when there are no 8bit symbols + if (! sq_is8bit($string,'utf-8')) return $string; + // decode six byte unicode characters + /* (i think currently there is no such symbol) + $string = preg_replace_callback("/([\374-\375])([\200-\277])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/", + create_function ('$matches', 'return \'&#\'.((ord($matches[1])-252)*1073741824+(ord($matches[2])-200)*16777216+(ord($matches[3])-200)*262144+(ord($matches[4])-128)*4096+(ord($matches[5])-128)*64+(ord($matches[6])-128)).\';\';'), + $string); + */ + + // decode five byte unicode characters + /* (i think currently there is no such symbol) + $string = preg_replace_callback("/([\370-\373])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/", + create_function ('$matches', 'return \'&#\'.((ord($matches[1])-248)*16777216+(ord($matches[2])-200)*262144+(ord($matches[3])-128)*4096+(ord($matches[4])-128)*64+(ord($matches[5])-128)).\';\';'), + $string); + */ + + // decode four byte unicode characters + $string = preg_replace_callback("/([\360-\367])([\200-\277])([\200-\277])([\200-\277])/", + create_function ('$matches', 'return \'&#\'.((ord($matches[1])-240)*262144+(ord($matches[2])-128)*4096+(ord($matches[3])-128)*64+(ord($matches[4])-128)).\';\';'), + $string); + // decode three byte unicode characters - $string = preg_replace("/([\340-\357])([\200-\277])([\200-\277])/e", - "'&#'.((ord('\\1')-224)*4096+(ord('\\2')-128)*64+(ord('\\3')-128)).';'", + $string = preg_replace_callback("/([\340-\357])([\200-\277])([\200-\277])/", + create_function ('$matches', 'return \'&#\'.((ord($matches[1])-224)*4096+(ord($matches[2])-128)*64+(ord($matches[3])-128)).\';\';'), $string); // decode two byte unicode characters - $string = preg_replace("/([\300-\337])([\200-\277])/e", - "'&#'.((ord('\\1')-192)*64+(ord('\\2')-128)).';'", + $string = preg_replace_callback("/([\300-\337])([\200-\277])/", + create_function ('$matches', 'return \'&#\'.((ord($matches[1])-192)*64+(ord($matches[2])-128)).\';\';'), $string); + // remove broken unicode + $string = preg_replace("/[\200-\237]|\240|[\241-\377]/",'?',$string); + return $string; } - -?>