| 1 | <?php |
| 2 | |
| 3 | /** |
| 4 | * functions/decode/utf-8.php - utf-8 decoding functions |
| 5 | * |
| 6 | * This file contains the utf-8 decoding function that is needed to read |
| 7 | * utf-8 encoded mail in a non-utf-8 locale. |
| 8 | * |
| 9 | * Every encoded multibyte character consists of n bytes. The first byte is |
| 10 | * octal 300-375; the other bytes are always octal 200-277. |
| 11 | *<pre> |
| 12 | * Ranges (first byte): |
| 13 | * oct dec hex |
| 14 | * Two byte - 300-337 192-223 C0-DF |
| 15 | * Three byte - 340-357 224-239 E0-EF |
| 16 | * Four byte - 360-367 240-247 F0-F7 |
| 17 | * Five byte - 370-373 248-251 F8-FB |
| 18 | * Six byte - 374-375 252-253 FC-FD |
| 19 | * |
| 20 | * \a\b characters are decoded to html code calculated with formula: |
| 21 | * octdec(a-300)*64 + octdec(b-200) |
| 22 | * |
| 23 | * \a\b\c characters are decoded to html code calculated with formula: |
| 24 | * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200) |
| 25 | * |
| 26 | * \a\b\c\d characters are decoded to html code calculated with formula: |
| 27 | * octdec(a-360)*64^3 + octdec(b-200)*64^2 + |
| 28 | * + octdec(c-200)*64 + octdec(d-200) |
| 29 | * |
| 30 | * \a\b\c\d\e characters are decoded to html code calculated with formula: |
| 31 | * octdec(a-370)*64^4 + octdec(b-200)*64^3 + |
| 32 | * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200) |
| 33 | * |
| 34 | * \a\b\c\d\e\f characters are decoded to html code calculated with formula: |
| 35 | * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3 + |
| 36 | * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200) |
| 37 | *</pre> |
| 38 | * @copyright © 2003-2005 The SquirrelMail Project Team |
| 39 | * @license http://opensource.org/licenses/gpl-license.php GNU Public License |
| 40 | * @version $Id$ |
| 41 | * @package squirrelmail |
| 42 | * @subpackage decode |
| 43 | */ |
| 44 | |
/**
 * Decode utf-8 strings
 *
 * Every well-formed two, three and four byte utf-8 sequence is converted
 * into a decimal html numeric entity (&#NNN;). Any 8bit byte that is still
 * present afterwards (stray continuation bytes, truncated sequences, five
 * and six byte sequences) is replaced with '?'.
 *
 * The string is returned untouched when $squirrelmail_language is ja_JP
 * or when sq_is8bit() reports that it has no 8bit symbols.
 *
 * @param string $string Encoded string
 * @return string Decoded string
 */
function charset_decode_utf_8 ($string) {
    global $squirrelmail_language;

    // Japanese translation uses mbstring function to read utf-8
    if ($squirrelmail_language === 'ja_JP') {
        return $string;
    }

    // don't do decoding when there are no 8bit symbols
    if (! sq_is8bit($string,'utf-8')) {
        return $string;
    }

    // Four, three and two byte sequences, in that order. Each byte gets its
    // own capture group so that the callback can read it with ord().
    // Five and six byte sequences are not decoded; their bytes are turned
    // into '?' by the cleanup step below.
    $patterns = array(
        "/([\360-\367])([\200-\277])([\200-\277])([\200-\277])/",
        "/([\340-\357])([\200-\277])([\200-\277])/",
        "/([\300-\337])([\200-\277])/",
    );

    // preg_replace_callback() is used instead of the /e modifier, which
    // evaluated the replacement as php code and is gone since php 7.
    foreach ($patterns as $pattern) {
        $string = preg_replace_callback($pattern, 'charset_decode_utf_8_entity', $string);
    }

    // remove broken unicode (every 8bit byte that was not decoded above)
    $string = preg_replace("/[\200-\377]/", '?', $string);

    return $string;
}

/**
 * Callback used by charset_decode_utf_8()
 *
 * Converts one matched utf-8 sequence into a decimal html numeric entity.
 *
 * @param array $matches preg match array: [1] is the lead byte, [2] and
 *                       above are the continuation bytes (one byte each)
 * @return string html entity in &#NNN; form
 */
function charset_decode_utf_8_entity ($matches) {
    // value subtracted from the lead byte, indexed by sequence length
    $lead_markers = array(2 => 192, 3 => 224, 4 => 240);

    // $matches[0] is the whole match, the rest are the single bytes
    $length = count($matches) - 1;

    $code = ord($matches[1]) - $lead_markers[$length];
    for ($i = 2; $i <= $length; $i++) {
        // every continuation byte contributes its low six bits
        $code = $code * 64 + (ord($matches[$i]) - 128);
    }

    return '&#' . $code . ';';
}
| 95 | ?> |