From: tokul <tokul@7612ce4b-ef26-0410-bec9-ea0150e637f0>
Date: Sun, 14 Nov 2004 17:30:06 +0000 (+0000)
Subject: extending utf-8 decoding function.
X-Git-Url: https://vcs.fsf.org/?a=commitdiff_plain;h=a5ab6455a827b1cca6deb3e0fd095cea4b29d9d6;p=squirrelmail.git

extending utf-8 decoding function.
The default_charset test is removed because it is better done in the main
decoding function, and ldap needs decoding without the default_charset test.


git-svn-id: https://svn.code.sf.net/p/squirrelmail/code/trunk/squirrelmail@8371 7612ce4b-ef26-0410-bec9-ea0150e637f0
---

diff --git a/functions/decode/utf_8.php b/functions/decode/utf_8.php
index 03153615..c6b1e615 100644
--- a/functions/decode/utf_8.php
+++ b/functions/decode/utf_8.php
@@ -1,6 +1,6 @@
 <?php
 /**
- * decode/utf-8.php
+ * functions/decode/utf_8.php - utf-8 decoding functions
  *
  * Copyright (c) 2003-2004 The SquirrelMail Project Team
  * Licensed under the GNU GPL. For full terms see the file COPYING.
@@ -10,12 +10,33 @@
  *
  * Every decoded character consists of n bytes. First byte is octal
  * 300-375, other bytes - always octals 200-277.
+ *<pre>
+ * Ranges (first byte):
+ *                oct     dec    hex
+ * Two byte   - 300-337 192-223 C0-DF
+ * Three byte - 340-357 224-239 E0-EF
+ * Four byte  - 360-367 240-247 F0-F7
+ * Five byte  - 370-373 248-251 F8-FB
+ * Six byte   - 374-375 252-253 FC-FD
  *
- * \a\b characters are decoded to html code octdec(a-300)*64 + octdec(b-200)
- * \a\b\c characters are decoded to html code octdec(a-340)*64*64 + octdec(b-200)*64 + octdec(c-200)
+ * \a\b characters are decoded to html code calculated with formula:
+ *  octdec(a-300)*64 + octdec(b-200)
  *
- * decoding cycle is unfinished. please test and report problems to tokul@users.sourceforge.net
+ * \a\b\c characters are decoded to html code calculated with formula:
+ *  octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200)
  *
+ * \a\b\c\d characters are decoded to html code calculated with formula:
+ *  octdec(a-360)*64^3 + octdec(b-200)*64^2
+ *  + octdec(c-200)*64 + octdec(d-200)
+ *
+ * \a\b\c\d\e characters are decoded to html code calculated with formula:
+ *  octdec(a-370)*64^4 + octdec(b-200)*64^3
+ *  + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200)
+ *
+ * \a\b\c\d\e\f characters are decoded to html code calculated with formula:
+ *  octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3
+ *  + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200)
+ *</pre>
  * @version $Id$
  * @package squirrelmail
  * @subpackage decode
@@ -27,11 +48,9 @@
  * @return string Decoded string
  */
 function charset_decode_utf_8 ($string) {
-    global $squirrelmail_language, $default_charset;
-
-    if (strtolower($default_charset) == 'utf-8')
-        return $string;
+    global $squirrelmail_language;
 
+    // Japanese translation uses mbstring function to read utf-8
     if ($squirrelmail_language == 'ja_JP')
         return $string;
 
@@ -39,6 +58,25 @@ function charset_decode_utf_8 ($string) {
     if (! sq_is8bit($string,'utf-8'))
         return $string;
 
+    // decode six byte unicode characters 
+    /* (I think currently there is no such symbol)
+    $string = preg_replace("/([\374-\375])([\200-\277])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/e",
+    "'&#'.((ord('\\1')-252)*1073741824+(ord('\\2')-128)*16777216+(ord('\\3')-128)*262144+(ord('\\4')-128)*4096+(ord('\\5')-128)*64+(ord('\\6')-128)).';'",
+    $string);
+    */
+
+    // decode five byte unicode characters 
+    /* (I think currently there is no such symbol)
+    $string = preg_replace("/([\370-\373])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/e",
+    "'&#'.((ord('\\1')-248)*16777216+(ord('\\2')-128)*262144+(ord('\\3')-128)*4096+(ord('\\4')-128)*64+(ord('\\5')-128)).';'",
+    $string);
+    */
+
+    // decode four byte unicode characters
+    $string = preg_replace("/([\360-\367])([\200-\277])([\200-\277])([\200-\277])/e",
+    "'&#'.((ord('\\1')-240)*262144+(ord('\\2')-128)*4096+(ord('\\3')-128)*64+(ord('\\4')-128)).';'",
+    $string);
+
     // decode three byte unicode characters
     $string = preg_replace("/([\340-\357])([\200-\277])([\200-\277])/e",
     "'&#'.((ord('\\1')-224)*4096+(ord('\\2')-128)*64+(ord('\\3')-128)).';'",
@@ -49,6 +87,9 @@ function charset_decode_utf_8 ($string) {
     "'&#'.((ord('\\1')-192)*64+(ord('\\2')-128)).';'",
     $string);
 
+    // remove broken unicode
+    $string = preg_replace("/[\200-\237]|\240|[\241-\377]/",'?',$string);
+
     return $string;
 }
 ?>
\ No newline at end of file