Happy 2017

[squirrelmail.git] / functions / decode / utf_8.php
diff --git a/functions/decode/utf_8.php b/functions/decode/utf_8.php

index df87dcacb8d504a77281faeb4d1b23855a1e73fe..d1b11890e13e51bfb2a470eeaa558c0f0d67bb3a 100644 (file)
--- a/functions/decode/utf_8.php
+++ b/functions/decode/utf_8.php
@@ -1,22 +1,43 @@
  <?php
+
  /**
- * decode/utf-8.php
- * $Id$
- *
- * Copyright (c) 2003 The SquirrelMail Project Team
- * Licensed under the GNU GPL. For full terms see the file COPYING.
+ * functions/decode/utf-8.php - utf-8 decoding functions
   *
   * This file contains utf-8 decoding function that is needed to read
   * utf-8 encoded mails in non-utf-8 locale.
   *
   * Every decoded character consists of n bytes. First byte is octal
   * 300-375, other bytes - always octals 200-277.
+ *<pre>
+ * Ranges (first byte):
+ *                oct     dec    hex
+ * Two byte   - 300-337 192-223 C0-DF
+ * Three byte - 340-357 224-239 E0-EF
+ * Four byte  - 360-367 240-247 F0-F7
+ * Five byte  - 370-373 248-251 F8-FB
+ * Six byte   - 374-375 252-253 FC-FD
+ *
+ * \a\b characters are decoded to html code calculated with formula:
+ *  octdec(a-300)*64 + octdec(b-200)
   *
- * \a\b characters are decoded to html code octdec(a-300)*64 + octdec(b-200)
- * \a\b\c characters are decoded to html code octdec(a-340)*64*64 + octdec(b-200)*64 + octdec(c-200)
+ * \a\b\c characters are decoded to html code calculated with formula:
+ *  octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200)
   *
- * decoding cycle is unfinished. please test and report problems to tokul@users.sourceforge.net
- * 
+ * \a\b\c\d characters are decoded to html code calculated with formula:
+ *  octdec(a-360)*64^3 + octdec(b-200)*64^2
+ *  + octdec(c-200)*64 + octdec(d-200)
+ *
+ * \a\b\c\d\e characters are decoded to html code calculated with formula:
+ *  octdec(a-370)*64^4 + octdec(b-200)*64^3
+ *  + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200)
+ *
+ * \a\b\c\d\e\f characters are decoded to html code calculated with formula:
+ *  octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3
+ *  + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200)
+ *</pre>
+ * @copyright 2003-2017 The SquirrelMail Project Team
+ * @license http://opensource.org/licenses/gpl-license.php GNU General Public License
+ * @version $Id$
   * @package squirrelmail
   * @subpackage decode
   */
@@ -27,27 +48,47 @@
   * @return string Decoded string
   */
  function charset_decode_utf_8 ($string) {
-  global $default_charset;
+    global $squirrelmail_language;
  
-    if (strtolower($default_charset) == 'utf-8')
+    // Japanese translation uses mbstring function to read utf-8
+    if ($squirrelmail_language == 'ja_JP')
          return $string;
  
-    /* Only do the slow convert if there are 8-bit characters */
-    /* avoid using 0xA0 (\240) in ereg ranges. RH73 does not like that */
-    if (! ereg("[\200-\237]", $string) and ! ereg("[\241-\377]", $string))
+    // don't do decoding when there are no 8bit symbols
+    if (! sq_is8bit($string,'utf-8'))
          return $string;
  
+    // decode six byte unicode characters
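+    /* Disabled: RFC 3629 limits UTF-8 to four bytes, so no six byte characters exist. NOTE: before re-enabling, change the two "-200" offsets below to "-128" (200 is the octal value; ord() returns decimal).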
+    $string = preg_replace_callback("/([\374-\375])([\200-\277])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/",
+    create_function ('$matches', 'return \'&#\'.((ord($matches[1])-252)*1073741824+(ord($matches[2])-200)*16777216+(ord($matches[3])-200)*262144+(ord($matches[4])-128)*4096+(ord($matches[5])-128)*64+(ord($matches[6])-128)).\';\';'),
+    $string);
+    */
+
+    // decode five byte unicode characters
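+    /* Disabled: RFC 3629 limits UTF-8 to four bytes, so no five byte characters exist. NOTE: before re-enabling, change the "-200" offset below to "-128" (200 is the octal value; ord() returns decimal).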
+    $string = preg_replace_callback("/([\370-\373])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/",
+    create_function ('$matches', 'return \'&#\'.((ord($matches[1])-248)*16777216+(ord($matches[2])-200)*262144+(ord($matches[3])-128)*4096+(ord($matches[4])-128)*64+(ord($matches[5])-128)).\';\';'),
+    $string);
+    */
+    
+    // decode four byte unicode characters
+    $string = preg_replace_callback("/([\360-\367])([\200-\277])([\200-\277])([\200-\277])/",
+    create_function ('$matches', 'return \'&#\'.((ord($matches[1])-240)*262144+(ord($matches[2])-128)*4096+(ord($matches[3])-128)*64+(ord($matches[4])-128)).\';\';'),
+    $string);
+
      // decode three byte unicode characters
-    $string = preg_replace("/([\340-\357])([\200-\277])([\200-\277])/e",
-    "'&#'.((ord('\\1')-224)*4096+(ord('\\2')-128)*64+(ord('\\3')-128)).';'",
+    $string = preg_replace_callback("/([\340-\357])([\200-\277])([\200-\277])/",
+    create_function ('$matches', 'return \'&#\'.((ord($matches[1])-224)*4096+(ord($matches[2])-128)*64+(ord($matches[3])-128)).\';\';'),
      $string);
  
      // decode two byte unicode characters
-    $string = preg_replace("/([\300-\337])([\200-\277])/e",
-    "'&#'.((ord('\\1')-192)*64+(ord('\\2')-128)).';'",
+    $string = preg_replace_callback("/([\300-\337])([\200-\277])/",
+    create_function ('$matches', 'return \'&#\'.((ord($matches[1])-192)*64+(ord($matches[2])-128)).\';\';'),
      $string);
  
+    // replace any 8-bit bytes left undecoded (broken utf-8) with '?'
+    $string = preg_replace("/[\200-\237]|\240|[\241-\377]/",'?',$string);
+
      return $string;
  }
-
-?>