Fix certain messages with headers in unknown charsets encoded as quoted printable...

author pdontthink <pdontthink@7612ce4b-ef26-0410-bec9-ea0150e637f0>

Wed, 25 Aug 2021 03:38:28 +0000 (03:38 +0000)

committer pdontthink <pdontthink@7612ce4b-ef26-0410-bec9-ea0150e637f0>

Wed, 25 Aug 2021 03:38:28 +0000 (03:38 +0000)
author pdontthink <pdontthink@7612ce4b-ef26-0410-bec9-ea0150e637f0>
Wed, 25 Aug 2021 03:38:28 +0000 (03:38 +0000)
committer pdontthink <pdontthink@7612ce4b-ef26-0410-bec9-ea0150e637f0>
Wed, 25 Aug 2021 03:38:28 +0000 (03:38 +0000)
diff --git a/config/config_local.example.php b/config/config_local.example.php

index 40644cd2f4d3d73ff9f73993dca63fde73083f5e..dcaf72696fbf283e6a3ee264eda8618dd03d6043 100644 (file)
--- a/config/config_local.example.php
+++ b/config/config_local.example.php
@@ -61,6 +61,14 @@
   * part (beginning with "@") will be stripped before
   * calculating the CRC or MD5.
   *
+ * $default_htmlspecialchars_encoding (string) is used to
+ * specify the charset that is used for htmlspecialchars()
+ * calls when an invalid charset was requested (PHP's
+ * htmlspecialchars() only supports a limited number of
+ * encodings).  SquirrelMail defaults to iso-8859-1, but if
+ * you want to change the default to something like utf-8,
+ * you can use this setting for that.
+ *
   * $smtp_stream_options allows more control over the SSL context
   * used when connecting to the SMTP server over SSL/TLS.  See:
   * http://www.php.net/manual/context.php and in particular
diff --git a/functions/strings.php b/functions/strings.php

index c2a33e07accbd63cff647da18b81c78461a735ec..13a4eca99b723179bc3bf3e5a8b1d058efef435c 100644 (file)
--- a/functions/strings.php
+++ b/functions/strings.php
@@ -1729,21 +1729,75 @@ function sm_validate_security_token($token, $validity_period=0, $show_error=FALS
    * attempts to add the correct character encoding
    *
    * @param string $string The string to be converted
-  * @param int $flags A bitmask that controls the behavior of htmlspecialchars()
+  * @param int $flags A bitmask that controls the behavior of
+  *                   htmlspecialchars() -- NOTE that this parameter
+  *                   should only be used to dictate handling of
+  *                   quotes; handling invalid code sequences is done
+  *                   using the $invalid_sequence_flag parameter below
    *                   (See http://php.net/manual/function.htmlspecialchars.php )
-  *                   (OPTIONAL; default ENT_COMPAT, ENT_COMPAT | ENT_SUBSTITUTE for PHP >=5.4)
+  *                   (OPTIONAL; default ENT_COMPAT)
    * @param string $encoding The character encoding to use in the conversion
-  *                         (OPTIONAL; default automatic detection)
+  *                         (if not one of the character sets supported
+  *                         by PHP's htmlspecialchars(), then $encoding
+  *                         will be ignored and iso-8859-1 will be used,
+  *                         unless a default has been specified in
+  *                         $default_htmlspecialchars_encoding in
+  *                         config_local.php) (OPTIONAL; default automatic
+  *                         detection)
    * @param boolean $double_encode Whether or not to convert entities that are
    *                               already in the string (only supported in
    *                               PHP 5.2.3+) (OPTIONAL; default TRUE)
+  * @param mixed $invalid_sequence_flag A bitmask that controls how invalid
+  *                                     code sequences should be handled;
+  *                                     When calling htmlspecialchars(),
+  *                                     this value will be combined with
+  *                                     the $flags parameter above
+  *                                     (See http://php.net/manual/function.htmlspecialchars.php )
+  *                                     (OPTIONAL; defaults to the string
+  *                                     "ent_substitute" that, for PHP 5.4+,
+  *                                     is converted to the ENT_SUBSTITUTE
+  *                                     constant, otherwise to 0, i.e. no flag)
    *
    * @return string The converted text
    *
    */
  function sm_encode_html_special_chars($string, $flags=ENT_COMPAT,
-                                      $encoding=NULL, $double_encode=TRUE)
+                                      $encoding=NULL, $double_encode=TRUE,
+                                      $invalid_sequence_flag='ent_substitute')
  {
+   if ($invalid_sequence_flag === 'ent_substitute')
+   {
+      if (check_php_version(5, 4, 0))
+         $invalid_sequence_flag = ENT_SUBSTITUTE;
+      else
+         $invalid_sequence_flag = 0;
+   }
+
+
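   // charsets supported by PHP's htmlspecialchars(); entries must be
   // lowercase since lookups use strtolower() -- NOTE(review): 'koi8-R'
   // below can never match (move this list elsewhere if needed)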
+   static $htmlspecialchars_charsets = array(
+      'iso-8859-1', 'iso8859-1',
+      'iso-8859-5', 'iso8859-5',
+      'iso-8859-15', 'iso8859-15',
+      'utf-8',
+      'cp866', 'ibm866', '866',
+      'cp1251', 'windows-1251', 'win-1251', '1251',
+      'cp1252', 'windows-1252', '1252',
+      'koi8-R', 'koi8-ru', 'koi8r',
+      'big5', '950',
+      'gb2312', '936',
+      'big5-hkscs',
+      'shift_jis', 'sjis', 'sjis-win', 'cp932', '932',
+      'euc-jp', 'eucjp', 'eucjp-win',
+      'macroman',
+   );
+
+
+   // if not given, set encoding to the charset being
+   // used by the current user interface language
+   //
     if (!$encoding)
     {
        global $default_charset;
@@ -1752,15 +1806,58 @@ function sm_encode_html_special_chars($string, $flags=ENT_COMPAT,
        $encoding = $default_charset;
     }
  
-   if (check_php_version(5, 2, 3)) {
-      // Replace invalid characters with a symbol instead of returning
-      // empty string for the entire to be encoded string.
-      if (check_php_version(5, 4, 0) && $flags == ENT_COMPAT) {
-         $flags = $flags | ENT_SUBSTITUTE;
+
   // two ways to handle encodings not supported by htmlspecialchars() -
   // one takes fewer CPU cycles but can munge characters in certain
   // translations, the other is more exact but requires more resources
+   //
+   global $html_special_chars_extended_fix;
+//FIXME: need to document that the config switch above can be enabled in config_local... but first, we need to decide if we will implement the second option here -- currently there hasn't been a need for it (munged characters seem quite rare).... see tracker #2806 for some tips https://sourceforge.net/p/squirrelmail/bugs/2806
+   if (!in_array(strtolower($encoding), $htmlspecialchars_charsets))
+   {
+      if ($html_special_chars_extended_fix)
+      {
+         // convert to utf-8 first, run htmlspecialchars() and convert
+         // back to original encoding below
+         //
+//FIXME: try conversion functions in this order: recode_string(), iconv(), mbstring (with various charset checks: sq_mb_list_encodings(), mb_check_encoding) -- oh, first check for internal charset_decode_CHARSET() function?? or just use (does this put everything into HTML entities already? shouldn't, but if it does, return right here):
+         $string = charset_decode($encoding, $string, TRUE, TRUE);
+         $string = charset_encode($string, $encoding, TRUE);
+      }
+      else
+      {
+         // simply force use of an encoding that is supported (some
+         // characters may be munged)
+         //
+         // use default from configuration if provided or hard-coded fallback
+         //
+         global $default_htmlspecialchars_encoding;
+         if (!empty($default_htmlspecialchars_encoding))
+            $encoding = $default_htmlspecialchars_encoding;
+         else
+            $encoding = 'iso-8859-1';
        }
-      return htmlspecialchars($string, $flags, $encoding, $double_encode);
     }
  
-   return htmlspecialchars($string, $flags, $encoding);
+
+// TODO: Is adding this check an unnecessary performance hit?
+   if (check_php_version(5, 2, 3))
+      $ret = htmlspecialchars($string, $flags | $invalid_sequence_flag,
+                              $encoding, $double_encode);
+   else
+      $ret = htmlspecialchars($string, $flags | $invalid_sequence_flag,
+                              $encoding);
+
+
+   // convert back to original encoding if needed (see above)
+   //
+   if ($html_special_chars_extended_fix
+    && !in_array(strtolower($encoding), $htmlspecialchars_charsets))
+   {
+//FIXME: NOT FINISHED - here, we'd convert from utf-8 back to original charset (if we obey $lossy_encoding and end up returning in utf-8 instead of original charset, does that screw up the caller?)
+   }
+
+
+   return $ret;
  }
  
diff --git a/include/languages.php b/include/languages.php

index 4987f306bb7cb3f500ebba10eb0ea5633384b82b..c7143e93c4314a1dc13b306fbe59a558b3c33be8 100644 (file)
--- a/include/languages.php
+++ b/include/languages.php
@@ -180,16 +180,18 @@ function sq_setlocale($category,$locale) {
  }
  
  /**
- * Converts string from given charset to charset, that can be displayed by user translation.
+ * Converts a string from the given $charset to a character set that
+ * can be displayed by the current user interface language (translation)
   *
- * Function by default returns html encoded strings, if translation uses different encoding.
+ * By default, the function returns HTML-encoded strings if the
+ * translation uses a different encoding.
   * If Japanese translation is used - function returns string converted to euc-jp
   * If iconv or recode functions are enabled and translation uses utf-8 - function returns utf-8 encoded string.
   * If $charset is not supported - function returns unconverted string.
   *
   * sanitizing of html tags is also done by this function.
   *
- * @param string $charset
+ * @param string $charset The charset of the incoming string
   * @param string $string Text to be decoded
   * @param boolean $force_decode converts string to html without $charset!=$default_charset check.
   * Argument is available since 1.4.5 and 1.5.1.
@@ -218,7 +220,7 @@ function charset_decode ($charset, $string, $force_decode=false, $save_html=fals
  
      // Don't do conversion if charset is the same.
      if ( ! $force_decode && $charset == strtolower($default_charset) )
-        return ($save_html ? $string : sm_encode_html_special_chars($string));
+        return ($save_html ? $string : sm_encode_html_special_chars($string, ENT_COMPAT, $charset));
  
      // catch iso-8859-8-i thing
      if ( $charset == "iso-8859-8-i" )
@@ -234,7 +236,7 @@ function charset_decode ($charset, $string, $force_decode=false, $save_html=fals
              // other charsets can be converted to utf-8 without loss.
              // and output string is smaller
              $string = recode_string($charset . "..utf-8",$string);
-            return ($save_html ? $string : sm_encode_html_special_chars($string));
+            return ($save_html ? $string : sm_encode_html_special_chars($string, ENT_COMPAT, $charset));
          } else {
              $string = recode_string($charset . "..html",$string);
              // recode does not convert single quote, sm_encode_html_special_chars does.
@@ -250,13 +252,13 @@ function charset_decode ($charset, $string, $force_decode=false, $save_html=fals
      // iconv functions does not have html target and can be used only with utf-8
      if ( $use_php_iconv && $default_charset=='utf-8') {
          $string = iconv($charset,$default_charset,$string);
-        return ($save_html ? $string : sm_encode_html_special_chars($string));
+        return ($save_html ? $string : sm_encode_html_special_chars($string, ENT_COMPAT, $charset));
      }
  
      // If we don't use recode and iconv, we'll do it old way.
  
      /* All HTML special characters are 7 bit and can be replaced first */
-    if (! $save_html) $string = sm_encode_html_special_chars ($string);
+    if (! $save_html) $string = sm_encode_html_special_chars($string, ENT_COMPAT, $charset);
  
      /* controls cpu and memory intensive decoding cycles */
      if (! isset($aggressive_decoding) || $aggressive_decoding=="" ) {
author	pdontthink <pdontthink@7612ce4b-ef26-0410-bec9-ea0150e637f0>
	Wed, 25 Aug 2021 03:38:28 +0000 (03:38 +0000)
committer	pdontthink <pdontthink@7612ce4b-ef26-0410-bec9-ea0150e637f0>
	Wed, 25 Aug 2021 03:38:28 +0000 (03:38 +0000)
config/config_local.example.php		patch \| blob \| blame \| history
functions/strings.php		patch \| blob \| blame \| history
include/languages.php		patch \| blob \| blame \| history