* attempts to add the correct character encoding
*
* @param string $string The string to be converted
- * @param int $flags A bitmask that controls the behavior of htmlspecialchars()
+ * @param int $flags A bitmask that controls the behavior of
+ * htmlspecialchars() -- NOTE that this parameter
+ * should only be used to dictate handling of
+ * quotes; handling invalid code sequences is done
+ * using the $invalid_sequence_flag parameter below
* (See http://php.net/manual/function.htmlspecialchars.php )
- * (OPTIONAL; default ENT_COMPAT, ENT_COMPAT | ENT_SUBSTITUTE for PHP >=5.4)
+ * (OPTIONAL; default ENT_COMPAT)
* @param string $encoding The character encoding to use in the conversion
- * (OPTIONAL; default automatic detection)
+ * (if not one of the character sets supported
+ * by PHP's htmlspecialchars(), then $encoding
+ * will be ignored and iso-8859-1 will be used,
+ * unless a default has been specified in
+ * $default_htmlspecialchars_encoding in
+ * config_local.php) (OPTIONAL; default automatic
+ * detection)
* @param boolean $double_encode Whether or not to convert entities that are
* already in the string (only supported in
* PHP 5.2.3+) (OPTIONAL; default TRUE)
+ * @param mixed $invalid_sequence_flag A bitmask that controls how invalid
+ * code sequences should be handled;
+ * When calling htmlspecialchars(),
+ * this value will be combined with
+ * the $flags parameter above
+ * (See http://php.net/manual/function.htmlspecialchars.php )
+ * (OPTIONAL; defaults to the string
+ * "ent_substitute" that, for PHP 5.4+,
+ * is converted to the ENT_SUBSTITUTE
+ * constant, otherwise empty)
*
* @return string The converted text
*
*/
function sm_encode_html_special_chars($string, $flags=ENT_COMPAT,
                                      $encoding=NULL, $double_encode=TRUE,
                                      $invalid_sequence_flag='ent_substitute')
{
   // Wrapper around PHP's htmlspecialchars() that picks a character
   // encoding htmlspecialchars() actually supports and merges the
   // invalid-code-sequence flag into $flags before making the call.
   //
   // $string                 text to be converted
   // $flags                  quote-handling bitmask for htmlspecialchars()
   // $encoding               charset of $string (falls back to the global
   //                         $default_charset when empty)
   // $double_encode          passed through to htmlspecialchars() when the
   //                         running PHP is at least 5.2.3
   // $invalid_sequence_flag  bitmask OR'ed into $flags, or the placeholder
   //                         string 'ent_substitute' (see below)
   //
   // Returns the converted string.


   // the default is a placeholder string instead of the ENT_SUBSTITUTE
   // constant itself because that constant only exists in PHP 5.4+;
   // resolve it here to the real constant or to "no extra flag"
   //
   if ($invalid_sequence_flag === 'ent_substitute')
   {
      if (check_php_version(5, 4, 0))
         $invalid_sequence_flag = ENT_SUBSTITUTE;
      else
         $invalid_sequence_flag = 0;
   }


   // charsets supported by PHP's htmlspecialchars
   // (move this elsewhere if needed)
   //
   // NOTE: every entry MUST be lowercase, because the lookup
   //       below is made with strtolower($encoding)
   //
   static $htmlspecialchars_charsets = array(
      'iso-8859-1', 'iso8859-1',
      'iso-8859-5', 'iso8859-5',
      'iso-8859-15', 'iso8859-15',
      'utf-8',
      'cp866', 'ibm866', '866',
      'cp1251', 'windows-1251', 'win-1251', '1251',
      'cp1252', 'windows-1252', '1252',
      'koi8-r', 'koi8-ru', 'koi8r',
      'big5', '950',
      'gb2312', '936',
      'big5-hkscs',
      'shift_jis', 'sjis', 'sjis-win', 'cp932', '932',
      'euc-jp', 'eucjp', 'eucjp-win',
      'macroman',
   );


   // if not given, set encoding to the charset being
   // used by the current user interface language
   //
   if (!$encoding)
   {
      global $default_charset;
      $encoding = $default_charset;
   }


   // decide once whether htmlspecialchars() knows this charset; the
   // comparison is strict so that numeric-looking names (e.g. '866.0')
   // cannot loosely match the numeric aliases in the list
   //
   $encoding_is_supported = in_array(strtolower($encoding),
                                     $htmlspecialchars_charsets, TRUE);


   // two ways to handle encodings not supported by htmlspecialchars() -
   // one takes less CPU cycles but can munge characters in certain
   // translations, the other is more exact but requires more resources
   //
   global $html_special_chars_extended_fix;
//FIXME: need to document that the config switch above can be enabled in config_local... but first, we need to decide if we will implement the second option here -- currently there hasn't been a need for it (munged characters seem quite rare).... see tracker #2806 for some tips https://sourceforge.net/p/squirrelmail/bugs/2806
   if (!$encoding_is_supported)
   {
      if ($html_special_chars_extended_fix)
      {
         // convert to utf-8 first, run htmlspecialchars() and convert
         // back to original encoding below
         //
//FIXME: try conversion functions in this order: recode_string(), iconv(), mbstring (with various charset checks: sq_mb_list_encodings(), mb_check_encoding) -- oh, first check for internal charset_decode_CHARSET() function?? or just use (does this put everything into HTML entities already? shouldn't, but if it does, return right here):
//NOTE(review): $encoding is left untouched on this path, so the unsupported charset name is still handed to htmlspecialchars() below -- confirm that is intended before this branch is finished
         $string = charset_decode($encoding, $string, TRUE, TRUE);
         $string = charset_encode($string, $encoding, TRUE);
      }
      else
      {
         // simply force use of an encoding that is supported (some
         // characters may be munged)
         //
         // use default from configuration if provided or hard-coded fallback
         //
         global $default_htmlspecialchars_encoding;
         if (!empty($default_htmlspecialchars_encoding))
            $encoding = $default_htmlspecialchars_encoding;
         else
            $encoding = 'iso-8859-1';
      }
   }


   // the $double_encode argument of htmlspecialchars()
   // is only accepted by PHP 5.2.3 and newer
   //
// TODO: Is adding this check an unnecessary performance hit?
   if (check_php_version(5, 2, 3))
      $ret = htmlspecialchars($string, $flags | $invalid_sequence_flag,
                              $encoding, $double_encode);
   else
      $ret = htmlspecialchars($string, $flags | $invalid_sequence_flag,
                              $encoding);


   // convert back to original encoding if needed (see above)
   //
   if ($html_special_chars_extended_fix && !$encoding_is_supported)
   {
//FIXME: NOT FINISHED - here, we'd convert from utf-8 back to original charset (if we obey $lossy_encoding and end up returning in utf-8 instead of original charset, does that screw up the caller?)
   }


   return $ret;
}
}
/**
- * Converts string from given charset to charset, that can be displayed by user translation.
+ * Converts a string from the given $charset to a character set that
+ * can be displayed by the current user interface language (translation)
*
- * Function by default returns html encoded strings, if translation uses different encoding.
+ * Function by default returns html encoded strings if translation uses
+ * different encoding.
* If Japanese translation is used - function returns string converted to euc-jp
* If iconv or recode functions are enabled and translation uses utf-8 - function returns utf-8 encoded string.
* If $charset is not supported - function returns unconverted string.
*
* sanitizing of html tags is also done by this function.
*
- * @param string $charset
+ * @param string $charset The charset of the incoming string
* @param string $string Text to be decoded
* @param boolean $force_decode converts string to html without $charset!=$default_charset check.
* Argument is available since 1.4.5 and 1.5.1.
// Don't do conversion if charset is the same.
if ( ! $force_decode && $charset == strtolower($default_charset) )
- return ($save_html ? $string : sm_encode_html_special_chars($string));
+ return ($save_html ? $string : sm_encode_html_special_chars($string, ENT_COMPAT, $charset));
// catch iso-8859-8-i thing
if ( $charset == "iso-8859-8-i" )
// other charsets can be converted to utf-8 without loss.
// and output string is smaller
$string = recode_string($charset . "..utf-8",$string);
- return ($save_html ? $string : sm_encode_html_special_chars($string));
+ return ($save_html ? $string : sm_encode_html_special_chars($string, ENT_COMPAT, $charset));
} else {
$string = recode_string($charset . "..html",$string);
// recode does not convert single quote, sm_encode_html_special_chars does.
// iconv functions does not have html target and can be used only with utf-8
if ( $use_php_iconv && $default_charset=='utf-8') {
$string = iconv($charset,$default_charset,$string);
- return ($save_html ? $string : sm_encode_html_special_chars($string));
+ return ($save_html ? $string : sm_encode_html_special_chars($string, ENT_COMPAT, $charset));
}
// If we don't use recode and iconv, we'll do it old way.
/* All HTML special characters are 7 bit and can be replaced first */
- if (! $save_html) $string = sm_encode_html_special_chars ($string);
+ if (! $save_html) $string = sm_encode_html_special_chars($string, ENT_COMPAT, $charset);
/* controls cpu and memory intensive decoding cycles */
if (! isset($aggressive_decoding) || $aggressive_decoding=="" ) {