From 98abf40863b317860ae2cabba04f97b69103f556 Mon Sep 17 00:00:00 2001 From: tokul Date: Sun, 27 Mar 2005 09:48:33 +0000 Subject: [PATCH] using mbstring functions for body wrapping, if they are available. own utf strlen implementation is removed because we need strlen, substr and strpos implementation in order to use it correctly. sqbodywrap function is modified to use sq_* functions instead of vanilla string functions. function calls that analyze string in bytes ($body{$position}) are replaced with string functions. closes #1043576 in devel. code is not that complex in stable and it needs only modified strlen calls. git-svn-id: https://svn.code.sf.net/p/squirrelmail/code/trunk/squirrelmail@9146 7612ce4b-ef26-0410-bec9-ea0150e637f0 --- functions/strings.php | 218 +++++++++++++++++++++++------------------- 1 file changed, 119 insertions(+), 99 deletions(-) diff --git a/functions/strings.php b/functions/strings.php index 49addede..84a70af4 100644 --- a/functions/strings.php +++ b/functions/strings.php @@ -77,8 +77,6 @@ function sqMakeNewLine (&$str, $citeLevel, &$column) { /** * Checks for spaces in strings - only used if PHP doesn't have native ctype support * - * @author Tomas Kuliavas - * * You might be able to rewrite the function by adding short evaluation form. 
* * possible problems: @@ -127,7 +125,7 @@ function &sqBodyWrap (&$body, $wrap) { $outString = ''; // current column since the last newline in the outstring $outStringCol = 0; - $length = strlen($body); + $length = sq_strlen($body); // where we are in the original string $pos = 0; // the number of >>> citation markers we are currently at @@ -139,12 +137,12 @@ function &sqBodyWrap (&$body, $wrap) { // we're at the beginning of a line, get the new cite level $newCiteLevel = 0; - while (($pos < $length) && ($body{$pos} == '>')) { + while (($pos < $length) && (sq_substr($body,$pos,1) == '>')) { $newCiteLevel++; $pos++; // skip over any spaces interleaved among the cite markers - while (($pos < $length) && ($body{$pos} == ' ')) { + while (($pos < $length) && (sq_substr($body,$pos,1) == ' ')) { $pos++; @@ -157,8 +155,8 @@ function &sqBodyWrap (&$body, $wrap) { // special case: if this is a blank line then maintain it // (i.e. try to preserve original paragraph breaks) // unless they occur at the very beginning of the text - if (($body{$pos} == "\n" ) && (strlen($outString) != 0)) { - $outStringLast = $outString{strlen($outString) - 1}; + if ((sq_substr($body,$pos,1) == "\n" ) && (sq_strlen($outString) != 0)) { + $outStringLast = $outString{sq_strlen($outString) - 1}; if ($outStringLast != "\n") { $outString .= "\n"; } @@ -192,7 +190,7 @@ function &sqBodyWrap (&$body, $wrap) { } // find the next newline -- we don't want to go further than that - $nextNewline = strpos ($body, "\n", $pos); + $nextNewline = sq_strpos ($body, "\n", $pos); if ($nextNewline === FALSE) { $nextNewline = $length; } @@ -201,7 +199,7 @@ function &sqBodyWrap (&$body, $wrap) { // will work fine for this. 
Maybe revisit this later though // (for completeness more than anything else, I think) if ($citeLevel == 0) { - $outString .= substr ($body, $pos, ($nextNewline - $pos)); + $outString .= sq_substr ($body, $pos, ($nextNewline - $pos)); $outStringCol = $nextNewline - $pos; if ($nextNewline != $length) { sqMakeNewLine ($outString, 0, $outStringCol); @@ -217,7 +215,7 @@ function &sqBodyWrap (&$body, $wrap) { // the next newline while ($pos < $nextNewline) { // skip over initial spaces - while (($pos < $nextNewline) && (ctype_space ($body{$pos}))) { + while (($pos < $nextNewline) && (ctype_space (sq_substr($body,$pos,1)))) { $pos++; } // if this is a short line then just append it and continue outer loop @@ -225,24 +223,24 @@ function &sqBodyWrap (&$body, $wrap) { // if this is the final line in the input string then include // any trailing newlines // echo substr($body,$pos,$wrap). "
"; - if (($nextNewline + 1 == $length) && ($body{$nextNewline} == "\n")) { + if (($nextNewline + 1 == $length) && (sq_substr($body,$nextNewline,1) == "\n")) { $nextNewline++; } // trim trailing spaces $lastRealChar = $nextNewline; - while (($lastRealChar > $pos && $lastRealChar < $length) && (ctype_space ($body{$lastRealChar}))) { + while (($lastRealChar > $pos && $lastRealChar < $length) && (ctype_space (sq_substr($body,$lastRealChar,1)))) { $lastRealChar--; } // decide if appending the short string is what we want - if (($nextNewline < $length && $body{$nextNewline} == "\n") && + if (($nextNewline < $length && sq_substr($body,$nextNewline,1) == "\n") && isset($lastRealChar)) { $mypos = $pos; //check the first word: - while (($mypos < $length) && ($body{$mypos} == '>')) { + while (($mypos < $length) && (sq_substr($body,$mypos,1) == '>')) { $mypos++; // skip over any spaces interleaved among the cite markers - while (($mypos < $length) && ($body{$mypos} == ' ')) { + while (($mypos < $length) && (sq_substr($body,$mypos,1) == ' ')) { $mypos++; } } @@ -255,15 +253,15 @@ function &sqBodyWrap (&$body, $wrap) { } */ - $firstword = substr($body,$mypos,strpos($body,' ',$mypos) - $mypos); + $firstword = sq_substr($body,$mypos,sq_strpos($body,' ',$mypos) - $mypos); //if ($dowrap || $ldnspacecnt > 1 || ($firstword && ( if (!$smartwrap || $firstword && ( $firstword{0} == '-' || $firstword{0} == '+' || $firstword{0} == '*' || - $firstword{0} == strtoupper($firstword{0}) || + sq_substr($firstword,0,1) == sq_strtoupper(sq_substr($firstword,0,1)) || strpos($firstword,':'))) { - $outString .= substr($body,$pos,($lastRealChar - $pos+1)); + $outString .= sq_substr($body,$pos,($lastRealChar - $pos+1)); $outStringCol += ($lastRealChar - $pos); sqMakeNewLine($outString,$citeLevel,$outStringCol); $nextNewline++; @@ -274,7 +272,7 @@ function &sqBodyWrap (&$body, $wrap) { } - $outString .= substr ($body, $pos, ($lastRealChar - $pos + 1)); + $outString .= sq_substr ($body, $pos, 
($lastRealChar - $pos + 1)); $outStringCol += ($lastRealChar - $pos); $pos = $nextNewline + 1; continue; @@ -293,7 +291,7 @@ function &sqBodyWrap (&$body, $wrap) { // start looking backwards for whitespace to break at. $breakPoint = $eol; - while (($breakPoint > $pos) && (! ctype_space ($body{$breakPoint}))) { + while (($breakPoint > $pos) && (! ctype_space (sq_substr($body,$breakPoint,1)))) { $breakPoint--; } @@ -326,13 +324,13 @@ function &sqBodyWrap (&$body, $wrap) { } // skip newlines or whitespace at the beginning of the string - $substring = substr ($body, $pos, ($breakPoint - $pos)); + $substring = sq_substr ($body, $pos, ($breakPoint - $pos)); $substring = rtrim ($substring); // do rtrim and ctype_space have the same ideas about whitespace? $outString .= $substring; - $outStringCol += strlen ($substring); + $outStringCol += sq_strlen ($substring); // advance past the whitespace which caused the wrap $pos = $breakPoint; - while (($pos < $length) && (ctype_space ($body{$pos}))) { + while (($pos < $length) && (ctype_space (sq_substr($body,$pos,1)))) { $pos++; } if ($pos < $length) { @@ -1065,6 +1063,7 @@ function sq_mb_list_encodings() { 'koi8-u', 'big5', 'gb2312', + 'gb18030', 'windows-1251', 'windows-1255', 'windows-1256', @@ -1094,8 +1093,9 @@ function sq_mb_list_encodings() { * Function returns number of characters in string. * * Returned number might be different from number of bytes in string, - * if $charset is multibyte charset. Currently only utf-8 charset is - * supported. + * if $charset is multibyte charset. Detection depends on mbstring + * functions. If mbstring does not support tested multibyte charset, + * vanilla string length function is used. 
* @param string $str string * @param string $charset charset * @since 1.5.1 @@ -1115,83 +1115,15 @@ function sq_strlen($str, $charset=''){ // lowercase charset name $charset=strtolower($charset); - // set initial returned length number - $real_length=0; + // Use mbstring only with listed charsets + $aList_of_mb_charsets=array('utf-8','big5','gb2312','gb18030','euc-jp','euc-cn','euc-tw','euc-kr'); // calculate string length according to charset - // function can be modulized same way we modulize decode/encode/htmlentities - if ($charset=='utf-8') { - if (function_exists('mb_strlen')) { - $real_length = mb_strlen($str,'utf-8'); - } else { - // function needs length of string in bytes. - // mbstring overloading might break it - $str_length=strlen($str); - $str_index=0; - while ($str_index < $str_length) { - // start of internal utf-8 multibyte character detection - if (preg_match("/[\xC0-\xDF]/",$str[$str_index]) && - isset($str[$str_index+1]) && - preg_match("/[\x80-\xBF]/",$str[$str_index+1])) { - // two byte utf-8 - $str_index=$str_index+2; - $real_length++; - } elseif (preg_match("/[\xE0-\xEF]/",$str[$str_index]) && - isset($str[$str_index+2]) && - preg_match("/[\x80-\xBF][\x80-\xBF]/",$str[$str_index+1].$str[$str_index+2])) { - // three byte utf-8 - $str_index=$str_index+3; - $real_length++; - } elseif (preg_match("/[\xF0-\xF7]/",$str[$str_index]) && - isset($str[$str_index+3]) && - preg_match("/[\x80-\xBF][\x80-\xBF][\x80-\xBF]/",$str[$str_index+1].$str[$str_index+2].$str[$str_index+3])) { - // four byte utf-8 - $str_index=$str_index+4; - $real_length++; - } elseif (preg_match("/[\xF8-\xFB]/",$str[$str_index]) && - isset($str[$str_index+4]) && - preg_match("/[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/", - $str[$str_index+1].$str[$str_index+2].$str[$str_index+3].$str[$str_index+4])) { - // five byte utf-8 - $str_index=$str_index+5; - $real_length++; - } elseif (preg_match("/[\xFC-\xFD]/",$str[$str_index]) && - isset($str[$str_index+5]) && - 
preg_match("/[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/", - $str[$str_index+1].$str[$str_index+2].$str[$str_index+3].$str[$str_index+4].$str[$str_index+5])) { - // six byte utf-8 - $str_index=$str_index+6; - $real_length++; - } else { - $str_index++; - $real_length++; - } - // end of internal utf-8 multibyte character detection - } - } - // end of utf-8 length detection - } elseif ($charset=='big5') { - // TODO: add big5 string length detection - $real_length=strlen($str); - } elseif ($charset=='gb2312') { - // TODO: add gb2312 string length detection - $real_length=strlen($str); - } elseif ($charset=='gb18030') { - // TODO: add gb18030 string length detection - $real_length=strlen($str); - } elseif ($charset=='euc-jp') { - // TODO: add euc-jp string length detection - $real_length=strlen($str); - } elseif ($charset=='euc-cn') { - // TODO: add euc-cn string length detection - $real_length=strlen($str); - } elseif ($charset=='euc-tw') { - // TODO: add euc-tw string length detection - $real_length=strlen($str); - } elseif ($charset=='euc-kr') { - // TODO: add euc-kr string length detection - $real_length=strlen($str); + if (in_array($charset,$aList_of_mb_charsets) && in_array($charset,sq_mb_list_encodings())) { + $real_length = mb_strlen($str,$charset); } else { + // own strlen detection code is removed because missing strpos, + // strtoupper and substr implementations break string wrapping. $real_length=strlen($str); } return $real_length; @@ -1229,5 +1161,93 @@ function sq_str_pad($string, $width, $pad, $padtype, $charset='') { } return $padded_string; } + +/** + * Wrapper that is used to switch between vanilla and multibyte substr + * functions. 
+ * @param string $string + * @param integer $start + * @param integer $length + * @param string $charset + * @return string + * @since 1.5.1 + * @link http://www.php.net/substr + * @link http://www.php.net/mb_substr + */ +function sq_substr($string,$start,$length,$charset='auto') { + // use automatic charset detection, if function call asks for it + if ($charset=='auto') { + global $default_charset; + set_my_charset(); + $charset=$default_charset; + } + $charset = strtolower($charset); + if (function_exists('mb_internal_encoding') && + in_array($charset,sq_mb_list_encodings())) { + return mb_substr($string,$start,$length,$charset); + } + // TODO: add mbstring independent code + + // use vanilla string functions as last option + return substr($string,$start,$length); +} + +/** + * Wrapper that is used to switch between vanilla and multibyte strpos + * functions. + * @param string $haystack + * @param mixed $needle + * @param integer $offset + * @param string $charset + * @return integer|false position of needle in haystack, or false if it is not found + * @since 1.5.1 + * @link http://www.php.net/strpos + * @link http://www.php.net/mb_strpos + */ +function sq_strpos($haystack,$needle,$offset,$charset='auto') { + // use automatic charset detection, if function call asks for it + if ($charset=='auto') { + global $default_charset; + set_my_charset(); + $charset=$default_charset; + } + $charset = strtolower($charset); + if (function_exists('mb_internal_encoding') && + in_array($charset,sq_mb_list_encodings())) { + return mb_strpos($haystack,$needle,$offset,$charset); + } + // TODO: add mbstring independent code + + // use vanilla string functions as last option + return strpos($haystack,$needle,$offset); +} + +/** + * Wrapper that is used to switch between vanilla and multibyte strtoupper + * functions. 
+ * @param string $string + * @param string $charset + * @return string + * @since 1.5.1 + * @link http://www.php.net/strtoupper + * @link http://www.php.net/mb_strtoupper + */ +function sq_strtoupper($string,$charset='auto') { + // use automatic charset detection, if function call asks for it + if ($charset=='auto') { + global $default_charset; + set_my_charset(); + $charset=$default_charset; + } + $charset = strtolower($charset); + if (function_exists('mb_internal_encoding') && + in_array($charset,sq_mb_list_encodings())) { + return mb_strtoupper($string,$charset); + } + // TODO: add mbstring independent code + + // use vanilla string functions as last option + return strtoupper($string); +} $PHP_SELF = php_self(); ?> \ No newline at end of file -- 2.25.1