[squirrelmail.git] / functions / url_parser.php

<?php

/**
 * url_parser.php
 *
 * This code provides various string manipulation functions that are
 * used by the rest of the SquirrelMail code.
 *
 * @copyright 1999-2022 The SquirrelMail Project Team
 * @license http://opensource.org/licenses/gpl-license.php GNU Public License
 * @version $Id$
 * @package squirrelmail
 */

/**
 * Replaces a byte range of a string, in place.
 *
 * Everything from offset $start up to (but not including) offset $end is
 * removed from $in and $replace is put in its place.
 *
 * @param string  $in      the string to modify, by ref
 * @param string  $replace the text that takes the place of the removed range
 * @param integer $start   offset of the first byte to remove
 * @param integer $end     offset of the first byte that is kept again
 * @return void
 */
function replaceBlock (&$in, $replace, $start, $end) {
    $head = substr($in, 0, $start);
    $tail = substr($in, $end, strlen($in) - $end);

    $in = $head . $replace . $tail;
}

/* Pattern fragments for IP addresses, host names and email addresses.
 * They are defined in one spot so that a change to a pattern only has to
 * be made here.
 *
 * The fragments carry no delimiters and no modifiers: the code that uses
 * them wraps them in /.../ and must add the "i" modifier, because only
 * lowercase letters are listed in the character classes.
 *
 * RFC2822 (and RFC822) defines the left side of an email address as (roughly):
 *  1*atext *("." 1*atext)
 * where atext is: a-zA-Z0-9!#$%&'*+-/=?^_`{|}~
 *
 * Here's pretty sophisticated IP matching:
 * $IPMatch = '(2[0-5][0-9]|1?[0-9]{1,2})';
 * $IPMatch = '\[?' . $IPMatch . '(\.' . $IPMatch . '){3}\]?';
 */
/* Here's enough: four groups of 1-3 digits, optionally in square brackets.
 * The range of each group is not checked. */
global $IP_RegExp_Match, $Host_RegExp_Match, $Email_RegExp_Match;
//FIXME: these patterns were written for ereg() and are now used with preg functions; they still need tests that confirm they behave the same
$IP_RegExp_Match = '\\[?[0-9]{1,3}(\\.[0-9]{1,3}){3}\\]?';
// A host is either an IP address (see above) or a dotted name that ends in a
// top level label of at least two letters.
$Host_RegExp_Match = '(' . $IP_RegExp_Match .
    '|[0-9a-z]([-.]?[0-9a-z])*\\.[a-z][a-z]+)';
// NB: the backslash in front of the forward slash escapes it, which assumes
// that the regular expression will be enclosed in /.../
// One "atext" unit is a single allowed character or the html entity &amp;.
// The apostrophe from the RFC's atext list is not part of the class.
$atext = '([a-z0-9!#$&%*+\/=?^_`{|}~-]|&amp;)';
// One or more atext runs separated by single dots.
$dot_atom = $atext . '+(\.' . $atext . '+)*';
// local part, an optional "%host" part, then "@host"
$Email_RegExp_Match = $dot_atom . '(%' . $Host_RegExp_Match . ')?@' .
                      $Host_RegExp_Match;

/**
 * Parses a body and converts all found email addresses to clickable links.
 *
 * Every match of $Email_RegExp_Match is replaced exactly once, at the
 * offset where it was matched. Replacing by text (str_replace) is not safe:
 * an address that also occurs inside a longer address, e.g.
 * "joe@example.com" inside "bob.joe@example.com", would be rewritten there
 * as well and the longer address would end up linked to the wrong recipient.
 *
 * @param string body the body to process, by ref
 * @return int the number of unique addresses found
 */
function parseEmail (&$body) {
    global $Email_RegExp_Match;

    /* Find all the email addresses in the body, together with their offsets */
    if (!preg_match_all('/' . $Email_RegExp_Match . '/i', $body, $found, PREG_OFFSET_CAPTURE)) {
        /* no address (or a regexp error): leave the body alone */
        return 0;
    }

    $links  = array();   /* matched text => compose link, built once per unique address */
    $linked = '';        /* the body that is being rebuilt */
    $cursor = 0;         /* offset of the first byte of $body not copied yet */

    foreach ($found[0] as $match) {
        $text   = $match[0];
        $offset = $match[1];

        if (!isset($links[$text])) {
            /* undo the &amp; entity before the address goes into the URL */
            $email = strtr($text, array('&amp;' => '&'));
            $links[$text] = makeComposeLink('src/compose.php?send_to='.urlencode($email), $text);
        }

        /* copy the text in front of the address, then the link itself */
        $linked .= substr($body, $cursor, $offset - $cursor) . $links[$text];
        $cursor  = $offset + strlen($text);
    }

    /* whatever follows the last address */
    $body = $linked . substr($body, $cursor);

    /* Return number of unique addresses found */
    return count($links);
}


/* These lists are needed for every line that is parsed, so they are
 * built a single time here instead of inside parseUrl().
 */
global $url_parser_url_tokens, $url_parser_poss_ends;

/* Lowercase prefixes that mark the start of a URL. */
$url_parser_url_tokens = array(
    'http://',
    'https://',
    'ftp://',
    'telnet:',  // Special case -- doesn't need the slashes
    'mailto:',  // Special case -- doesn't use the slashes
    'gopher://',
    'news://',
);

/* Strings that end a URL; the nearest one after the token wins. */
$url_parser_poss_ends = array(
    ' ', "\n", "\r",
    '<', '>',
    ".\r", ".\n", '.&nbsp;',
    '&nbsp;',
    ')', '(',
    '&quot;', '&lt;', '&gt;',
    '.<',
    ']', '[', '{', '}',
    "\240",
    ', ', '. ', ",\n", ",\r",
);


/**
 * rfc 2368 (mailto URL) preg_match() regexp
 *
 * The pattern is applied to the text that follows "mailto:". Unlike the
 * other patterns in this file it already carries its delimiters and the
 * "i" modifier. Every part of it is optional, so it can match an empty
 * string.
 *
 * Capture groups that are meant to be read:
 *  - group 1:  the address part in front of the parameters (may be empty)
 *  - group 10: the parameters - an optional "?name=value" followed by any
 *              number of "&amp;name=value", where name is one of to, cc,
 *              bcc, subject or body
 * Groups 2 to 9 are the groups contained in $Mailto_Email_RegExp (which
 * embeds $Host_RegExp_Match twice). Adding or removing a group in any of
 * these patterns shifts the index of the parameter group.
 *
 * @link http://www.ietf.org/rfc/rfc2368.txt
 * @global string MailTo_PReg_Match the encapsulated regexp for preg_match()
 */
global $MailTo_PReg_Match;
// address as accepted inside a mailto URL: local part, optional "%host", "@host"
$Mailto_Email_RegExp = '[0-9a-z%]([-_.+%]?[0-9a-z])*(%' . $Host_RegExp_Match . ')?@' . $Host_RegExp_Match;
$MailTo_PReg_Match = '/((?:' . $Mailto_Email_RegExp . ')*)((?:\?(?:to|cc|bcc|subject|body)=[^\s\?&=,()]+)?(?:&amp;(?:to|cc|bcc|subject|body)=[^\s\?&=,()]+)*)/i';

/**
 * Parses a body and converts all found URLs to clickable links.
 *
 * The body is scanned from left to right. On every pass the nearest token
 * from $url_parser_url_tokens is located (case insensitively), the text in
 * front of it is handed to parseEmail(), and the token is processed:
 *  - "mailto:" URLs are matched with $MailTo_PReg_Match and replaced with
 *    a link built by makeComposeLink();
 *  - any other URL runs up to the nearest string from
 *    $url_parser_poss_ends (or the first 8bit character) and is wrapped in
 *    an <a target="_blank"> element.
 *
 * @param string body the body to process, by ref
 * @return void
 */
function parseUrl (&$body) {
    global $url_parser_poss_ends, $url_parser_url_tokens, $MailTo_PReg_Match;
    $start      = 0;
    $blength    = strlen($body);

    while ($start < $blength) {
        $target_token = '';
        $target_pos = $blength;

        /* Find the first token to replace. The body only changes between
         * passes of this loop, so it is lowercased once per pass and not
         * once per token. */
        $lower_body = strtolower($body);
        foreach ($url_parser_url_tokens as $the_token) {
            $pos = strpos($lower_body, $the_token, $start);
            if (is_int($pos) && $pos < $target_pos) {
                $target_pos   = $pos;
                $target_token = $the_token;
            }
        }

        /* Look for email addresses between $start and $target_pos */
        $check_str = substr($body, $start, $target_pos-$start);

        if (parseEmail($check_str)) {
            replaceBlock($body, $check_str, $start, $target_pos);
            /* the inserted links made the body longer; re-sync the offsets */
            $blength    = strlen($body);
            $target_pos = strlen($check_str) + $start;
        }

        // rfc 2368 (mailto URL)
        if ($target_token == 'mailto:') {
            /* The token was found case insensitively; keep the spelling that
             * is used in the message for the visible link text. */
            $mailto_token = substr($body, $target_pos, 7);
            $target_pos += 7;    //skip mailto:
            $end = $blength;

            $mailto = substr($body, $target_pos, $end-$target_pos);

            if ((preg_match($MailTo_PReg_Match, $mailto, $regs)) && ($regs[0] != '')) {
                $mailto_before = $mailto_token . $regs[0];
                /**
                 * '+' characters in a mailto URI don't need to be percent-encoded.
                 * However, when mailto URI data is transported via HTTP, '+' must
                 * be percent-encoded as %2B so that when the HTTP data is
                 * percent-decoded, you get '+' back and not a space.
                 */
                /* group 10 of $MailTo_PReg_Match holds the "?name=value..." part */
                $mailto_params = str_replace("+", "%2B", $regs[10]);
                if ($regs[1]) {    //if there is an email addr before '?', we need to merge it with the params
                    $to = 'to=' . str_replace("+", "%2B", $regs[1]);
                    if (strpos($mailto_params, 'to=') !== false) {
                        //already a 'to=': put the address in front of its value ("%2C%20" is ", ")
                        $mailto_params = str_replace('to=', $to . '%2C%20', $mailto_params);
                    } elseif ($mailto_params) {
                        //already some params, append to them
                        $mailto_params .= '&amp;' . $to;
                    } else {
                        $mailto_params .= '?' . $to;
                    }
                }
                /* rename the mailto parameters to the names used in the compose.php URL */
                $url_str = preg_replace(array('/to=/i', '/(?<!b)cc=/i', '/bcc=/i'), array('send_to=', 'send_to_cc=', 'send_to_bcc='), $mailto_params);
                $comp_uri = makeComposeLink('src/compose.php' . $url_str, $mailto_before);
                /* replace "mailto:" plus the matched text, then continue behind the link */
                replaceBlock($body, $comp_uri, $target_pos - 7, $target_pos + strlen($regs[0]));
                $target_pos += strlen($comp_uri) - 7;
            }
        }
        /* If there was a token to replace, replace it */
        elseif ($target_token != '') {
            /* Find the end of the URL */
            $end = $blength;
            foreach ($url_parser_poss_ends as $val) {
                $enda = strpos($body, $val, $target_pos);
                if (is_int($enda) && $enda < $end) {
                    $end = $enda;
                }
            }

            /* make sure that there are no 8bit chars between $target_pos and suspected end of URL */
            if (!is_bool($first8bit=sq_strpos_8bit($body,$target_pos,$end))) {
                $end = $first8bit;
            }

            /* Extract URL */
            $url = substr($body, $target_pos, $end-$target_pos);

            /* Needed since lines are not passed with \n or \r */
            while ( preg_match('/[,.]$/', $url) ) {
                $url = substr( $url, 0, -1 );
                $end--;
            }

            /* Replace URL with HyperLinked Url, requires 1 char in link */
            if ($url != '' && $url != $target_token) {
                $url_str = "<a href=\"$url\" target=\"_blank\">$url</a>";
                replaceBlock($body,$url_str,$target_pos,$end);
                $target_pos += strlen($url_str);
            }
            else {
                // Not quite a valid link, skip ahead to next chance
                $target_pos += strlen($target_token);
            }
        }

        /* Move forward */
        $start   = $target_pos;
        $blength = strlen($body);
    }
}

/**
 * Parses a string and returns the first e-mail address found.
 *
 * The address is matched with $Email_RegExp_Match (case insensitively) and
 * any "&amp;" entity inside it is turned back into "&". The search stops
 * at the first match; the rest of the string is not scanned.
 *
 * @param string string the string to process
 * @return string the first e-mail address found, or an empty string if
 *                the string contains no address
 */
function getEmail($string) {
    global $Email_RegExp_Match;

    if (preg_match('/' . $Email_RegExp_Match . '/i', $string, $regs)) {
        return strtr($regs[0], array('&amp;' => '&'));
    }

    /* no address was found */
    return '';
}

/**
 * Finds first occurrence of 8bit data in the string
 *
 * Function finds first 8bit symbol or html entity that represents 8bit character.
 * Search start is defined by $offset argument. Search ends at $maxlength position.
 * If $maxlength is not defined or bigger than provided string, search ends when
 * string ends.
 *
 * Three kinds of html references are recognized:
 *  1. "&#(decimal number);" where the number is bigger than 127
 *  2. "&#x(hexadecimal number);" where the number is bigger than 0x7f
 *  3. any html character entity that is not a 7bit html special char.
 *
 * Check returned data type in order to avoid confusion between bool(false)
 * (not found) and int(0) (first char in the string).
 * @param string $haystack
 * @param integer $offset
 * @param integer $maxlength
 * @return mixed integer with first 8bit character position or boolean false
 * @since 1.5.2
 */
function sq_strpos_8bit($haystack,$offset=0,$maxlength=false) {
    $length = strlen($haystack);
    if ($maxlength===false || $length < $maxlength) {
        $maxlength = $length;
    }

    for ($i=$offset; $i<$maxlength; $i++) {
        $char = $haystack[$i];

        /* byte with the high bit set: we have 8bit char, return its position */
        if (ord($char) > 127) {
            return $i;
        }

        /* only an ampersand can start an html reference */
        if ($char != '&') {
            continue;
        }

        $substring = substr($haystack,$i);
        /**
         * The hexadecimal digits are converted with hexdec() before they are
         * compared; comparing the digit string itself against "\x7f" is a
         * string comparison that no hexadecimal digit can win.
         *
         * Named entities use own sq_get_html_translation_table() function
         * with 'utf-8' character set in order to get all html entities.
         */
        if ((preg_match('/^&#(\d+);/',$substring,$match) && $match[1]>127) ||
            (preg_match('/^&#x([0-9a-f]+);/i',$substring,$match) && hexdec($match[1])>127) ||
            (preg_match('/^&([a-z]+);/i',$substring,$match) &&
             !in_array($match[0],get_html_translation_table(HTML_SPECIALCHARS)) &&
             in_array($match[0],sq_get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'utf-8')))) {
            return $i;
        }
    }

    /* no 8bit data between $offset and $maxlength */
    return false;
}
Commit	Line	Data
59177427	1	<?php
43fcef5c	2
35586184	3	/**
	4	* url_parser.php
	5	*
35586184	6	* This code provides various string manipulation functions that are
598294a7	7	* used by the rest of the SquirrelMail code.
35586184	8	*
77a1e3d1	9	* @copyright 1999-2022 The SquirrelMail Project Team
4b4abf93	10	* @license http://opensource.org/licenses/gpl-license.php GNU Public License
31841a9e	11	* @version $Id$
d6c32258	12	* @package squirrelmail
35586184	13	*/
43fcef5c	14
d6c32258	15	/**
	16	* Undocumented - complain, then patch.
	17	*/
35586184	18	function replaceBlock (&$in, $replace, $start, $end) {
	19	$begin = substr($in,0,$start);
	20	$end = substr($in,$end,strlen($in)-$end);
	21	$in = $begin.$replace.$end;
	22	}
43fcef5c	23
01d27858	24	/* Having this defined in just one spot could help when changes need
	25	* to be made to the pattern
	26	* Make sure that the expression is evaluated case insensitively
7e235a1a	27	*
ff18cccd	28	* RFC2822 (and RFC822) defines the left side of an email address as (roughly):
	29	* 1atext ("." 1*atext)
	30	* where atext is: a-zA-Z0-9!#$%&'*+-/=?^_`{\|}~
	31	*
01d27858	32	* Here's pretty sophisticated IP matching:
	33	* $IPMatch = '(2[0-5][0-9]\|1?[0-9]{1,2})';
	34	* $IPMatch = '\[?' . $IPMatch . '(\.' . $IPMatch . '){3}\]?';
	35	*/
	36	/* Here's enough: */
	37	global $IP_RegExp_Match, $Host_RegExp_Match, $Email_RegExp_Match;
a6ec6dff	38	//FIXME: these were written for use in an ereg().... they are now being used in preg()... we need to run some tests to make sure they are fully working still
01d27858	39	$IP_RegExp_Match = '\\[?[0-9]{1,3}(\\.[0-9]{1,3}){3}\\]?';
7e235a1a	40	$Host_RegExp_Match = '(' . $IP_RegExp_Match .
01d27858	41	'\|[0-9a-z]([-.]?[0-9a-z])*\\.[a-z][a-z]+)';
a6ec6dff	42	// NB: the backslash in the following line escapes the forward slash, which assumes that the regular expression will be enclosed in /.../
a6ec6dff	43	$atext = '([a-z0-9!#$&%*+\/=?^_`{\|}~-]\|&)';
ff18cccd	44	$dot_atom = $atext . '+(\.' . $atext . '+)*';
	45	$Email_RegExp_Match = $dot_atom . '(%' . $Host_RegExp_Match . ')?@' .
	46	$Host_RegExp_Match;
7e235a1a	47
8b096f0a	48	/**
	49	* Parses a body and converts all found email addresses to clickable links.
	50	*
	51	* @param string body the body to process, by ref
	52	* @return int the number of unique addresses found
	53	*/
01d27858	54	function parseEmail (&$body) {
916669ad	55	global $Email_RegExp_Match;
cf0d436a	56	$sbody = $body;
	57	$addresses = array();
	58
	59	/* Find all the email addresses in the body */
b7910e12	60	while (preg_match('/' . $Email_RegExp_Match . '/i', $sbody, $regs)) {
ff18cccd	61	$addresses[$regs[0]] = strtr($regs[0], array('&' => '&'));
cf0d436a	62	$start = strpos($sbody, $regs[0]) + strlen($regs[0]);
	63	$sbody = substr($sbody, $start);
	64	}
916669ad	65
cf0d436a	66	/* Replace each email address with a compose URL */
ff18cccd	67	foreach ($addresses as $text => $email) {
	68	$comp_uri = makeComposeLink('src/compose.php?send_to='.urlencode($email), $text);
	69	$body = str_replace($text, $comp_uri, $body);
7e235a1a	70	}
916669ad	71
cf0d436a	72	/* Return number of unique addresses found */
cf0d436a	73	return count($addresses);
01d27858	74	}
43fcef5c	75
43fcef5c	76
01d27858	77	/* We don't want to re-initialize this stuff for every line. Save work
	78	* and just do it once here.
	79	*/
	80	global $url_parser_url_tokens;
	81	$url_parser_url_tokens = array(
	82	'http://',
	83	'https://',
	84	'ftp://',
	85	'telnet:', // Special case -- doesn't need the slashes
7efaee4f	86	'mailto:', // Special case -- doesn't use the slashes
01d27858	87	'gopher://',
01d27858	88	'news://');
8f7163e7	89
01d27858	90	global $url_parser_poss_ends;
62f7daa5	91	$url_parser_poss_ends = array(' ', "\n", "\r", '<', '>', ".\r", ".\n",
62f7daa5	92	'. ', ' ', ')', '(', '"', '<', '>', '.<',
01d27858	93	']', '[', '{', '}', "\240", ', ', '. ', ",\n", ",\r");
8f7163e7	94
20a60f89	95
7efaee4f	96	/**
7efaee4f	97	* rfc 2368 (mailto URL) preg_match() regexp
99554426	98	* @link http://www.ietf.org/rfc/rfc2368.txt
7efaee4f	99	* @global string MailTo_PReg_Match the encapsulated regexp for preg_match()
	100	*/
	101	global $MailTo_PReg_Match;
	102	$Mailto_Email_RegExp = '[0-9a-z%]([-_.+%]?[0-9a-z])*(%' . $Host_RegExp_Match . ')?@' . $Host_RegExp_Match;
	103	$MailTo_PReg_Match = '/((?:' . $Mailto_Email_RegExp . '))((?:\?(?:to\|cc\|bcc\|subject\|body)=[^\s\?&=,()]+)?(?:&(?:to\|cc\|bcc\|subject\|body)=[^\s\?&=,()]+))/i';
	104
8b096f0a	105	/**
	106	* Parses a body and converts all found URLs to clickable links.
	107	*
	108	* @param string body the body to process, by ref
	109	* @return void
	110	*/
01d27858	111	function parseUrl (&$body) {
99554426	112	global $url_parser_poss_ends, $url_parser_url_tokens;
cf0d436a	113	$start = 0;
cf0d436a	114	$blength = strlen($body);
cf0d436a	115
7efaee4f	116	while ($start < $blength) {
8f7163e7	117	$target_token = '';
0d3ff000	118	$target_pos = $blength;
cf0d436a	119
01d27858	120	/* Find the first token to replace */
	121	foreach ($url_parser_url_tokens as $the_token) {
	122	$pos = strpos(strtolower($body), $the_token, $start);
6f1450f4	123	if (is_int($pos) && $pos < $target_pos) {
cf0d436a	124	$target_pos = $pos;
01d27858	125	$target_token = $the_token;
01d27858	126	}
8f7163e7	127	}
cf0d436a	128
01d27858	129	/* Look for email addresses between $start and $target_pos */
cf0d436a	130	$check_str = substr($body, $start, $target_pos-$start);
cf0d436a	131
01d27858	132	if (parseEmail($check_str)) {
01d27858	133	replaceBlock($body, $check_str, $start, $target_pos);
cf0d436a	134	$blength = strlen($body);
01d27858	135	$target_pos = strlen($check_str) + $start;
8f7163e7	136	}
e2ef6f4b	137
879c8945	138	// rfc 2368 (mailto URL)
879c8945	139	if ($target_token == 'mailto:') {
91e0dccc	140	$target_pos += 7; //skip mailto:
7efaee4f	141	$end = $blength;
	142
	143	$mailto = substr($body, $target_pos, $end-$target_pos);
	144
	145	global $MailTo_PReg_Match;
99554426	146	if ((preg_match($MailTo_PReg_Match, $mailto, $regs)) && ($regs[0] != '')) {
7efaee4f	147	//sm_print_r($regs);
7efaee4f	148	$mailto_before = $target_token . $regs[0];
84edf699	149	/**
	150	* '+' characters in a mailto URI don't need to be percent-encoded.
	151	* However, when mailto URI data is transported via HTTP, '+' must
	152	* be percent-encoded as %2B so that when the HTTP data is
	153	* percent-decoded, you get '+' back and not a space.
	154	*/
	155	$mailto_params = str_replace("+", "%2B", $regs[10]);
91e0dccc	156	if ($regs[1]) { //if there is an email addr before '?', we need to merge it with the params
84edf699	157	$to = 'to=' . str_replace("+", "%2B", $regs[1]);
91e0dccc	158	if (strpos($mailto_params, 'to=') > -1) //already a 'to='
7efaee4f	159	$mailto_params = str_replace('to=', $to . '%2C%20', $mailto_params);
7efaee4f	160	else {
91e0dccc	161	if ($mailto_params) //already some params, append to them
99554426	162	$mailto_params .= '&' . $to;
7efaee4f	163	else
99554426	164	$mailto_params .= '?' . $to;
7efaee4f	165	}
7efaee4f	166	}
1bc52de5	167	$url_str = preg_replace(array('/to=/i', '/(?<!b)cc=/i', '/bcc=/i'), array('send_to=', 'send_to_cc=', 'send_to_bcc='), $mailto_params);
7efaee4f	168	$comp_uri = makeComposeLink('src/compose.php' . $url_str, $mailto_before);
99554426	169	replaceBlock($body, $comp_uri, $target_pos - 7, $target_pos + strlen($regs[0]));
99554426	170	$target_pos += strlen($comp_uri) - 7;
7efaee4f	171	}
	172	}
	173	else
879c8945	174	/* If there was a token to replace, replace it */
01d27858	175	if ($target_token != '') {
01d27858	176	/* Find the end of the URL */
cf0d436a	177	$end = $blength;
	178	foreach ($url_parser_poss_ends as $val) {
	179	$enda = strpos($body, $val, $target_pos);
01d27858	180	if (is_int($enda) && $enda < $end) {
	181	$end = $enda;
	182	}
	183	}
cf0d436a	184
1f18e313	185	/* make sure that there are no 8bit chars between $target_pos and suspected end of URL */
	186	if (!is_bool($first8bit=sq_strpos_8bit($body,$target_pos,$end))) {
	187	$end = $first8bit;
	188	}
	189
01d27858	190	/* Extract URL */
01d27858	191	$url = substr($body, $target_pos, $end-$target_pos);
cf0d436a	192
01d27858	193	/* Needed since lines are not passed with \n or \r */
b7910e12	194	while ( preg_match('/[,.]$/', $url) ) {
01d27858	195	$url = substr( $url, 0, -1 );
	196	$end--;
	197	}
1b3324b3	198
01d27858	199	/* Replace URL with HyperLinked Url, requires 1 char in link */
	200	if ($url != '' && $url != $target_token) {
	201	$url_str = "<a href=\"$url\" target=\"_blank\">$url</a>";
	202	replaceBlock($body,$url_str,$target_pos,$end);
	203	$target_pos += strlen($url_str);
cf0d436a	204	}
	205	else {
	206	// Not quite a valid link, skip ahead to next chance
	207	$target_pos += strlen($target_token);
01d27858	208	}
8f7163e7	209	}
cf0d436a	210
01d27858	211	/* Move forward */
cf0d436a	212	$start = $target_pos;
cf0d436a	213	$blength = strlen($body);
01d27858	214	}
62f7daa5	215	}
916669ad	216
	217	/**
	218	* Parses a string and returns the first e-mail address found.
	219	*
	220	* @param string string the string to process
	221	* @return string the first e-mail address found
	222	*/
	223	function getEmail($string) {
	224	global $Email_RegExp_Match;
	225	$addresses = array();
	226
	227	/* Find all the email addresses in the body */
b7910e12	228	while (preg_match('/' . $Email_RegExp_Match . '/i', $string, $regs)) {
916669ad	229	$addresses[$regs[0]] = strtr($regs[0], array('&' => '&'));
91e0dccc	230	$start = strpos($string, $regs[0]) + strlen($regs[0]);
91e0dccc	231	$string = substr($string, $start);
916669ad	232	}
	233
	234	/* Return the first address, or an empty string if no address was found */
	235	$addresses = array_values($addresses);
	236	return (array_key_exists(0, $addresses) ? $addresses[0] : '');
	237	}
	238
1f18e313	239	/**
	240	* Finds first occurrence of 8bit data in the string
	241	*
	242	* Function finds first 8bit symbol or html entity that represents 8bit character.
	243	* Search start is defined by $offset argument. Search ends at $maxlength position.
	244	* If $maxlength is not defined or bigger than provided string, search ends when
	245	* string ends.
	246	*
	247	* Check returned data type in order to avoid confusion between bool(false)
	248	* (not found) and int(0) (first char in the string).
	249	* @param string $haystack
	250	* @param integer $offset
	251	* @param integer $maxlength
	252	* @return mixed integer with first 8bit character position or boolean false
	253	* @since 1.5.2
	254	*/
	255	function sq_strpos_8bit($haystack,$offset=0,$maxlength=false) {
	256	$ret = false;
	257
	258	if ($maxlength===false \|\| strlen($haystack) < $maxlength) {
	259	$maxlength=strlen($haystack);
	260	}
	261
	262	for($i=$offset;$i<$maxlength;$i++) {
	263	/* rh7-8 compatibility. don't use full 8bit range in regexp */
	264	if (preg_match('/[\200-\237]\|\240\|[\241-\377]/',$haystack[$i])) {
	265	/* we have 8bit char. stop here and return position */
	266	$ret = $i;
	267	break;
	268	} elseif ($haystack[$i]=='&') {
	269	$substring = substr($haystack,$i);
	270	/**
	271	* 1. look for "&#(decimal number);" where decimal_number is bigger than 127
	272	* 2. look for "&x(hexadecimal number);", where hex number is bigger than x7f
	273	* 3. look for any html character entity that is not 7bit html special char. Use
	274	* own sq_get_html_translation_table() function with 'utf-8' character set in
	275	* order to get all html entities.
	276	*/
	277	if ((preg_match('/^&#(\d+);/',$substring,$match) && $match[1]>127) \|\|
	278	(preg_match('/^&x([0-9a-f]+);/i',$substring,$match) && $match[1]>"\x7f") \|\|
	279	(preg_match('/^&([a-z]+);/i',$substring,$match) &&
	280	!in_array($match[0],get_html_translation_table(HTML_SPECIALCHARS)) &&
	281	in_array($match[0],sq_get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'utf-8')))) {
	282	$ret = $i;
	283	break;
	284	}
	285	}
	286	}
	287	return $ret;
	288	}