Quote dynamic regex contents to be safe. Thanks to Daniel Hahler.
[squirrelmail.git] / functions / url_parser.php
CommitLineData
<?php

/**
 * url_parser.php
 *
 * This code provides various string manipulation functions that are
 * used by the rest of the SquirrelMail code.
 *
 * @copyright 1999-2009 The SquirrelMail Project Team
 * @license http://opensource.org/licenses/gpl-license.php GNU Public License
 * @version $Id$
 * @package squirrelmail
 */
43fcef5c 14
/**
 * Replaces a byte range of a string with new text, in place.
 *
 * Everything before offset $start and everything from offset $end onwards
 * is kept; the bytes in between are replaced by $replace.
 *
 * @param string  $in      the string to modify, by ref
 * @param string  $replace the text inserted between the kept head and tail
 * @param integer $start   offset at which the replaced range begins
 * @param integer $end     offset of the first byte kept after the range
 * @return void
 */
function replaceBlock (&$in, $replace, $start, $end) {
    $head = substr($in, 0, $start);
    $tail = substr($in, $end, strlen($in) - $end);
    $in = $head . $replace . $tail;
}
43fcef5c 23
/* Having this defined in just one spot could help when changes need
 * to be made to the pattern
 * Make sure that the expression is evaluated case insensitively
 *
 * RFC2822 (and RFC822) defines the left side of an email address as (roughly):
 *  1*atext *("." 1*atext)
 * where atext is: a-zA-Z0-9!#$%&'*+-/=?^_`{|}~
 *
 * Here's pretty sophisticated IP matching:
 * $IPMatch = '(2[0-5][0-9]|1?[0-9]{1,2})';
 * $IPMatch = '\[?' . $IPMatch . '(\.' . $IPMatch . '){3}\]?';
 */
/* Here's enough: */
global $IP_RegExp_Match, $Host_RegExp_Match, $Email_RegExp_Match;
//FIXME: these were written for use in an ereg().... they are now being used in preg()... we need to run some tests to make sure they are fully working still

// Dotted quad, optionally wrapped in square brackets.
$IP_RegExp_Match = '\\[?[0-9]{1,3}(\\.[0-9]{1,3}){3}\\]?';

// Either an IP (as above) or a host name ending in an alphabetic label of
// at least two characters.
$Host_RegExp_Match = '(' . $IP_RegExp_Match .
    '|[0-9a-z]([-.]?[0-9a-z])*\\.[a-z][a-z]+)';

// NB: the backslash in the following line escapes the forward slash, which assumes that the regular expression will be enclosed in /.../
// The "&amp;" alternative lets an HTML-encoded ampersand count as one atext character.
$atext = '([a-z0-9!#$&%*+\/=?^_`{|}~-]|&amp;)';
$dot_atom = $atext . '+(\.' . $atext . '+)*';

// local-part, optional "%host" part, then "@host".
$Email_RegExp_Match = $dot_atom . '(%' . $Host_RegExp_Match . ')?@' .
    $Host_RegExp_Match;
7e235a1a 47
/**
 * Parses a body and converts all found email addresses to clickable links.
 *
 * Addresses are located with $Email_RegExp_Match. Each distinct address is
 * turned into a compose link by makeComposeLink(), and every occurrence of
 * the address in the body is replaced by that link.
 *
 * @param string body the body to process, by ref
 * @return int the number of unique addresses found
 */
function parseEmail (&$body) {
    global $Email_RegExp_Match;
    $sbody = $body;
    $addresses = array();

    /* Find all the email addresses in the body. The key is the address as
     * it appears in the body; the value has "&amp;" decoded to "&" so it
     * can be passed on as a recipient. */
    while (preg_match('/' . $Email_RegExp_Match . '/i', $sbody, $regs)) {
        $addresses[$regs[0]] = strtr($regs[0], array('&amp;' => '&'));
        /* Continue searching right after this match */
        $start = strpos($sbody, $regs[0]) + strlen($regs[0]);
        $sbody = substr($sbody, $start);
    }

    /* Build the compose link for each address */
    $links = array();
    foreach ($addresses as $text => $email) {
        $links[$text] = makeComposeLink('src/compose.php?send_to='.urlencode($email), $text);
    }

    /* Replace all addresses in one pass. strtr() tries the longest key
     * first and never rescans text it has already replaced, so an address
     * that is a substring of another one (or of a link that was just
     * inserted) is not linkified a second time inside existing markup.
     * The guard avoids calling strtr() with an empty replacement map. */
    if (count($links) > 0) {
        $body = strtr($body, $links);
    }

    /* Return number of unique addresses found */
    return count($addresses);
}
43fcef5c 75
43fcef5c 76
/* We don't want to re-initialize this stuff for every line.  Save work
 * and just do it once here.
 */

/* Lowercase prefixes that mark the start of a URL. */
global $url_parser_url_tokens;
$url_parser_url_tokens = array(
    'http://',
    'https://',
    'ftp://',
    'telnet:',  // Special case -- doesn't need the slashes
    'mailto:',  // Special case -- doesn't use the slashes
    'gopher://',
    'news://');

/* Strings that terminate a URL; the earliest occurrence after the token wins. */
global $url_parser_poss_ends;
$url_parser_poss_ends = array(' ', "\n", "\r", '<', '>', ".\r", ".\n",
    '.&nbsp;', '&nbsp;', ')', '(', '&quot;', '&lt;', '&gt;', '.<',
    ']', '[', '{', '}', "\240", ', ', '. ', ",\n", ",\r");
8f7163e7 94
20a60f89 95
/**
 * rfc 2368 (mailto URL) preg_match() regexp
 *
 * Capture group 1 is the address part that follows "mailto:" (may be empty).
 * The last capture group holds the "?name=value" / "&amp;name=value"
 * parameter list (may be empty). $Host_RegExp_Match is embedded twice, so
 * its internal capture groups sit between those two.
 *
 * @link http://www.ietf.org/rfc/rfc2368.txt
 * @global string MailTo_PReg_Match the encapsulated regexp for preg_match()
 */
global $MailTo_PReg_Match;
$Mailto_Email_RegExp = '[0-9a-z%]([-_.+%]?[0-9a-z])*(%' . $Host_RegExp_Match . ')?@' . $Host_RegExp_Match;
$MailTo_PReg_Match = '/((?:' . $Mailto_Email_RegExp . ')*)((?:\?(?:to|cc|bcc|subject|body)=[^\s\?&=,()]+)?(?:&amp;(?:to|cc|bcc|subject|body)=[^\s\?&=,()]+)*)/i';
104
/**
 * Parses a body and converts all found URLs to clickable links.
 *
 * The body is scanned from left to right. On each pass the nearest URL
 * token from $url_parser_url_tokens is located; the text between the
 * current position and that token is handed to parseEmail(), and then the
 * token itself is processed:
 *  - "mailto:" URLs are matched against $MailTo_PReg_Match and replaced by
 *    a compose link built with makeComposeLink();
 *  - any other token is extended up to the first terminator from
 *    $url_parser_poss_ends (or the first 8bit character) and wrapped in an
 *    anchor tag.
 *
 * NOTE(review): the URL text is placed into the href attribute as-is; this
 * presumably relies on the caller having HTML-escaped the body - confirm.
 *
 * @param string body the body to process, by ref
 * @return void
 */
function parseUrl (&$body) {
    global $url_parser_poss_ends, $url_parser_url_tokens, $MailTo_PReg_Match;
    $start = 0;
    $blength = strlen($body);

    while ($start < $blength) {
        $target_token = '';
        $target_pos = $blength;

        /* Find the first token to replace. The tokens are lowercase, so the
         * search runs against a lowercased copy of the body; the copy is
         * made once per pass instead of once per token. */
        $lower_body = strtolower($body);
        foreach ($url_parser_url_tokens as $the_token) {
            $pos = strpos($lower_body, $the_token, $start);
            if (is_int($pos) && $pos < $target_pos) {
                $target_pos = $pos;
                $target_token = $the_token;
            }
        }

        /* Look for email addresses between $start and $target_pos */
        $check_str = substr($body, $start, $target_pos-$start);

        if (parseEmail($check_str)) {
            /* Links were inserted, so the token has moved to the right */
            replaceBlock($body, $check_str, $start, $target_pos);
            $blength = strlen($body);
            $target_pos = strlen($check_str) + $start;
        }

        if ($target_token == 'mailto:') {
            // rfc 2368 (mailto URL)
            $target_pos += 7;    //skip mailto:
            $end = $blength;

            $mailto = substr($body, $target_pos, $end-$target_pos);

            /* Every part of the pattern is optional, so an empty match
             * means that nothing usable follows "mailto:" */
            if ((preg_match($MailTo_PReg_Match, $mailto, $regs)) && ($regs[0] != '')) {
                $mailto_before = $target_token . $regs[0];
                /* Group 10 is the parameter list; the index depends on the
                 * number of capture groups in $MailTo_PReg_Match and must be
                 * kept in sync with that pattern. */
                $mailto_params = $regs[10];
                if ($regs[1]) {    //if there is an email addr before '?', we need to merge it with the params
                    $to = 'to=' . $regs[1];
                    if (strpos($mailto_params, 'to=') !== false) {    //already a 'to='
                        $mailto_params = str_replace('to=', $to . '%2C%20', $mailto_params);
                    } elseif ($mailto_params) {    //already some params, append to them
                        $mailto_params .= '&amp;' . $to;
                    } else {
                        $mailto_params .= '?' . $to;
                    }
                }
                /* Rename the mailto parameters to the names used in the compose link */
                $url_str = preg_replace(array('/to=/i', '/(?<!b)cc=/i', '/bcc=/i'), array('send_to=', 'send_to_cc=', 'send_to_bcc='), $mailto_params);
                $comp_uri = makeComposeLink('src/compose.php' . $url_str, $mailto_before);
                /* Replace "mailto:" (7 chars before $target_pos) plus the matched text */
                replaceBlock($body, $comp_uri, $target_pos - 7, $target_pos + strlen($regs[0]));
                $target_pos += strlen($comp_uri) - 7;
            }
        } elseif ($target_token != '') {
            /* There was a token to replace. Find the end of the URL */
            $end = $blength;
            foreach ($url_parser_poss_ends as $val) {
                $enda = strpos($body, $val, $target_pos);
                if (is_int($enda) && $enda < $end) {
                    $end = $enda;
                }
            }

            /* make sure that there are no 8bit chars between $target_pos and suspected end of URL */
            if (!is_bool($first8bit=sq_strpos_8bit($body,$target_pos,$end))) {
                $end = $first8bit;
            }

            /* Extract URL */
            $url = substr($body, $target_pos, $end-$target_pos);

            /* Needed since lines are not passed with \n or \r */
            while ( preg_match('/[,.]$/', $url) ) {
                $url = substr( $url, 0, -1 );
                $end--;
            }

            /* Replace URL with HyperLinked Url, requires 1 char in link */
            if ($url != '' && $url != $target_token) {
                $url_str = "<a href=\"$url\" target=\"_blank\">$url</a>";
                replaceBlock($body,$url_str,$target_pos,$end);
                $target_pos += strlen($url_str);
            }
            else {
                // Not quite a valid link, skip ahead to next chance
                $target_pos += strlen($target_token);
            }
        }

        /* Move forward */
        $start = $target_pos;
        $blength = strlen($body);
    }
}
916669ad 210
/**
 * Parses a string and returns the first e-mail address found.
 *
 * The address is located with $Email_RegExp_Match (case insensitive). Any
 * "&amp;" inside the matched address is decoded to "&" before it is returned.
 *
 * @param string string the string to process
 * @return string the first e-mail address found, or an empty string if
 *                the string contains no address
 */
function getEmail($string) {
    global $Email_RegExp_Match;

    /* Only the first address is needed, so stop at the first match
     * instead of collecting every address in the string. */
    if (preg_match('/' . $Email_RegExp_Match . '/i', $string, $regs)) {
        return strtr($regs[0], array('&amp;' => '&'));
    }

    return '';
}
232
/**
 * Finds first occurrence of 8bit data in the string
 *
 * Function finds first 8bit symbol or html entity that represents 8bit character.
 * Search start is defined by $offset argument. Search ends at $maxlength position.
 * If $maxlength is not defined or bigger than provided string, search ends when
 * string ends.
 *
 * Check returned data type in order to avoid confusion between bool(false)
 * (not found) and int(0) (first char in the string).
 * @param string $haystack
 * @param integer $offset
 * @param integer $maxlength
 * @return mixed integer with first 8bit character position or boolean false
 * @since 1.5.2
 */
function sq_strpos_8bit($haystack,$offset=0,$maxlength=false) {
    $ret = false;

    if ($maxlength===false || strlen($haystack) < $maxlength) {
        $maxlength=strlen($haystack);
    }

    for($i=$offset;$i<$maxlength;$i++) {
        if (ord($haystack[$i]) > 127) {
            /* we have 8bit char. stop here and return position */
            $ret = $i;
            break;
        } elseif ($haystack[$i]=='&') {
            $substring = substr($haystack,$i);
            /**
             * 1. look for "&#(decimal number);" where decimal_number is bigger than 127
             * 2. look for "&#x(hexadecimal number);", where hex number is bigger than x7f.
             *    The digits are converted with hexdec() so that the comparison is
             *    numeric.
             * 3. look for any html character entity that is not 7bit html special char. Use
             *    own sq_get_html_translation_table() function with 'utf-8' character set in
             *    order to get all html entities.
             */
            if ((preg_match('/^&#(\d+);/',$substring,$match) && $match[1]>127) ||
                (preg_match('/^&#x([0-9a-f]+);/i',$substring,$match) && hexdec($match[1])>0x7f) ||
                (preg_match('/^&([a-z]+);/i',$substring,$match) &&
                 !in_array($match[0],get_html_translation_table(HTML_SPECIALCHARS)) &&
                 in_array($match[0],sq_get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'utf-8')))) {
                $ret = $i;
                break;
            }
        }
    }
    return $ret;
}