Happy New Year
[squirrelmail.git] / functions / url_parser.php
CommitLineData
59177427 1<?php
43fcef5c 2
35586184 3/**
4 * url_parser.php
5 *
35586184 6 * This code provides various string manipulation functions that are
598294a7 7 * used by the rest of the SquirrelMail code.
35586184 8 *
c4faef33 9 * @copyright 1999-2020 The SquirrelMail Project Team
4b4abf93 10 * @license http://opensource.org/licenses/gpl-license.php GNU Public License
31841a9e 11 * @version $Id$
d6c32258 12 * @package squirrelmail
35586184 13 */
43fcef5c 14
/**
 * Replaces a byte range of a string, in place.
 *
 * Everything from offset $start up to (but not including) offset $end
 * is removed from $in and $replace is put in its place.
 *
 * @param string  in      the string to modify, by ref
 * @param string  replace the replacement text
 * @param integer start   offset of the first byte to replace
 * @param integer end     offset of the first byte to keep after the replacement
 * @return void
 */
function replaceBlock (&$in, $replace, $start, $end) {
    /* text that precedes the replaced range */
    $head = substr($in, 0, $start);
    /* text that follows the replaced range */
    $tail = substr($in, $end, strlen($in) - $end);

    $in = $head . $replace . $tail;
}
43fcef5c 23
/* The address patterns are defined in one spot only, so that a change
 * to a pattern has to be made in a single place.
 * Make sure that the expressions are evaluated case insensitively.
 *
 * RFC2822 (and RFC822) defines the left side of an email address as (roughly):
 *  1*atext *("." 1*atext)
 * where atext is: a-zA-Z0-9!#$%&'*+-/=?^_`{|}~
 *
 * A more sophisticated IP match would be:
 *  $IPMatch = '(2[0-5][0-9]|1?[0-9]{1,2})';
 *  $IPMatch = '\[?' . $IPMatch . '(\.' . $IPMatch . '){3}\]?';
 * but the simpler pattern below is enough.
 */
global $IP_RegExp_Match, $Host_RegExp_Match, $Email_RegExp_Match;

//FIXME: these were written for use in an ereg().... they are now being used in preg()... we need to run some tests to make sure they are fully working still

/* Dotted quad, optionally wrapped in square brackets */
$IP_RegExp_Match = '\\[?[0-9]{1,3}(\\.[0-9]{1,3}){3}\\]?';

/* Either an IP address or a host name that ends in an alphabetic
 * top level domain of at least two letters */
$Host_RegExp_Match = '(' . $IP_RegExp_Match . '|[0-9a-z]([-.]?[0-9a-z])*\\.[a-z][a-z]+)';

// NB: the backslash in the following line escapes the forward slash, which assumes that the regular expression will be enclosed in /.../
/* One atext character, or an "&" that is written as the entity "&amp;" */
$atext = '([a-z0-9!#$&%*+\/=?^_`{|}~-]|&amp;)';

/* One or more atext runs separated by single dots */
$dot_atom = $atext . '+(\.' . $atext . '+)*';

/* local-part, an optional "%host" part, then "@host" */
$Email_RegExp_Match = $dot_atom . '(%' . $Host_RegExp_Match . ')?@' . $Host_RegExp_Match;
7e235a1a 47
/**
 * Parses a body and converts all found email addresses to clickable links.
 *
 * Every match of $Email_RegExp_Match is replaced with the compose link
 * returned by makeComposeLink(). The body is rebuilt in a single pass
 * from the match offsets, so markup that has already been inserted is
 * never scanned again. This matters when one address is a substring of
 * another one (for example b@example.com and bob@example.com): replacing
 * the addresses one after another with str_replace() would link the
 * shorter address a second time inside the link of the longer one.
 *
 * @param string body the body to process, by ref
 * @return int the number of unique addresses found
 */
function parseEmail (&$body) {
    global $Email_RegExp_Match;

    /* Find all the email addresses in the body, together with their offsets */
    $found = preg_match_all('/' . $Email_RegExp_Match . '/i', $body, $matches, PREG_OFFSET_CAPTURE);
    if (!$found) {
        /* no address (or a regexp error): leave the body untouched */
        return 0;
    }

    /* address as written in the body => compose link; built once per unique address */
    $links = array();
    $result = '';
    $cursor = 0;

    foreach ($matches[0] as $match) {
        $text = $match[0];
        $offset = $match[1];

        if (!isset($links[$text])) {
            /* the body holds "&" as "&amp;"; the address itself needs the plain character */
            $email = strtr($text, array('&amp;' => '&'));
            $links[$text] = makeComposeLink('src/compose.php?send_to=' . urlencode($email), $text);
        }

        /* copy the untouched text before this address, then the link */
        $result .= substr($body, $cursor, $offset - $cursor) . $links[$text];
        $cursor = $offset + strlen($text);
    }

    /* append whatever follows the last address */
    if ($cursor < strlen($body)) {
        $result .= substr($body, $cursor);
    }
    $body = $result;

    /* Return number of unique addresses found */
    return count($links);
}
43fcef5c 75
43fcef5c 76
/* These tables are the same for every line that gets parsed, so they
 * are built once here instead of on each call.
 */

/* Lower case scheme prefixes that mark the start of a URL */
global $url_parser_url_tokens;
$url_parser_url_tokens = array(
    'http://',
    'https://',
    'ftp://',
    'telnet:',   // Special case -- doesn't need the slashes
    'mailto:',   // Special case -- doesn't use the slashes
    'gopher://',
    'news://'
);

/* Strings that terminate a URL; the earliest one found after the
 * start of the URL marks its end */
global $url_parser_poss_ends;
$url_parser_poss_ends = array(
    ' ',
    "\n",
    "\r",
    '<',
    '>',
    ".\r",
    ".\n",
    '.&nbsp;',
    '&nbsp;',
    ')',
    '(',
    '&quot;',
    '&lt;',
    '&gt;',
    '.<',
    ']',
    '[',
    '{',
    '}',
    "\240",
    ', ',
    '. ',
    ",\n",
    ",\r"
);
8f7163e7 94
20a60f89 95
/**
 * rfc 2368 (mailto URL) preg_match() regexp
 *
 * The expression is made of two outer capturing groups: the address part
 * that directly follows "mailto:", and the parameter string (an optional
 * "?name=value" followed by any number of "&amp;name=value" pairs, where
 * name is one of to, cc, bcc, subject or body). With $Host_RegExp_Match
 * as defined in this file (three capturing groups each time it is used),
 * the address part is capture group 1 and the parameter string is
 * capture group 10.
 *
 * @link http://www.ietf.org/rfc/rfc2368.txt
 * @global string MailTo_PReg_Match the encapsulated regexp for preg_match()
 */
global $MailTo_PReg_Match;

/* a single address as it may appear in a mailto URL */
$Mailto_Email_RegExp = '[0-9a-z%]([-_.+%]?[0-9a-z])*(%' . $Host_RegExp_Match . ')?@' . $Host_RegExp_Match;

$MailTo_PReg_Match = '/((?:' . $Mailto_Email_RegExp . ')*)'
    . '((?:\?(?:to|cc|bcc|subject|body)=[^\s\?&=,()]+)?'
    . '(?:&amp;(?:to|cc|bcc|subject|body)=[^\s\?&=,()]+)*)'
    . '/i';
104
/**
 * Parses a body and converts all found URLs to clickable links.
 *
 * The body is scanned from left to right. On every pass the nearest
 * scheme token from $url_parser_url_tokens is located (case
 * insensitively), e-mail addresses in the text in front of it are linked
 * with parseEmail(), and then the URL itself is rewritten:
 *  - "mailto:" URLs (rfc 2368) are turned into compose links built with
 *    makeComposeLink();
 *  - any other URL is wrapped in an <a href="..." target="_blank"> tag.
 *
 * @param string body the body to process, by ref
 * @return void
 */
function parseUrl (&$body) {
    global $url_parser_poss_ends, $url_parser_url_tokens, $MailTo_PReg_Match;
    $start = 0;
    $blength = strlen($body);

    while ($start < $blength) {
        $target_token = '';
        $target_pos = $blength;

        /* Find the first token to replace. The tokens are lower case, so
         * the body is lower cased once per pass (it may have been changed
         * by the previous pass) instead of once per token. */
        $lc_body = strtolower($body);
        foreach ($url_parser_url_tokens as $the_token) {
            $pos = strpos($lc_body, $the_token, $start);
            if (is_int($pos) && $pos < $target_pos) {
                $target_pos = $pos;
                $target_token = $the_token;
            }
        }

        /* Look for email addresses between $start and $target_pos */
        $check_str = substr($body, $start, $target_pos-$start);

        if (parseEmail($check_str)) {
            replaceBlock($body, $check_str, $start, $target_pos);
            $blength = strlen($body);
            /* the linked text is longer, so the token has moved */
            $target_pos = strlen($check_str) + $start;
        }

        if ($target_token == 'mailto:') {
            // rfc 2368 (mailto URL)
            $target_pos += 7;    //skip mailto:
            $end = $blength;

            $mailto = substr($body, $target_pos, $end-$target_pos);

            /* every part of the expression is optional, so it always
             * matches at offset 0; an empty match means there is nothing
             * usable after the token */
            if ((preg_match($MailTo_PReg_Match, $mailto, $regs)) && ($regs[0] != '')) {
                $mailto_before = $target_token . $regs[0];
                /**
                 * '+' characters in a mailto URI don't need to be percent-encoded.
                 * However, when mailto URI data is transported via HTTP, '+' must
                 * be percent-encoded as %2B so that when the HTTP data is
                 * percent-decoded, you get '+' back and not a space.
                 */
                /* $regs[10] is the parameter string, $regs[1] the address
                 * part in front of it */
                $mailto_params = str_replace("+", "%2B", $regs[10]);
                if ($regs[1]) {    //if there is an email addr before '?', we need to merge it with the params
                    $to = 'to=' . str_replace("+", "%2B", $regs[1]);
                    if (strpos($mailto_params, 'to=') !== false) {    //already a 'to='
                        $mailto_params = str_replace('to=', $to . '%2C%20', $mailto_params);
                    } else {
                        if ($mailto_params) {    //already some params, append to them
                            $mailto_params .= '&amp;' . $to;
                        } else {
                            $mailto_params .= '?' . $to;
                        }
                    }
                }
                /* translate the mailto parameter names to the names used by compose.php */
                $url_str = preg_replace(array('/to=/i', '/(?<!b)cc=/i', '/bcc=/i'), array('send_to=', 'send_to_cc=', 'send_to_bcc='), $mailto_params);
                $comp_uri = makeComposeLink('src/compose.php' . $url_str, $mailto_before);
                replaceBlock($body, $comp_uri, $target_pos - 7, $target_pos + strlen($regs[0]));
                /* continue right behind the inserted link */
                $target_pos += strlen($comp_uri) - 7;
            }
        } elseif ($target_token != '') {
            /* If there was a token to replace, replace it */

            /* Find the end of the URL */
            $end = $blength;
            foreach ($url_parser_poss_ends as $val) {
                $enda = strpos($body, $val, $target_pos);
                if (is_int($enda) && $enda < $end) {
                    $end = $enda;
                }
            }

            /* make sure that there are no 8bit chars between $target_pos and suspected end of URL */
            if (!is_bool($first8bit=sq_strpos_8bit($body,$target_pos,$end))) {
                $end = $first8bit;
            }

            /* Extract URL */
            $url = substr($body, $target_pos, $end-$target_pos);

            /* Needed since lines are not passed with \n or \r */
            while ( preg_match('/[,.]$/', $url) ) {
                $url = substr( $url, 0, -1 );
                $end--;
            }

            /* Replace URL with HyperLinked Url, requires 1 char in link.
             * The token was found case insensitively, so the "nothing but
             * the token" test has to ignore case as well. */
            if ($url != '' && strtolower($url) != $target_token) {
                $url_str = "<a href=\"$url\" target=\"_blank\">$url</a>";
                replaceBlock($body,$url_str,$target_pos,$end);
                $target_pos += strlen($url_str);
            }
            else {
                // Not quite a valid link, skip ahead to next chance
                $target_pos += strlen($target_token);
            }
        }

        /* Move forward */
        $start = $target_pos;
        $blength = strlen($body);
    }
}
916669ad 216
/**
 * Parses a string and returns the first e-mail address found.
 *
 * Only the first match of $Email_RegExp_Match is needed, so the search
 * stops there. An "&amp;" inside the address is converted back to "&".
 *
 * @param string string the string to process
 * @return string the first e-mail address found, or an empty string
 *                if the string holds no address
 */
function getEmail($string) {
    global $Email_RegExp_Match;

    if (preg_match('/' . $Email_RegExp_Match . '/i', $string, $regs)) {
        return strtr($regs[0], array('&amp;' => '&'));
    }

    /* no address was found */
    return '';
}
238
/**
 * Finds first occurrence of 8bit data in the string
 *
 * Function finds first 8bit symbol or html entity that represents 8bit character.
 * Search start is defined by $offset argument. Search ends at $maxlength position.
 * If $maxlength is not defined or bigger than provided string, search ends when
 * string ends.
 *
 * Check returned data type in order to avoid confusion between bool(false)
 * (not found) and int(0) (first char in the string).
 * @param string $haystack
 * @param integer $offset
 * @param integer $maxlength
 * @return mixed integer with first 8bit character position or boolean false
 * @since 1.5.2
 */
function sq_strpos_8bit($haystack,$offset=0,$maxlength=false) {
    $ret = false;

    if ($maxlength===false || strlen($haystack) < $maxlength) {
        $maxlength=strlen($haystack);
    }

    for ($i = $offset; $i < $maxlength; $i++) {
        if (ord($haystack[$i]) > 127) {
            /* we have 8bit char. stop here and return position */
            $ret = $i;
            break;
        } elseif ($haystack[$i] == '&') {
            $substring = substr($haystack, $i);
            /**
             * 1. look for "&#(decimal number);" where decimal number is bigger than 127
             * 2. look for "&#x(hexadecimal number);", where hex number is bigger than x7f.
             *    The value is converted with hexdec() and compared as a number;
             *    comparing the hex digits as a string against "\x7f" can never
             *    be true, because every hex digit sorts below that byte.
             * 3. look for any html character entity that is not 7bit html special char. Use
             *    own sq_get_html_translation_table() function with 'utf-8' character set in
             *    order to get all html entities. Entity names may contain digits
             *    (&sup2;, &frac12;).
             */
            if ((preg_match('/^&#(\d+);/', $substring, $match) && $match[1] > 127) ||
                (preg_match('/^&#x([0-9a-f]+);/i', $substring, $match) && hexdec($match[1]) > 127) ||
                (preg_match('/^&([a-z][a-z0-9]*);/i', $substring, $match) &&
                 !in_array($match[0], get_html_translation_table(HTML_SPECIALCHARS)) &&
                 in_array($match[0], sq_get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'utf-8')))) {
                $ret = $i;
                break;
            }
        }
    }
    return $ret;
}