functions/url_parser.php

   1 <?php
   2
   3 /**
   4  * url_parser.php
   5  *
   6  * This code provides various string manipulation functions that are
   7  * used by the rest of the SquirrelMail code.
   8  *
   9  * @copyright &copy; 1999-2007 The SquirrelMail Project Team
  10  * @license http://opensource.org/licenses/gpl-license.php GNU Public License
  11  * @version $Id$
  12  * @package squirrelmail
  13  */
  14
  15 /**
  16  * Undocumented - complain, then patch.
  17  */
  18 function replaceBlock (&$in, $replace, $start, $end) {
  19     $begin = substr($in,0,$start);
  20     $end   = substr($in,$end,strlen($in)-$end);
  21     $in    = $begin.$replace.$end;
  22 }
  23
  24 /* Having this defined in just one spot could help when changes need
  25  * to be made to the pattern
  26  * Make sure that the expression is evaluated case insensitively
  27  *
  28  * RFC2822 (and RFC822) defines the left side of an email address as (roughly):
  29  *  1*atext *("." 1*atext)
  30  * where atext is: a-zA-Z0-9!#$%&'*+-/=?^_`{|}~
  31  *
  32  * Here's pretty sophisticated IP matching:
  33  * $IPMatch = '(2[0-5][0-9]|1?[0-9]{1,2})';
  34  * $IPMatch = '\[?' . $IPMatch . '(\.' . $IPMatch . '){3}\]?';
  35  */
  36 /* Here's enough: */
  37 global $IP_RegExp_Match, $Host_RegExp_Match, $Email_RegExp_Match;
  38 $IP_RegExp_Match = '\\[?[0-9]{1,3}(\\.[0-9]{1,3}){3}\\]?';
  39 $Host_RegExp_Match = '(' . $IP_RegExp_Match .
  40     '|[0-9a-z]([-.]?[0-9a-z])*\\.[a-z][a-z]+)';
  41 $atext = '([a-z0-9!#$&%*+/=?^_`{|}~-]|&amp;)';
  42 $dot_atom = $atext . '+(\.' . $atext . '+)*';
  43 $Email_RegExp_Match = $dot_atom . '(%' . $Host_RegExp_Match . ')?@' .
  44                       $Host_RegExp_Match;
  45
  46 /**
  47  * Parses a body and converts all found email addresses to clickable links.
  48  *
  49  * @param string body the body to process, by ref
  50  * @return int the number of unique addresses found
  51  */
  52 function parseEmail (&$body) {
  53     global $Email_RegExp_Match;
  54     $sbody     = $body;
  55     $addresses = array();
  56
  57     /* Find all the email addresses in the body */
  58     while(eregi($Email_RegExp_Match, $sbody, $regs)) {
  59         $addresses[$regs[0]] = strtr($regs[0], array('&amp;' => '&'));
  60         $start = strpos($sbody, $regs[0]) + strlen($regs[0]);
  61         $sbody = substr($sbody, $start);
  62     }
  63
  64     /* Replace each email address with a compose URL */
  65     foreach ($addresses as $text => $email) {
  66         $comp_uri = makeComposeLink('src/compose.php?send_to='.urlencode($email), $text);
  67         $body = str_replace($text, $comp_uri, $body);
  68     }
  69
  70     /* Return number of unique addresses found */
  71     return count($addresses);
  72 }
  73
  74
  75 /* We don't want to re-initialize this stuff for every line.  Save work
  76  * and just do it once here.
  77  */
  78 global $url_parser_url_tokens;
  79 $url_parser_url_tokens = array(
  80     'http://',
  81     'https://',
  82     'ftp://',
  83     'telnet:',  // Special case -- doesn't need the slashes
  84     'mailto:',  // Special case -- doesn't use the slashes
  85     'gopher://',
  86     'news://');
  87
  88 global $url_parser_poss_ends;
  89 $url_parser_poss_ends = array(' ', "\n", "\r", '<', '>', ".\r", ".\n",
  90     '.&nbsp;', '&nbsp;', ')', '(', '&quot;', '&lt;', '&gt;', '.<',
  91     ']', '[', '{', '}', "\240", ', ', '. ', ",\n", ",\r");
  92
  93
  94 /**
  95  * rfc 2368 (mailto URL) preg_match() regexp
  96  * @link http://www.ietf.org/rfc/rfc2368.txt
  97  * @global string MailTo_PReg_Match the encapsulated regexp for preg_match()
  98  */
  99 global $MailTo_PReg_Match;
 100 $Mailto_Email_RegExp = '[0-9a-z%]([-_.+%]?[0-9a-z])*(%' . $Host_RegExp_Match . ')?@' . $Host_RegExp_Match;
 101 $MailTo_PReg_Match = '/((?:' . $Mailto_Email_RegExp . ')*)((?:\?(?:to|cc|bcc|subject|body)=[^\s\?&=,()]+)?(?:&amp;(?:to|cc|bcc|subject|body)=[^\s\?&=,()]+)*)/i';
 102
 103 /**
 104  * Parses a body and converts all found URLs to clickable links.
 105  *
 106  * @param string body the body to process, by ref
 107  * @return void
 108  */
 109 function parseUrl (&$body) {
 110     global $url_parser_poss_ends, $url_parser_url_tokens;
 111     $start      = 0;
 112     $blength    = strlen($body);
 113
 114     while ($start < $blength) {
 115         $target_token = '';
 116         $target_pos = $blength;
 117
 118         /* Find the first token to replace */
 119         foreach ($url_parser_url_tokens as $the_token) {
 120             $pos = strpos(strtolower($body), $the_token, $start);
 121             if (is_int($pos) && $pos < $target_pos) {
 122                 $target_pos   = $pos;
 123                 $target_token = $the_token;
 124             }
 125         }
 126
 127         /* Look for email addresses between $start and $target_pos */
 128         $check_str = substr($body, $start, $target_pos-$start);
 129
 130         if (parseEmail($check_str)) {
 131             replaceBlock($body, $check_str, $start, $target_pos);
 132             $blength    = strlen($body);
 133             $target_pos = strlen($check_str) + $start;
 134         }
 135
 136         // rfc 2368 (mailto URL)
 137         if ($target_token == 'mailto:') {
 138             $target_pos += 7;    //skip mailto:
 139             $end = $blength;
 140
 141             $mailto = substr($body, $target_pos, $end-$target_pos);
 142
 143             global $MailTo_PReg_Match;
 144             if ((preg_match($MailTo_PReg_Match, $mailto, $regs)) && ($regs[0] != '')) {
 145                 //sm_print_r($regs);
 146                 $mailto_before = $target_token . $regs[0];
 147                 $mailto_params = $regs[10];
 148                 if ($regs[1]) {    //if there is an email addr before '?', we need to merge it with the params
 149                     $to = 'to=' . $regs[1];
 150                     if (strpos($mailto_params, 'to=') > -1)    //already a 'to='
 151                         $mailto_params = str_replace('to=', $to . '%2C%20', $mailto_params);
 152                     else {
 153                         if ($mailto_params)    //already some params, append to them
 154                             $mailto_params .= '&amp;' . $to;
 155                         else
 156                             $mailto_params .= '?' . $to;
 157                     }
 158                 }
 159                 $url_str = preg_replace(array('/to=/i', '/(?<!b)cc=/i', '/bcc=/i'), array('send_to=', 'send_to_cc=', 'send_to_bcc='), $mailto_params);
 160                 $comp_uri = makeComposeLink('src/compose.php' . $url_str, $mailto_before);
 161                 replaceBlock($body, $comp_uri, $target_pos - 7, $target_pos + strlen($regs[0]));
 162                 $target_pos += strlen($comp_uri) - 7;
 163             }
 164         }
 165         else
 166         /* If there was a token to replace, replace it */
 167         if ($target_token != '') {
 168             /* Find the end of the URL */
 169             $end = $blength;
 170             foreach ($url_parser_poss_ends as $val) {
 171                 $enda = strpos($body, $val, $target_pos);
 172                 if (is_int($enda) && $enda < $end) {
 173                     $end = $enda;
 174                 }
 175             }
 176
 177             /* make sure that there are no 8bit chars between $target_pos and suspected end of URL */
 178             if (!is_bool($first8bit=sq_strpos_8bit($body,$target_pos,$end))) {
 179                 $end = $first8bit;
 180             }
 181
 182             /* Extract URL */
 183             $url = substr($body, $target_pos, $end-$target_pos);
 184
 185             /* Needed since lines are not passed with \n or \r */
 186             while ( ereg("[,\.]$", $url) ) {
 187                 $url = substr( $url, 0, -1 );
 188                 $end--;
 189             }
 190
 191             /* Replace URL with HyperLinked Url, requires 1 char in link */
 192             if ($url != '' && $url != $target_token) {
 193                 $url_str = "<a href=\"$url\" target=\"_blank\">$url</a>";
 194                 replaceBlock($body,$url_str,$target_pos,$end);
 195                 $target_pos += strlen($url_str);
 196             }
 197             else {
 198                 // Not quite a valid link, skip ahead to next chance
 199                 $target_pos += strlen($target_token);
 200             }
 201         }
 202
 203         /* Move forward */
 204         $start   = $target_pos;
 205         $blength = strlen($body);
 206     }
 207 }
 208
 209 /**
 210  * Parses a string and returns the first e-mail address found.
 211  *
 212  * @param string string the string to process
 213  * @return string the first e-mail address found
 214  */
 215 function getEmail($string) {
 216     global $Email_RegExp_Match;
 217     $addresses = array();
 218
 219     /* Find all the email addresses in the body */
 220     while (eregi($Email_RegExp_Match, $string, $regs)) {
 221         $addresses[$regs[0]] = strtr($regs[0], array('&amp;' => '&'));
 222         $start = strpos($string, $regs[0]) + strlen($regs[0]);
 223         $string = substr($string, $start);
 224     }
 225
 226     /* Return the first address, or an empty string if no address was found */
 227     $addresses = array_values($addresses);
 228     return (array_key_exists(0, $addresses) ? $addresses[0] : '');
 229 }
 230
 231 /**
 232  * Finds first occurrence of 8bit data in the string
 233  *
 234  * Function finds first 8bit symbol or html entity that represents 8bit character.
 235  * Search start is defined by $offset argument. Search ends at $maxlength position.
 236  * If $maxlength is not defined or bigger than provided string, search ends when
 237  * string ends.
 238  *
 239  * Check returned data type in order to avoid confusion between bool(false)
 240  * (not found) and int(0) (first char in the string).
 241  * @param string $haystack
 242  * @param integer $offset
 243  * @param integer $maxlength
 244  * @return mixed integer with first 8bit character position or boolean false
 245  * @since 1.5.2
 246  */
 247 function sq_strpos_8bit($haystack,$offset=0,$maxlength=false) {
 248     $ret = false;
 249
 250     if ($maxlength===false || strlen($haystack) < $maxlength) {
 251         $maxlength=strlen($haystack);
 252     }
 253
 254     for($i=$offset;$i<$maxlength;$i++) {
 255         /* rh7-8 compatibility. don't use full 8bit range in regexp */
 256         if (preg_match('/[\200-\237]|\240|[\241-\377]/',$haystack[$i])) {
 257             /* we have 8bit char. stop here and return position */
 258             $ret = $i;
 259             break;
 260         } elseif ($haystack[$i]=='&') {
 261             $substring = substr($haystack,$i);
 262             /**
 263              * 1. look for "&#(decimal number);" where decimal_number is bigger than 127
 264              * 2. look for "&x(hexadecimal number);", where hex number is bigger than x7f
 265              * 3. look for any html character entity that is not 7bit html special char. Use
 266              * own sq_get_html_translation_table() function with 'utf-8' character set in
 267              * order to get all html entities.
 268              */
 269             if ((preg_match('/^&#(\d+);/',$substring,$match) && $match[1]>127) ||
 270                 (preg_match('/^&x([0-9a-f]+);/i',$substring,$match) && $match[1]>"\x7f") ||
 271                 (preg_match('/^&([a-z]+);/i',$substring,$match) &&
 272                  !in_array($match[0],get_html_translation_table(HTML_SPECIALCHARS)) &&
 273                  in_array($match[0],sq_get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'utf-8')))) {
 274                 $ret = $i;
 275                 break;
 276             }
 277         }
 278     }
 279     return $ret;
 280 }