Add handling for RCDATA and RAWTEXT elements in HTML sanitizer (CVE-2019-12970)

[squirrelmail.git] / functions / mime.php
diff --git a/functions/mime.php b/functions/mime.php

index 33a87747e0038c4169fb4177cfb29bcf52a48028..b17eb5b2cc7737c24fdd6b2e1913d724eadd0b82 100644 (file)
--- a/functions/mime.php
+++ b/functions/mime.php
@@ -6,7 +6,7 @@
   * This contains the functions necessary to detect and decode MIME
   * messages.
   *
- * @copyright 1999-2012 The SquirrelMail Project Team
+ * @copyright 1999-2019 The SquirrelMail Project Team
   * @license http://opensource.org/licenses/gpl-license.php GNU Public License
   * @version $Id$
   * @package squirrelmail
@@ -63,7 +63,7 @@ function mime_structure ($bodystructure, $flags=array()) {
          displayPageHeader( $color, $mailbox );
          $errormessage  = _("SquirrelMail could not decode the bodystructure of the message");
          $errormessage .= '<br />'._("The bodystructure provided by your IMAP server:").'<br /><br />';
-        $errormessage .= '<pre>' . htmlspecialchars($read) . '</pre>';
+        $errormessage .= '<pre>' . sm_encode_html_special_chars($read) . '</pre>';
          plain_error_message( $errormessage );
          echo '</body></html>';
          exit;
@@ -137,7 +137,13 @@ function mime_fetch_body($imap_stream, $id, $ent_id=1, $fetch_size=0) {
      $data = sqimap_run_command ($imap_stream, $cmd, true, $response, $message, TRUE);
      do {
          $topline = trim(array_shift($data));
-    } while($topline && ($topline[0] == '*') && !preg_match('/\* [0-9]+ FETCH.*/i', $topline)) ;
+    } while($topline && ($topline[0] == '*') && !preg_match('/\* [0-9]+ FETCH .*BODY.*/i', $topline)) ;
+    // Matching with "BODY" above is difficult: in most cases "FETCH \(BODY" would work
+    // but some servers may put other things in the same result, perhaps something such
+    // as "* 23 FETCH (FLAGS (\Seen) BODY[1] {174}".  There is some small chance that
+    // if the character sequence "BODY" appears in a response where it isn't actually
+    // a FETCH response data item name, the current regex will break things.  The better
+    // way to do this would be to parse the response correctly and not use a regex.
  
      $wholemessage = implode('', $data);
      if (preg_match('/\{([^\}]*)\}/', $topline, $regs)) {
@@ -508,7 +514,8 @@ function formatBody($imap_stream, $message, $color, $wrap_at, $ent_num, $id, $ma
   * @param integer $id message id
   */
  function buildAttachmentArray($message, $exclude_id, $mailbox, $id) {
-    global $where, $what, $startMessage, $color, $passed_ent_id, $base_uri;
+    global $where, $what, $startMessage, $color, $passed_ent_id,
+           $base_uri, $block_svg_download;
  
      $att_ar = $message->getAttachments($exclude_id);
      $urlMailbox = urlencode($mailbox);
@@ -519,6 +526,9 @@ function buildAttachmentArray($message, $exclude_id, $mailbox, $id) {
          $header = $att->header;
          $type0 = strtolower($header->type0);
          $type1 = strtolower($header->type1);
+        if ($block_svg_download && strpos($type1, 'svg') === 0)
+            continue;
+
          $name = '';
          $links = array();
          $links['download link']['text'] = _("Download");
@@ -626,7 +636,7 @@ function buildAttachmentArray($message, $exclude_id, $mailbox, $id) {
          $this_attachment['DownloadHREF'] = $links['download link']['href'];
          $this_attachment['ViewHREF'] = isset($links['attachment_common']) ? $links['attachment_common']['href'] : '';
          $this_attachment['Size'] = $header->size;
-        $this_attachment['ContentType'] = htmlspecialchars($type0 .'/'. $type1);
+        $this_attachment['ContentType'] = sm_encode_html_special_chars($type0 .'/'. $type1);
          $this_attachment['OtherLinks'] = array();
          foreach ($links as $val) {
              if ($val['text']==_("Download") || $val['text'] == _("View"))
@@ -785,7 +795,7 @@ function decodeBody($string, $encoding, $force_crlf='') {
   * @return string decoded header string
   */
  function decodeHeader ($string, $utfencode=true,$htmlsafe=true,$decide=false) {
-    global $languages, $squirrelmail_language,$default_charset;
+    global $languages, $squirrelmail_language,$default_charset, $fix_broken_base64_encoded_messages;
      if (is_array($string)) {
          $string = implode("\n", $string);
      }
@@ -800,6 +810,7 @@ function decodeHeader ($string, $utfencode=true,$htmlsafe=true,$decide=false) {
      $iLastMatch = -2;
      $encoded = true;
  
+// FIXME: spaces are allowed inside quoted-printable encoding, but the explode() on spaces below will split any such encoded string into separate chunks
      $aString = explode(' ',$string);
      $ret = '';
      foreach ($aString as $chunk) {
@@ -825,7 +836,7 @@ function decodeHeader ($string, $utfencode=true,$htmlsafe=true,$decide=false) {
              $iLastMatch = $i;
              $j = $i;
              if ($htmlsafe) {
-                $ret .= htmlspecialchars($res[1]);
+                $ret .= sm_encode_html_special_chars($res[1]);
              } else {
                  $ret .= $res[1];
              }
@@ -841,6 +852,13 @@ function decodeHeader ($string, $utfencode=true,$htmlsafe=true,$decide=false) {
              switch ($encoding)
              {
                  case 'B':
+                    // fix broken base64-encoded strings (remove end = padding,
+                    // change any = to + in middle of string, add padding back
+                    // to the end)
+                    if ($fix_broken_base64_encoded_messages) {
+                        $encoded_string_minus_padding = strtr(rtrim($res[4], '='), '=', '+');
+                        $res[4] = str_pad($encoded_string_minus_padding, strlen($res[4]), '=');
+                    }
                      $replace = base64_decode($res[4]);
                      if ($utfencode) {
                          if ($can_be_encoded) {
@@ -854,14 +872,15 @@ function decodeHeader ($string, $utfencode=true,$htmlsafe=true,$decide=false) {
                          }
                      } else {
                          if ($htmlsafe) {
-                            $replace = htmlspecialchars($replace);
+                            $replace = sm_encode_html_special_chars($replace);
                          }
                          $ret.= $replace;
                      }
                      break;
                  case 'Q':
                      $replace = str_replace('_', ' ', $res[4]);
-                    $replace = preg_replace('/=([0-9a-f]{2})/ie', 'chr(hexdec("\1"))',
+                    $replace = preg_replace_callback('/=([0-9a-f]{2})/i',
+                            create_function ('$matches', 'return chr(hexdec($matches[1]));'),
                              $replace);
                      if ($utfencode) {
                          if ($can_be_encoded) {
@@ -875,7 +894,7 @@ function decodeHeader ($string, $utfencode=true,$htmlsafe=true,$decide=false) {
                          }
                      } else {
                          if ($htmlsafe) {
-                            $replace = htmlspecialchars($replace);
+                            $replace = sm_encode_html_special_chars($replace);
                          }
                      }
                      $ret .= $replace;
@@ -895,7 +914,7 @@ function decodeHeader ($string, $utfencode=true,$htmlsafe=true,$decide=false) {
          }
  
          if (!$encoded && $htmlsafe) {
-            $ret .= htmlspecialchars($chunk);
+            $ret .= sm_encode_html_special_chars($chunk);
          } else {
              $ret .= $chunk;
          }
@@ -1378,9 +1397,8 @@ function sq_casenormalize(&$val){
  function sq_skipspace($body, $offset){
      $me = 'sq_skipspace';
      preg_match('/^(\s*)/s', substr($body, $offset), $matches);
-    if (sizeof($matches{1})){
-        $count = strlen($matches{1});
-        $offset += $count;
+    if (!empty($matches[1])){
+        $offset += strlen($matches[1]);
      }
      return $offset;
  }
@@ -1853,7 +1871,9 @@ function sq_fixatts($tagname,
          /**
           * Use white list based filtering on attributes which can contain url's
           */
-        else if ($attname == 'href' || $attname == 'src' || $attname == 'background') {
+        else if ($attname == 'href' || $attname == 'xlink:href' || $attname == 'src'
+              || $attname == 'poster' || $attname == 'formaction'
+              || $attname == 'background' || $attname == 'action') {
              sq_fix_url($attname, $attvalue, $message, $id, $mailbox);
              $attary{$attname} = $attvalue;
          }
@@ -1890,7 +1910,9 @@ function sq_fix_url($attname, &$attvalue, $message, $id, $mailbox,$sQuote = '"')
      // images off by default.
      sqgetGlobalVar('view_unsafe_images', $view_unsafe_images, SQ_GET, FALSE);
  
-    $secremoveimg = '../images/' . _("sec_remove_eng.png");
+    global $use_transparent_security_image;
+    if ($use_transparent_security_image) $secremoveimg = '../images/spacer.png';
+    else $secremoveimg = '../images/' . _("sec_remove_eng.png");
  
      /**
       * Replace empty src tags with the blank image.  src is only used
@@ -2011,7 +2033,7 @@ function sq_fix_url($attname, &$attvalue, $message, $id, $mailbox,$sQuote = '"')
                          break;
                  }
              } else {
-                if (!(isset($aUrl['path']) && $aUrl['path'] == $secremoveimg)) {
+                if (!isset($aUrl['path']) || $aUrl['path'] != $secremoveimg) {
                      // parse_url did not lead to satisfying result
                      $attvalue = $sQuote . SM_PATH . 'images/blank.png' . $sQuote;
                  }
@@ -2103,7 +2125,11 @@ function sq_fixstyle($body, $pos, $message, $id, $mailbox){
       * and change it to .bodyclass so we can just assign it to a <div>
       */
      $content = preg_replace("|body(\s*\{.*?\})|si", ".bodyclass\\1", $content);
-    $secremoveimg = '../images/' . _("sec_remove_eng.png");
+
+    global $use_transparent_security_image;
+    if ($use_transparent_security_image) $secremoveimg = '../images/spacer.png';
+    else $secremoveimg = '../images/' . _("sec_remove_eng.png");
+
      /**
      * Fix url('blah') declarations.
      */
@@ -2155,7 +2181,7 @@ function sq_fixstyle($body, $pos, $message, $id, $mailbox){
       * be set to relative and move itself anywhere it wants to,
       * displaying content in areas it shouldn't be allowed to touch.
       */
-    $match   = Array('/\/\*.*\*\//',
+    $match   = Array('/\/\*.*\*\//', // removes /* blah blah */
                      '/expression/i',
                      '/behaviou*r/i',
                      '/binding/i',
@@ -2303,6 +2329,7 @@ function sq_body2div($attary, $mailbox, $message, $id){
   * @param $add_attr_to_tag      see description above
   * @param $message              message object
   * @param $id                   message id
+ * @param $recursively_called   boolean flag for recursive calls into this function (optional; default FALSE)
   * @return                      sanitized html safe to show on your pages.
   */
  function sq_sanitize($body,
@@ -2315,21 +2342,26 @@ function sq_sanitize($body,
                       $add_attr_to_tag,
                       $message,
                       $id,
-                     $mailbox
+                     $mailbox,
+                     $recursively_called=FALSE
                       ){
      $me = 'sq_sanitize';
+
+    /**
+     * Determine whether tag_list is a list of tags to remove or a list of tags to allow.
+     * false  means remove these tags
+     * true   means allow these tags
+     */
+    $orig_tag_list = $tag_list;
      $rm_tags = array_shift($tag_list);
+
      /**
       * Normalize rm_tags and rm_tags_with_content.
       */
      @array_walk($tag_list, 'sq_casenormalize');
      @array_walk($rm_tags_with_content, 'sq_casenormalize');
      @array_walk($self_closing_tags, 'sq_casenormalize');
-    /**
-     * See if tag_list is of tags to remove or tags to allow.
-     * false  means remove these tags
-     * true   means allow these tags
-     */
+
      $curpos = 0;
      $open_tags = Array();
      $trusted = "\n<!-- begin sanitized html -->\n";
@@ -2342,6 +2374,47 @@ function sq_sanitize($body,
  
      while (($curtag = sq_getnxtag($body, $curpos)) != FALSE){
          list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
+
+        /**
+         * RCDATA and RAWTEXT tags are handled differently:
+         * next instance of closing tag is used, whether or not
+         * the HTML is well formed before that
+         */
+        global $rcdata_rawtext_tags;
+        if (!$recursively_called
+         && in_array($tagname, $rcdata_rawtext_tags)
+         && $tagtype === 1){
+            $closing_tag = false;
+            $closing_tag_offset = $curpos;
+            // seek out the closing tag for the current RCDATA/RAWTEXT tag
+            while (1) {
+                // first we need to move forward to next available closing tag
+                // (intentionally leave off the closing > and let sq_getnxtag() validate proper tag syntax)
+                $next_tag = sq_findnxreg($body, $closing_tag_offset, "</\s*$tagname");
+                if ($next_tag === false) {
+                    $closing_tag = false;
+                    break;
+                }
+                // but then we have to make sure it's a well-formed tag
+                $closing_tag = sq_getnxtag($body, $next_tag[0]);
+                if ($closing_tag === false)
+                    break;
+                else if ($closing_tag[0] !== false
+                 // these should be redundant
+                 && $closing_tag[0] === $tagname && $closing_tag[2] === 2) {
+                    $trusted .= sq_sanitize(substr($body, $curpos, $closing_tag[4] - $curpos + 1),
+                                            $orig_tag_list, $rm_tags_with_content, $self_closing_tags,
+                                            $force_tag_closing, $rm_attnames, $bad_attvals, $add_attr_to_tag,
+                                            $message, $id, $mailbox, true);
+                    $curpos = $closing_tag[4] + 1;
+                    continue 2;
+                }
+                $closing_tag_offset = $next_tag[0] + 1;
+            }
+            if ($closing_tag === false)
+            { /* no-op... there was no closing tag for this RCDATA/RAWTEXT tag - we could probably set $curpos to the end of $body, but this HTML is malformed anyway and should just fall apart on its own */ }
+        }
+
          $free_content = substr($body, $curpos, $lt-$curpos);
          /**
           * Take care of <style>
@@ -2489,7 +2562,17 @@ function magicHTML($body, $id, $message, $mailbox = 'INBOX', $take_mailto_links
      // require_once(SM_PATH . 'functions/url_parser.php');  // for $MailTo_PReg_Match
  
      global $attachment_common_show_images, $view_unsafe_images,
-           $has_unsafe_images;
+           $has_unsafe_images, $allow_svg_display, $rcdata_rawtext_tags,
+           $remove_rcdata_rawtext_tags_and_content;
+
+    $rcdata_rawtext_tags = array(
+        "noscript",
+        "noframes",
+        "noembed",
+        "textarea",
+        // also "title", "xmp", "script", "iframe", "plaintext" which we already remove below
+    );
+
      /**
       * Don't display attached images in HTML mode.
       *
@@ -2497,8 +2580,7 @@ function magicHTML($body, $id, $message, $mailbox = 'INBOX', $take_mailto_links
       */
      $attachment_common_show_images = false;
      $tag_list = Array(
-            false,
-            "object",
+            false, // remove these tags
              "meta",
              "html",
              "head",
@@ -2507,25 +2589,37 @@ function magicHTML($body, $id, $message, $mailbox = 'INBOX', $take_mailto_links
              "frame",
              "iframe",
              "plaintext",
-            "marquee"
+            "marquee",
              );
  
      $rm_tags_with_content = Array(
              "script",
+            "object",
              "applet",
              "embed",
              "title",
              "frameset",
              "xmp",
-            "xml"
+            "xml",
              );
+    if (!$allow_svg_display)
+        $rm_tags_with_content[] = 'svg';
+    /**
+     * SquirrelMail will parse RCDATA and RAWTEXT tags and handle them as the special
+     * case that they are, but if you prefer to remove them and their contents entirely
+     * (in most cases this should be safe and have minimal impact), you can add the
+     * following to config/config_local.php:
+     *    $remove_rcdata_rawtext_tags_and_content = TRUE;
+     */
+    if ($remove_rcdata_rawtext_tags_and_content)
+        $rm_tags_with_content = array_merge($rm_tags_with_content, $rcdata_rawtext_tags);
  
      $self_closing_tags =  Array(
              "img",
              "br",
              "hr",
              "input",
-            "outbind"
+            "outbind",
              );
  
      $force_tag_closing = true;
@@ -2537,11 +2631,14 @@ function magicHTML($body, $id, $message, $mailbox = 'INBOX', $take_mailto_links
                  "/^on.*/i",
                  "/^dynsrc/i",
                  "/^data.*/i",
-                "/^lowsrc.*/i"
+                "/^lowsrc.*/i",
                  )
              );
  
-    $secremoveimg = "../images/" . _("sec_remove_eng.png");
+    global $use_transparent_security_image;
+    if ($use_transparent_security_image) $secremoveimg = '../images/spacer.png';
+    else $secremoveimg = '../images/' . _("sec_remove_eng.png");
+
      $bad_attvals = Array(
              "/.*/" =>
              Array(
@@ -2668,19 +2765,25 @@ function magicHTML($body, $id, $message, $mailbox = 'INBOX', $take_mailto_links
      if ($take_mailto_links) {
          // parseUrl($trusted);   // this even parses URLs inside of tags... too aggressive
          global $MailTo_PReg_Match;
-        $MailTo_PReg_Match = '/mailto:' . substr($MailTo_PReg_Match, 1) ;
+        // some mailers (Microsoft, surprise surprise) produce mailto strings without being
+        // inside an anchor (link) tag, so we have to make sure the regex looks for the
+        // quote before mailto, and we'll also try to convert the non-links back into links
+        $MailTo_PReg_Match = '/([\'"])?mailto:' . substr($MailTo_PReg_Match, 1) ;
          if ((preg_match_all($MailTo_PReg_Match, $trusted, $regs)) && ($regs[0][0] != '')) {
              foreach ($regs[0] as $i => $mailto_before) {
-                $mailto_params = $regs[10][$i];
+                $mailto_params = $regs[11][$i];
+
+                // get rid of any leading quote we may have captured but don't care about
+                //
+                $mailto_before = ltrim($mailto_before, '"\'');
+
                  // get rid of any tailing quote since we have to add send_to to the end
                  //
-                if (substr($mailto_before, strlen($mailto_before) - 1) == '"')
-                    $mailto_before = substr($mailto_before, 0, strlen($mailto_before) - 1);
-                if (substr($mailto_params, strlen($mailto_params) - 1) == '"')
-                    $mailto_params = substr($mailto_params, 0, strlen($mailto_params) - 1);
+                $mailto_before = rtrim($mailto_before, '"\'');
+                $mailto_params = rtrim($mailto_params, '"\'');
  
-                if ($regs[1][$i]) {    //if there is an email addr before '?', we need to merge it with the params
-                    $to = 'to=' . $regs[1][$i];
+                if ($regs[2][$i]) {    //if there is an email addr before '?', we need to merge it with the params
+                    $to = 'to=' . $regs[2][$i];
                      if (strpos($mailto_params, 'to=') > -1)    //already a 'to='
                          $mailto_params = str_replace('to=', $to . '%2C%20', $mailto_params);
                      else {
@@ -2705,8 +2808,12 @@ function magicHTML($body, $id, $message, $mailbox = 'INBOX', $take_mailto_links
                  // remove <a href=" and anything after the next quote (we only
                  // need the uri, not the link HTML) in compose uri
                  //
-                $comp_uri = substr($comp_uri, 9);
-                $comp_uri = substr($comp_uri, 0, strpos($comp_uri, '"', 1));
+                // but only do this if the original mailto was in a real anchor tag
+                //
+                if (!empty($regs[1][$i])) {
+                    $comp_uri = substr($comp_uri, 9);
+                    $comp_uri = substr($comp_uri, 0, strpos($comp_uri, '"', 1));
+                }
                  $trusted = str_replace($mailto_before, $comp_uri, $trusted);
              }
          }
@@ -2769,7 +2876,8 @@ function SendDownloadHeaders($type0, $type1, $filename, $force, $filesize=0) {
          $filename=rawurlencode($filename);
          header ("Pragma: public");
          header ("Cache-Control: no-store, max-age=0, no-cache, must-revalidate"); // HTTP/1.1
-        header ("Cache-Control: post-check=0, pre-check=0", false);
+        // does nothing - see: https://blogs.msdn.microsoft.com/ieinternals/2009/07/20/internet-explorers-cache-control-extensions/
+        // header ("Cache-Control: post-check=0, pre-check=0", false);
          header ("Cache-Control: private");
  
          //set the inline header for IE, we'll add the attachment header later if we need it