A super strict HTML cleaner method with mediocre tests.

author Christopher Allan Webber <cwebber@dustycloud.org>

Tue, 14 Jun 2011 02:01:19 +0000 (21:01 -0500)

committer Christopher Allan Webber <cwebber@dustycloud.org>

Tue, 14 Jun 2011 02:01:19 +0000 (21:01 -0500)
author Christopher Allan Webber <cwebber@dustycloud.org>
Tue, 14 Jun 2011 02:01:19 +0000 (21:01 -0500)
committer Christopher Allan Webber <cwebber@dustycloud.org>
Tue, 14 Jun 2011 02:01:19 +0000 (21:01 -0500)
diff --git a/mediagoblin/tests/test_util.py b/mediagoblin/tests/test_util.py

index 7b00a07482ef4e7f30a990f41fe1730c1d98bca9..75e28aca0d6c834d6bc8b3992b6f876ac4d48b71 100644 (file)
--- a/mediagoblin/tests/test_util.py
+++ b/mediagoblin/tests/test_util.py
@@ -103,3 +103,22 @@ def test_locale_to_lower_lower():
      # crazy renditions.  Useful?
      assert util.locale_to_lower_lower('en-US') == 'en-us'
      assert util.locale_to_lower_lower('en_us') == 'en-us'
+
+
+def test_html_cleaner():
+    """Exercise util.clean_html on an embedded image and a javascript: link."""
+    # The <img> is expected to be stripped, and the two paragraphs are
+    # expected to come back wrapped in a single <div>.
+    markup_with_image = (
+        '<p>Hi everybody! '
+        '<img src="http://example.org/huge-purple-barney.png" /></p>\n'
+        '<p>:)</p>')
+    expected_without_image = (
+        '<div>'
+        '<p>Hi everybody! </p>\n'
+        '<p>:)</p>'
+        '</div>')
+    assert util.clean_html(markup_with_image) == expected_without_image
+
+    # The javascript: target is expected to be blanked out while the anchor
+    # element and its text are kept.
+    markup_with_evil_link = (
+        '<p><a href="javascript:nasty_surprise">innocent link!</a></p>')
+    expected_defused_link = (
+        '<p><a href="">innocent link!</a></p>')
+    assert util.clean_html(markup_with_evil_link) == expected_defused_link
diff --git a/mediagoblin/util.py b/mediagoblin/util.py

index f29f8570d3c563a6c3e74f487fb827a823eddd65..fc380f4120f617473efe0e9e90994bdcd2fb662b 100644 (file)
--- a/mediagoblin/util.py
+++ b/mediagoblin/util.py
@@ -30,6 +30,7 @@ import jinja2
  import translitcodec
  from paste.deploy.loadwsgi import NicerConfigParser
  from webob import Response, exc
+from lxml.html.clean import Cleaner
  
  from mediagoblin import mg_globals
  from mediagoblin.db.util import ObjectId
@@ -373,6 +374,32 @@ def read_config_file(conf_file):
      return mgoblin_conf
  
  
+# A deliberately strict configuration of the lxml.html Cleaner class:
+# every removal switch is turned on and only the tags named in allow_tags
+# are meant to be kept.
+HTML_CLEANER = Cleaner(
+    # Tag filtering.
+    allow_tags=[
+        'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br'],
+    remove_unknown_tags=False,  # can't be used with allow_tags
+    # Nothing is whitelisted: no hosts, no tags.
+    host_whitelist=(),
+    whitelist_tags=set(),
+    # Attribute and link handling.
+    safe_attrs_only=True,
+    add_nofollow=True,  # for now
+    # Removal switches, all enabled.
+    scripts=True,
+    javascript=True,
+    comments=True,
+    style=True,
+    links=True,
+    page_structure=True,
+    processing_instructions=True,
+    embedded=True,
+    frames=True,
+    forms=True,
+    annoying_tags=True)
+
+
+def clean_html(html):
+    """
+    Return ``html`` after running it through the strict HTML_CLEANER.
+
+    ``html`` is the markup to clean, given as a string.  An empty (or
+    otherwise falsy) value is answered with an empty string and never
+    handed to lxml, which raises ParserError ("Document is empty") when
+    asked to parse an empty document.
+    """
+    # Guard first: lxml cannot parse an empty document.
+    if not html:
+        return ''
+
+    return HTML_CLEANER.clean_html(html)
+
+
  SETUP_GETTEXTS = {}
  
  def setup_gettext(locale):
author	Christopher Allan Webber <cwebber@dustycloud.org>
	Tue, 14 Jun 2011 02:01:19 +0000 (21:01 -0500)
committer	Christopher Allan Webber <cwebber@dustycloud.org>
	Tue, 14 Jun 2011 02:01:19 +0000 (21:01 -0500)
mediagoblin/tests/test_util.py		patch \| blob \| blame \| history
mediagoblin/util.py		patch \| blob \| blame \| history