From: Christopher Allan Webber Date: Tue, 14 Jun 2011 02:01:19 +0000 (-0500) Subject: A super strict HTML cleaner method with mediocre tests. X-Git-Url: https://vcs.fsf.org/?a=commitdiff_plain;h=a68ee5556e2cf78abd1e87546f8627ec07c1f89d;p=mediagoblin.git A super strict HTML cleaner method with mediocre tests. --- diff --git a/mediagoblin/tests/test_util.py b/mediagoblin/tests/test_util.py index 7b00a074..75e28aca 100644 --- a/mediagoblin/tests/test_util.py +++ b/mediagoblin/tests/test_util.py @@ -103,3 +103,22 @@ def test_locale_to_lower_lower(): # crazy renditions. Useful? assert util.locale_to_lower_lower('en-US') == 'en-us' assert util.locale_to_lower_lower('en_us') == 'en-us' + + +def test_html_cleaner(): + # Remove images + result = util.clean_html( + '

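<p>Hi everybody! ' + '<img src="http://example.org/huge-purple-barney.png" /></p>\n' + '<p>:)</p>') + assert result == ( + '<div>' + '<p>Hi everybody! </p>\n' + '<p>:)</p>' + '</div>') + + # Remove evil javascript + result = util.clean_html( + '<p><a href="javascript:nasty_surprise">innocent link!</a></p>') + assert result == ( + '<p><a href="">innocent link!</a></p>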
') diff --git a/mediagoblin/util.py b/mediagoblin/util.py index f29f8570..fc380f41 100644 --- a/mediagoblin/util.py +++ b/mediagoblin/util.py @@ -30,6 +30,7 @@ import jinja2 import translitcodec from paste.deploy.loadwsgi import NicerConfigParser from webob import Response, exc +from lxml.html.clean import Cleaner from mediagoblin import mg_globals from mediagoblin.db.util import ObjectId @@ -373,6 +374,32 @@ def read_config_file(conf_file): return mgoblin_conf +# A super strict version of the lxml.html cleaner class +HTML_CLEANER = Cleaner( + scripts=True, + javascript=True, + comments=True, + style=True, + links=True, + page_structure=True, + processing_instructions=True, + embedded=True, + frames=True, + forms=True, + annoying_tags=True, + allow_tags=[ + 'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br'], + remove_unknown_tags=False, # can't be used with allow_tags + safe_attrs_only=True, + add_nofollow=True, # for now + host_whitelist=(), + whitelist_tags=set([])) + + +def clean_html(html): + return HTML_CLEANER.clean_html(html) + + SETUP_GETTEXTS = {} def setup_gettext(locale):