This was a big commit! I included lots of documentation below, but generally I

[mediagoblin.git] / mediagoblin / tools / text.py
diff --git a/mediagoblin/tools/text.py b/mediagoblin/tools/text.py

index de4bb28190487e95c302732fa80c50f9cfa2555a..96df49d27a929d894d074c06459c853df0354244 100644 (file)
--- a/mediagoblin/tools/text.py
+++ b/mediagoblin/tools/text.py
@@ -1,5 +1,5 @@
  # GNU MediaGoblin -- federated, autonomous media hosting
-# Copyright (C) 2011 MediaGoblin contributors.  See AUTHORS.
+# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
  #
  # This program is free software: you can redistribute it and/or modify
  # it under the terms of the GNU Affero General Public License as published by
@@ -21,6 +21,7 @@ from lxml.html.clean import Cleaner
  from mediagoblin import mg_globals
  from mediagoblin.tools import url
  
+
  # A super strict version of the lxml.html cleaner class
  HTML_CLEANER = Cleaner(
      scripts=True,
@@ -35,13 +36,15 @@ HTML_CLEANER = Cleaner(
      forms=True,
      annoying_tags=True,
      allow_tags=[
-        'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br'],
-    remove_unknown_tags=False, # can't be used with allow_tags
+        'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br',
+        'pre', 'code'],
+    remove_unknown_tags=False,  # can't be used with allow_tags
      safe_attrs_only=True,
-    add_nofollow=True, # for now
+    add_nofollow=True,  # for now
      host_whitelist=(),
      whitelist_tags=set([]))
  
+
  def clean_html(html):
      # clean_html barfs on an empty string
      if not html:
@@ -49,6 +52,7 @@ def clean_html(html):
  
      return HTML_CLEANER.clean_html(html)
  
+
  def convert_to_tag_list_of_dicts(tag_string):
      """
      Filter input from incoming string containing user tags,
@@ -63,31 +67,31 @@ def convert_to_tag_list_of_dicts(tag_string):
          stripped_tag_string = u' '.join(tag_string.strip().split())
  
          # Split the tag string into a list of tags
-        for tag in stripped_tag_string.split(
-                                       mg_globals.app_config['tags_delimiter']):
-
+        for tag in stripped_tag_string.split(','):
+            tag = tag.strip()
              # Ignore empty or duplicate tags
-            if tag.strip() and tag.strip() not in [t['name'] for t in taglist]:
-
-                taglist.append({'name': tag.strip(),
-                                'slug': url.slugify(tag.strip())})
+            if tag and tag not in [t['name'] for t in taglist]:
+                taglist.append({'name': tag,
+                                'slug': url.slugify(tag)})
      return taglist
  
+
  def media_tags_as_string(media_entry_tags):
      """
      Generate a string from a media item's tags, stored as a list of dicts
  
      This is the opposite of convert_to_tag_list_of_dicts
      """
-    media_tag_string = ''
+    tags_string = ''
      if media_entry_tags:
-        media_tag_string = mg_globals.app_config['tags_delimiter'].join(
-                                      [tag['name'] for tag in media_entry_tags])
-    return media_tag_string
+        tags_string = u', '.join([tag['name'] for tag in media_entry_tags])
+    return tags_string
+
  
  TOO_LONG_TAG_WARNING = \
      u'Tags must be shorter than %s characters.  Tags that are too long: %s'
  
+
  def tag_length_validator(form, field):
      """
      Make sure tags do not exceed the maximum tag length.
@@ -99,11 +103,14 @@ def tag_length_validator(form, field):
  
      if too_long_tags:
          raise wtforms.ValidationError(
-            TOO_LONG_TAG_WARNING % (mg_globals.app_config['tags_max_length'], \
+            TOO_LONG_TAG_WARNING % (mg_globals.app_config['tags_max_length'],
                                      ', '.join(too_long_tags)))
  
  
-MARKDOWN_INSTANCE = markdown.Markdown(safe_mode='escape')
+# Markdown's own safe mode is deliberately left off: the converted HTML is
+# always passed through clean_html() (lxml.html.clean), which sanitizes it.
+UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()
+
  
  def cleaned_markdown_conversion(text):
      """
@@ -114,4 +121,4 @@ def cleaned_markdown_conversion(text):
      if not text:
          return u''
  
-    return clean_html(MARKDOWN_INSTANCE.convert(text))
+    return clean_html(UNSAFE_MARKDOWN_INSTANCE.convert(text))