Merge remote-tracking branch 'upstream/master' into change_email
[mediagoblin.git] / mediagoblin / tools / text.py
1 # GNU MediaGoblin -- federated, autonomous media hosting
2 # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import wtforms
18 import markdown
19 from lxml.html.clean import Cleaner
20
21 from mediagoblin import mg_globals
22 from mediagoblin.tools import url
23
24
# A deliberately strict lxml.html Cleaner: the removal options listed
# below are all switched on and only a short list of formatting tags is
# allowed through.
HTML_CLEANER = Cleaner(
    # Removal switches
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,
    # The only tags that are kept
    allow_tags=[
        'div', 'b', 'i', 'em', 'strong', 'p',
        'ul', 'ol', 'li', 'a', 'br',
        'pre', 'code',
    ],
    remove_unknown_tags=False,  # can't be used with allow_tags
    safe_attrs_only=True,
    add_nofollow=True,  # for now
    # No host and no tag is exempted from the embedded/frames removal
    host_whitelist=(),
    whitelist_tags=set())
46
47
def clean_html(html):
    """
    Sanitize an HTML fragment with the module-level HTML_CLEANER.

    Returns an empty unicode string for empty input (None, '') and for
    strings made up only of whitespace.  Any other input is returned as
    produced by HTML_CLEANER.clean_html().
    """
    # clean_html barfs on an empty string
    if not html:
        return u''

    # A whitespace-only string carries no markup either, so treat it like
    # the empty string instead of handing it to the parser.
    # NOTE(review): lxml is expected to reject such input as an empty
    # document -- confirm against the installed lxml version.
    # The hasattr() check keeps non-string input on the original path.
    if hasattr(html, 'strip') and not html.strip():
        return u''

    return HTML_CLEANER.clean_html(html)
54
55
def convert_to_tag_list_of_dicts(tag_string):
    """
    Turn a comma-separated string of user tags into a list of tag dicts.

    Leading and trailing whitespace is stripped from the string and from
    each tag, and every run of internal whitespace is collapsed to a
    single space.  Empty tags and repeated tags are dropped; the first
    occurrence of a tag keeps its position.

    Returns a list of {'name': ..., 'slug': ...} dicts, where 'slug' is
    url.slugify() applied to the tag name.  A false tag_string (None or
    an empty string) gives an empty list.
    """
    taglist = []
    if not tag_string:
        return taglist

    # split() without arguments ignores leading/trailing whitespace and
    # splits on whitespace runs, so re-joining collapses them to one space
    normalized = u' '.join(tag_string.split())

    # Names already emitted; a set keeps the duplicate check from
    # rebuilding a list of names for every single tag
    seen_names = set()
    for tag in normalized.split(','):
        tag = tag.strip()
        # Ignore empty or duplicate tags
        if tag and tag not in seen_names:
            seen_names.add(tag)
            taglist.append({'name': tag,
                            'slug': url.slugify(tag)})
    return taglist
77
78
def media_tags_as_string(media_entry_tags):
    """
    Join the names of a media item's tags into one comma-separated string.

    media_entry_tags is a list of dicts that each have a 'name' key, the
    shape produced by convert_to_tag_list_of_dicts; this function goes
    the other way.  An empty or missing list gives an empty string.
    """
    if not media_entry_tags:
        return ''

    names = [entry['name'] for entry in media_entry_tags]
    return u', '.join(names)
89
90
# Message template used by tag_length_validator: the first placeholder
# takes the maximum tag length, the second the offending tags.
TOO_LONG_TAG_WARNING = (
    u'Tags must be shorter than %s characters. Tags that are too long: %s')
93
94
def tag_length_validator(form, field):
    """
    Form validator that rejects tags longer than the configured maximum.

    field.data is parsed with convert_to_tag_list_of_dicts and every tag
    name is compared against mg_globals.app_config['tags_max_length'].
    The form argument is not used.

    Raises wtforms.ValidationError, listing the offending tags, when at
    least one tag name is longer than the maximum.
    """
    oversized = []
    for entry in convert_to_tag_list_of_dicts(field.data):
        name = entry['name']
        # The limit is looked up per tag so that nothing is read from the
        # config when there are no tags at all
        if len(name) > mg_globals.app_config['tags_max_length']:
            oversized.append(name)

    if not oversized:
        return

    limit = mg_globals.app_config['tags_max_length']
    message = TOO_LONG_TAG_WARNING % (limit, ', '.join(oversized))
    raise wtforms.ValidationError(message)
108
109
# Module-level Markdown converter, created with default options.
# Don't use the safe mode, because lxml.html.clean is better and we are using
# it anyway: cleaned_markdown_conversion() passes this converter's output
# through clean_html().
UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()
113
114
def cleaned_markdown_conversion(text):
    """
    Take a block of text, run it through MarkDown, and clean its HTML.

    Returns an empty unicode string for empty input; otherwise the HTML
    rendered by UNSAFE_MARKDOWN_INSTANCE after it has been passed through
    clean_html().
    """
    # Markdown will do nothing with, and clean_html can do nothing with,
    # an empty string :)
    if not text:
        return u''

    # The converter is shared by every call and keeps per-document state
    # (such as link reference definitions) after convert().  Reset it
    # first so one text's definitions cannot leak into the next output.
    UNSAFE_MARKDOWN_INSTANCE.reset()
    return clean_html(UNSAFE_MARKDOWN_INSTANCE.convert(text))