Merge branch 'mediagoblin-upstream' into bug444_fix_utils_py_redux
[mediagoblin.git] / mediagoblin / tools / text.py
CommitLineData
03ae172a
AW
# GNU MediaGoblin -- federated, autonomous media hosting
# Copyright (C) 2011 MediaGoblin contributors. See AUTHORS.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17import wtforms
18import markdown
19from lxml.html.clean import Cleaner
20
21from mediagoblin import mg_globals
22from mediagoblin.tools import url
23
# Deliberately strict lxml.html Cleaner: every removal option is switched
# on and only a short whitelist of basic formatting tags is allowed.
HTML_CLEANER = Cleaner(
    # Removal switches, all enabled.
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,
    # The only tags allowed in the output.
    allow_tags=[
        'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br',
    ],
    remove_unknown_tags=False,  # can't be used with allow_tags
    safe_attrs_only=True,
    add_nofollow=True,  # for now
    # No hosts or tags are exempted from the embedded-content rules.
    host_whitelist=(),
    whitelist_tags=set(),
)
44
def clean_html(html):
    """
    Sanitize an HTML fragment with the module-level HTML_CLEANER.

    Falsy input (empty string, None) is answered with an empty unicode
    string without invoking the cleaner, which barfs on an empty string.
    """
    if html:
        return HTML_CLEANER.clean_html(html)

    # Nothing to clean.
    return u''
51
def convert_to_tag_list_of_dicts(tag_string):
    """
    Turn a user-supplied tag string into a list of tag dicts.

    Runs of whitespace are collapsed to a single space, the string is split
    on the configured ``tags_delimiter``, and every piece is stripped.
    Empty pieces and repeated names are dropped; the first occurrence of a
    name wins and input order is preserved.

    Returns a list of ``{'name': ..., 'slug': ...}`` dicts, where the slug
    comes from ``url.slugify``.  An empty or None tag_string gives ``[]``.
    """
    if not tag_string:
        return []

    # Strip out internal, trailing, and leading whitespace
    normalized = u' '.join(tag_string.strip().split())
    delimiter = mg_globals.app_config['tags_delimiter']

    taglist = []
    # Names already emitted; a set keeps the duplicate check O(1) per tag
    # instead of rebuilding a list of all names on every iteration.
    seen_names = set()
    for raw_tag in normalized.split(delimiter):
        name = raw_tag.strip()

        # Ignore empty or duplicate tags
        if not name or name in seen_names:
            continue

        seen_names.add(name)
        taglist.append({'name': name, 'slug': url.slugify(name)})
    return taglist
75
def media_tags_as_string(media_entry_tags):
    """
    Join the names of a media entry's tags into one delimited string.

    This is the opposite of convert_to_tag_list_of_dicts: it takes a list
    of tag dicts and joins their 'name' values with the configured
    tags_delimiter.  An empty or missing tag list gives ''.
    """
    if not media_entry_tags:
        return ''

    delimiter = mg_globals.app_config['tags_delimiter']
    names = [tag['name'] for tag in media_entry_tags]
    return delimiter.join(names)
87
# Error message template: first placeholder is the maximum tag length,
# second is the comma-separated list of tags that exceed it.
TOO_LONG_TAG_WARNING = (
    u'Tags must be shorter than %s characters. Tags that are too long: %s')
90
def tag_length_validator(form, field):
    """
    Make sure tags do not exceed the maximum tag length.

    Parses field.data with convert_to_tag_list_of_dicts and raises
    wtforms.ValidationError, listing every offending tag, when any tag
    name is longer than app_config['tags_max_length'].  The form argument
    is not used.
    """
    oversized = []
    for tag in convert_to_tag_list_of_dicts(field.data):
        if len(tag['name']) > mg_globals.app_config['tags_max_length']:
            oversized.append(tag['name'])

    if not oversized:
        return

    message = TOO_LONG_TAG_WARNING % (
        mg_globals.app_config['tags_max_length'], ', '.join(oversized))
    raise wtforms.ValidationError(message)
104
105
# Single Markdown converter shared by cleaned_markdown_conversion.
# NOTE(review): safe_mode appears to be deprecated/removed in newer
# Python-Markdown releases -- confirm against the pinned version.
MARKDOWN_INSTANCE = markdown.Markdown(
    safe_mode='escape')
107
def cleaned_markdown_conversion(text):
    """
    Render a block of text through Markdown, then clean the resulting HTML.

    Falsy input (empty string, None) gives an empty unicode string:
    Markdown would do nothing with it and clean_html can do nothing
    with an empty string.
    """
    if text:
        rendered = MARKDOWN_INSTANCE.convert(text)
        return clean_html(rendered)

    return u''
113 # an empty string :)
114 if not text:
115 return u''
116
117 return clean_html(MARKDOWN_INSTANCE.convert(text))