Remove tag delimiter configurability (#390), fixed pep-8 compliance
[mediagoblin.git] / mediagoblin / tools / text.py
CommitLineData
# GNU MediaGoblin -- federated, autonomous media hosting
# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17import wtforms
18import markdown
19from lxml.html.clean import Cleaner
20
21from mediagoblin import mg_globals
22from mediagoblin.tools import url
23
ee91c2b8 24
03ae172a
AW
# A super strict version of the lxml.html cleaner class.
#
# Every removal option is switched on and only the handful of tags listed
# in allow_tags is permitted.
HTML_CLEANER = Cleaner(
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,
    allow_tags=[
        'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br',
        'pre', 'code'],
    remove_unknown_tags=False,  # can't be used with allow_tags
    safe_attrs_only=True,
    add_nofollow=True,  # for now
    host_whitelist=(),
    whitelist_tags=set())
46
ee91c2b8 47
03ae172a
AW
def clean_html(html):
    """
    Clean a piece of HTML with the module-level HTML_CLEANER.

    Falsy input (None or an empty string) is answered with an empty
    unicode string without calling the cleaner, because the cleaner
    barfs on an empty string.
    """
    if not html:
        return u''

    return HTML_CLEANER.clean_html(html)
54
ee91c2b8 55
03ae172a
AW
def convert_to_tag_list_of_dicts(tag_string):
    """
    Filter input from incoming string containing user tags.

    Strips leading and trailing whitespace, collapses every internal run
    of whitespace to a single space, and splits the result on commas.
    Empty tags and duplicates are dropped; the first occurrence of a tag
    wins and the input order is preserved.

    Returns a list of dicts of the form
    ``{'name': <tag>, 'slug': url.slugify(<tag>)}``.  A falsy tag_string
    (None or empty) yields an empty list.
    """
    taglist = []
    if not tag_string:
        return taglist

    # Normalize whitespace once, up front: split() with no argument
    # discards leading/trailing whitespace and collapses internal runs.
    normalized = u' '.join(tag_string.strip().split())

    # Names already emitted; a set keeps the duplicate check O(1) instead
    # of rebuilding a list of every accepted name for each candidate.
    seen = set()
    for raw_tag in normalized.split(','):
        name = raw_tag.strip()

        # Ignore empty or duplicate tags
        if not name or name in seen:
            continue

        seen.add(name)
        taglist.append({'name': name,
                        'slug': url.slugify(name)})
    return taglist
78
ee91c2b8 79
03ae172a
AW
def media_tags_as_string(media_entry_tags):
    """
    Generate a string from a media item's tags, stored as a list of dicts

    This is the opposite of convert_to_tag_list_of_dicts: the 'name' of
    every tag dict is joined with ', '.  A falsy argument (None or an
    empty list) gives an empty unicode string, so the return type is the
    same whether or not there are tags.
    """
    if not media_entry_tags:
        return u''

    return u', '.join(tag['name'] for tag in media_entry_tags)
92
ee91c2b8 93
03ae172a
AW
# Error message template used by tag_length_validator.  The first
# placeholder receives the configured maximum tag length, the second a
# comma-separated list of the offending tags.
TOO_LONG_TAG_WARNING = (
    u'Tags must be shorter than %s characters. Tags that are too long: %s')
96
ee91c2b8 97
03ae172a
AW
def tag_length_validator(form, field):
    """
    Make sure tags do not exceed the maximum tag length.

    field.data is parsed with convert_to_tag_list_of_dicts and every tag
    name longer than mg_globals.app_config['tags_max_length'] is
    collected.  A name whose length equals the limit is accepted.

    Raises wtforms.ValidationError, formatted from TOO_LONG_TAG_WARNING,
    when at least one tag is too long.  The form argument is not used.
    """
    # Read the limit once instead of once per tag and again for the
    # error message.
    max_length = mg_globals.app_config['tags_max_length']

    too_long_tags = [
        tag['name'] for tag in convert_to_tag_list_of_dicts(field.data)
        if len(tag['name']) > max_length]

    if too_long_tags:
        raise wtforms.ValidationError(
            TOO_LONG_TAG_WARNING % (max_length, ', '.join(too_long_tags)))
111
112
c0428016
JK
# Don't use the safe mode, because lxml.html.clean is better and we are using
# it anyway.  The HTML this instance produces is therefore NOT sanitized on
# its own and must be passed through clean_html before it is used.
UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()
03ae172a 116
ee91c2b8 117
03ae172a
AW
def cleaned_markdown_conversion(text):
    """
    Take a block of text, run it through MarkDown, and clean its HTML.

    Falsy input (None or an empty string) returns an empty unicode string
    straight away; otherwise the text is converted by the shared
    UNSAFE_MARKDOWN_INSTANCE and the resulting HTML is passed through
    clean_html.
    """
    # Markdown will do nothing with and clean_html can do nothing with
    # an empty string :)
    if not text:
        return u''

    # The Markdown instance is shared by every caller.  Reset it first so
    # that per-document state left behind by a previous conversion (for
    # example link reference definitions) cannot leak into this one.
    UNSAFE_MARKDOWN_INSTANCE.reset()
    return clean_html(UNSAFE_MARKDOWN_INSTANCE.convert(text))