Change complimentary_task to complementary_task
[mediagoblin.git] / mediagoblin / tools / text.py
CommitLineData
03ae172a 1# GNU MediaGoblin -- federated, autonomous media hosting
cf29e8a8 2# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
03ae172a
AW
3#
4# This program is free software: you can redistribute it and/or modify
5# it under the terms of the GNU Affero General Public License as published by
6# the Free Software Foundation, either version 3 of the License, or
7# (at your option) any later version.
8#
9# This program is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12# GNU Affero General Public License for more details.
13#
14# You should have received a copy of the GNU Affero General Public License
15# along with this program. If not, see <http://www.gnu.org/licenses/>.
16
679f7292 17import collections
03ae172a
AW
18import wtforms
19import markdown
20from lxml.html.clean import Cleaner
21
22from mediagoblin import mg_globals
23from mediagoblin.tools import url
24
ee91c2b8 25
03ae172a
AW
# Deliberately strict lxml.html Cleaner used for all user-supplied HTML:
# only a short list of basic formatting tags is allowed through.
HTML_CLEANER = Cleaner(
    # Categories of content handled by the cleaner's built-in switches
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,
    # The only tags that are allowed
    allow_tags=[
        'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br',
        'pre', 'code',
    ],
    remove_unknown_tags=False,  # can't be used with allow_tags
    safe_attrs_only=True,
    add_nofollow=True,  # for now
    # Nothing is whitelisted
    host_whitelist=(),
    whitelist_tags=set())
47
ee91c2b8 48
03ae172a
AW
def clean_html(html):
    """
    Run ``html`` through HTML_CLEANER and return the cleaned markup.

    Falsy input (empty string, None) is answered with u'' directly,
    because the cleaner barfs on an empty string.
    """
    return HTML_CLEANER.clean_html(html) if html else u''
55
ee91c2b8 56
03ae172a
AW
def convert_to_tag_list_of_dicts(tag_string):
    """
    Turn a comma-separated string of user tags into a list of tag dicts.

    Leading and trailing whitespace is removed, runs of internal
    whitespace are collapsed to a single space, and empty tags are
    dropped. Every entry of the result has the form
    ``{'name': <tag text>, 'slug': <url.slugify(tag text)>}``.

    When several tags produce the same slug only one entry is kept: it
    stays at the position of the first occurrence and carries the name
    of the last one.
    """
    if not tag_string:
        return []

    # Collapse every run of whitespace (including at the ends) first
    normalized = u' '.join(tag_string.split())

    # Keyed by slug so duplicates collapse while insertion order is kept
    names_by_slug = collections.OrderedDict()
    for piece in normalized.split(','):
        name = piece.strip()
        if not name:
            continue
        names_by_slug[url.slugify(name)] = name

    return [
        {'name': name, 'slug': slug}
        for slug, name in names_by_slug.items()]
03ae172a 77
ee91c2b8 78
03ae172a
AW
def media_tags_as_string(media_entry_tags):
    """
    Generate a string from a media item's tags, stored as a list of dicts.

    This is the opposite of convert_to_tag_list_of_dicts.

    :param media_entry_tags: list of dicts that each have a 'name' key;
        may be empty or None.
    :returns: the tag names joined with u', ', or u'' when there are no
        tags.
    """
    if not media_entry_tags:
        # A unicode literal, so the result type matches the joined branch
        return u''
    return u', '.join(tag['name'] for tag in media_entry_tags)
03ae172a 89
ee91c2b8 90
03ae172a
AW
# Error message template for over-long tags; takes the maximum length and
# the list of offending tags, in that order.
TOO_LONG_TAG_WARNING = (
    u'Tags must be shorter than %s characters. '
    u'Tags that are too long: %s')
93
ee91c2b8 94
03ae172a
AW
def tag_length_validator(form, field):
    """
    Form validator that rejects tags exceeding the maximum tag length.

    ``field.data`` is parsed with convert_to_tag_list_of_dicts; if any
    tag name is longer than mg_globals.app_config['tags_max_length'], a
    wtforms.ValidationError listing all such tags is raised.
    """
    offending_names = []
    for tag in convert_to_tag_list_of_dicts(field.data):
        name = tag['name']
        if len(name) > mg_globals.app_config['tags_max_length']:
            offending_names.append(name)

    if not offending_names:
        return

    message = TOO_LONG_TAG_WARNING % (
        mg_globals.app_config['tags_max_length'],
        ', '.join(offending_names))
    raise wtforms.ValidationError(message)
108
109
c0428016
JK
# Markdown's own safe mode is intentionally not enabled here: the output
# is passed through lxml.html.clean anyway, which does a better job.
UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()
03ae172a 113
ee91c2b8 114
03ae172a
AW
def cleaned_markdown_conversion(text):
    """
    Take a block of text, run it through MarkDown, and clean its HTML.

    :param text: Markdown source; may be empty or None.
    :returns: the converted HTML after it has been passed through
        clean_html, or u'' for falsy input.
    """
    # Markdown will do nothing with and clean_html can do nothing with
    # an empty string :)
    if not text:
        return u''

    # The Markdown instance is shared by every caller and keeps
    # per-document state (reference-style link definitions, stashed raw
    # HTML) between convert() calls. Reset it first so nothing from a
    # previously converted text carries over into this one.
    UNSAFE_MARKDOWN_INSTANCE.reset()
    return clean_html(UNSAFE_MARKDOWN_INSTANCE.convert(text))