Commit | Line | Data |
---|---|---|
03ae172a | 1 | # GNU MediaGoblin -- federated, autonomous media hosting |
cf29e8a8 | 2 | # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS. |
03ae172a AW |
3 | # |
4 | # This program is free software: you can redistribute it and/or modify | |
5 | # it under the terms of the GNU Affero General Public License as published by | |
6 | # the Free Software Foundation, either version 3 of the License, or | |
7 | # (at your option) any later version. | |
8 | # | |
9 | # This program is distributed in the hope that it will be useful, | |
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | # GNU Affero General Public License for more details. | |
13 | # | |
14 | # You should have received a copy of the GNU Affero General Public License | |
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
16 | ||
679f7292 | 17 | import collections |
03ae172a AW |
18 | import wtforms |
19 | import markdown | |
20 | from lxml.html.clean import Cleaner | |
21 | ||
22 | from mediagoblin import mg_globals | |
23 | from mediagoblin.tools import url | |
24 | ||
ee91c2b8 | 25 | |
03ae172a AW |
# Deliberately strict lxml.html cleaner, shared by clean_html() below.
# Every removal switch is enabled and only a short list of basic text
# formatting tags is allowed through.
HTML_CLEANER = Cleaner(
    # Removal switches: all enabled.
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,
    # The only tags that are kept.
    allow_tags=[
        'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br',
        'pre', 'code',
    ],
    remove_unknown_tags=False,  # can't be used with allow_tags
    safe_attrs_only=True,
    add_nofollow=True,  # for now
    # Nothing is whitelisted for embedding.
    host_whitelist=(),
    whitelist_tags=set())
47 | ||
ee91c2b8 | 48 | |
03ae172a AW |
def clean_html(html):
    """
    Run a chunk of HTML through the module's strict HTML_CLEANER.

    Falsy input (empty string, None) is answered with an empty unicode
    string without calling the cleaner, because the cleaner barfs on an
    empty string.
    """
    if html:
        return HTML_CLEANER.clean_html(html)

    return u''
55 | ||
ee91c2b8 | 56 | |
03ae172a AW |
def convert_to_tag_list_of_dicts(tag_string):
    """
    Turn a comma-separated string of user tags into a list of tag dicts.

    Whitespace runs are collapsed to single spaces and each tag is stripped
    of leading and trailing whitespace; empty tags are dropped.  Every
    remaining tag becomes ``{'name': <tag>, 'slug': <url.slugify(tag)>}``.

    Tags are de-duplicated by slug: when several tags share a slug, the
    entry keeps the position of the first one but the name of the last.

    Returns an empty list for a falsy ``tag_string``.
    """
    if not tag_string:
        return []

    # Collapse internal whitespace and drop leading/trailing whitespace
    normalized = u' '.join(tag_string.split())

    names_by_slug = collections.OrderedDict()
    for raw_name in normalized.split(','):
        name = raw_name.strip()
        if not name:
            # Nothing between two commas (or only whitespace): skip it
            continue
        names_by_slug[url.slugify(name)] = name

    return [
        {'name': name, 'slug': slug}
        for slug, name in names_by_slug.items()]
03ae172a | 77 | |
ee91c2b8 | 78 | |
03ae172a AW |
def media_tags_as_string(media_entry_tags):
    """
    Join the names of a media item's tags into one comma-separated string.

    ``media_entry_tags`` is a list of dicts, each with a 'name' key.  This
    is the opposite of convert_to_tag_list_of_dicts.  A falsy value (empty
    list, None) gives an empty string.
    """
    if not media_entry_tags:
        return ''

    return u', '.join(entry['name'] for entry in media_entry_tags)
03ae172a | 89 | |
ee91c2b8 | 90 | |
03ae172a AW |
# Error message template used by tag_length_validator: the first %s is the
# maximum tag length, the second the comma-separated list of offending tags.
TOO_LONG_TAG_WARNING = (
    u'Tags must be shorter than %s characters. Tags that are too long: %s')
93 | ||
ee91c2b8 | 94 | |
03ae172a AW |
def tag_length_validator(form, field):
    """
    Form validator that rejects tags longer than the configured maximum.

    Parses ``field.data`` with convert_to_tag_list_of_dicts and compares
    each tag name against ``mg_globals.app_config['tags_max_length']``.

    Raises wtforms.ValidationError naming every offending tag when at
    least one tag name is longer than the limit.  ``form`` is not used.
    """
    offenders = []
    for entry in convert_to_tag_list_of_dicts(field.data):
        name = entry['name']
        if len(name) > mg_globals.app_config['tags_max_length']:
            offenders.append(name)

    if not offenders:
        return

    limit = mg_globals.app_config['tags_max_length']
    raise wtforms.ValidationError(
        TOO_LONG_TAG_WARNING % (limit, ', '.join(offenders)))
108 | ||
109 | ||
c0428016 JK |
# Module-level Markdown converter, created with markdown's default options.
# Don't use the safe mode, because lxml.html.clean is better and we are using
# it anyway (see cleaned_markdown_conversion).  As the name says, the raw
# output of this instance is not sanitized.
UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()
03ae172a | 113 | |
ee91c2b8 | 114 | |
03ae172a AW |
def cleaned_markdown_conversion(text):
    """
    Render a block of Markdown text to HTML and sanitize the result.

    The text is converted with UNSAFE_MARKDOWN_INSTANCE and the resulting
    HTML is passed through clean_html.  Falsy input (empty string, None)
    gives an empty unicode string without touching either step.
    """
    if text:
        rendered = UNSAFE_MARKDOWN_INSTANCE.convert(text)
        return clean_html(rendered)

    # Neither Markdown nor clean_html can do anything with an empty string
    return u''