[mediagoblin.git] / mediagoblin / tools / text.py

# GNU MediaGoblin -- federated, autonomous media hosting
# Copyright (C) 2011 MediaGoblin contributors.  See AUTHORS.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import wtforms
import markdown
from lxml.html.clean import Cleaner

from mediagoblin import mg_globals
from mediagoblin.tools import url

# Deliberately restrictive lxml.html Cleaner: every filter named below is
# switched on, and only a short list of basic formatting tags is allowed.
HTML_CLEANER = Cleaner(
    # Filters that are enabled
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,
    safe_attrs_only=True,
    add_nofollow=True,  # for now
    # The only tags that are allowed
    allow_tags=[
        'div',
        'b', 'i', 'em', 'strong',
        'p', 'ul', 'ol', 'li',
        'a', 'br',
    ],
    remove_unknown_tags=False,  # can't be used with allow_tags
    # No host or tag is whitelisted
    host_whitelist=(),
    whitelist_tags=set())

def clean_html(html):
    """
    Run a piece of HTML through HTML_CLEANER and return the result.

    Input with nothing to clean (a false value such as None or an empty
    string, or a string holding only whitespace) gives back u'' without
    calling the cleaner.
    """
    # clean_html barfs on an empty string
    if not html:
        return u''

    # A whitespace-only string is no better than an empty one: lxml's
    # parser rejects it as an empty document.  The hasattr check leaves
    # non-string input to the cleaner untouched.
    if hasattr(html, 'strip') and not html.strip():
        return u''

    return HTML_CLEANER.clean_html(html)

def convert_to_tag_list_of_dicts(tag_string):
    """
    Turn a user-supplied tag string into a list of tag dicts.

    Leading and trailing whitespace is dropped and internal runs of
    whitespace are collapsed to single spaces before the string is split
    on the configured 'tags_delimiter'.  Empty tags and repeated tags are
    skipped; the first occurrence of each tag keeps its position.

    Returns a list of {'name': ..., 'slug': ...} dicts, where the slug is
    produced by url.slugify.  A false value (None, '') gives an empty list.
    """
    if not tag_string:
        return []

    # Strip out internal, trailing, and leading whitespace
    stripped_tag_string = u' '.join(tag_string.strip().split())

    taglist = []
    # Names already emitted.  A set keeps the duplicate check cheap instead
    # of rebuilding a list of every name for each tag.
    seen_names = set()

    # Split the tag string into a list of tags
    for raw_tag in stripped_tag_string.split(
            mg_globals.app_config['tags_delimiter']):
        name = raw_tag.strip()

        # Ignore empty or duplicate tags
        if not name or name in seen_names:
            continue

        seen_names.add(name)
        taglist.append({'name': name,
                        'slug': url.slugify(name)})
    return taglist

def media_tags_as_string(media_entry_tags):
    """
    Join the names of a media item's tags into one string.

    The tags are expected as a list of dicts with a 'name' key; the names
    are joined with the configured 'tags_delimiter'.  A false value (no
    tags) gives an empty string.

    This is the opposite of convert_to_tag_list_of_dicts
    """
    if not media_entry_tags:
        return ''

    delimiter = mg_globals.app_config['tags_delimiter']
    return delimiter.join([entry['name'] for entry in media_entry_tags])

# Message template used by tag_length_validator; it is filled with the
# maximum tag length and a comma-separated list of the offending tags.
TOO_LONG_TAG_WARNING = (
    u'Tags must be shorter than %s characters.  '
    u'Tags that are too long: %s')

def tag_length_validator(form, field):
    """
    Form validator that rejects tags longer than the configured maximum.

    field.data is parsed with convert_to_tag_list_of_dicts and every tag
    name longer than the 'tags_max_length' setting is collected.

    Raises wtforms.ValidationError, listing the offending tags, if any
    were found.
    """
    names = [
        entry['name']
        for entry in convert_to_tag_list_of_dicts(field.data)]
    oversized = [
        name for name in names
        if len(name) > mg_globals.app_config['tags_max_length']]

    if not oversized:
        return

    limit = mg_globals.app_config['tags_max_length']
    message = TOO_LONG_TAG_WARNING % (limit, ', '.join(oversized))
    raise wtforms.ValidationError(message)


# Module-level Markdown converter used by cleaned_markdown_conversion.
# NOTE(review): the safe_mode option appears to have been deprecated in
# Python-Markdown 2.5 and removed in 3.0 -- confirm which Markdown version
# this project runs against before upgrading.
MARKDOWN_INSTANCE = markdown.Markdown(safe_mode='escape')

def cleaned_markdown_conversion(text):
    """
    Convert a block of text with MARKDOWN_INSTANCE and clean the result.

    The converted HTML is passed through clean_html before it is
    returned.  A false value (None, '') gives u'' straight away, since
    there is nothing to convert or clean.
    """
    if text:
        rendered = MARKDOWN_INSTANCE.convert(text)
        return clean_html(rendered)

    return u''
Commit	Line	Data
03ae172a AW	1	# GNU MediaGoblin -- federated, autonomous media hosting
	2	# Copyright (C) 2011 MediaGoblin contributors. See AUTHORS.
	3	#
	4	# This program is free software: you can redistribute it and/or modify
	5	# it under the terms of the GNU Affero General Public License as published by
	6	# the Free Software Foundation, either version 3 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU Affero General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU Affero General Public License
	15	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	16
	17	import wtforms
	18	import markdown
	19	from lxml.html.clean import Cleaner
	20
	21	from mediagoblin import mg_globals
	22	from mediagoblin.tools import url
	23
	24	# A super strict version of the lxml.html cleaner class
	25	HTML_CLEANER = Cleaner(
	26	scripts=True,
	27	javascript=True,
	28	comments=True,
	29	style=True,
	30	links=True,
	31	page_structure=True,
	32	processing_instructions=True,
	33	embedded=True,
	34	frames=True,
	35	forms=True,
	36	annoying_tags=True,
	37	allow_tags=[
	38	'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br'],
	39	remove_unknown_tags=False, # can't be used with allow_tags
	40	safe_attrs_only=True,
	41	add_nofollow=True, # for now
	42	host_whitelist=(),
	43	whitelist_tags=set([]))
	44
	45	def clean_html(html):
	46	# clean_html barfs on an empty string
	47	if not html:
	48	return u''
	49
	50	return HTML_CLEANER.clean_html(html)
	51
	52	def convert_to_tag_list_of_dicts(tag_string):
	53	"""
	54	Filter input from incoming string containing user tags,
	55
	56	Strips trailing, leading, and internal whitespace, and also converts
	57	the "tags" text into an array of tags
	58	"""
	59	taglist = []
	60	if tag_string:
	61
	62	# Strip out internal, trailing, and leading whitespace
	63	stripped_tag_string = u' '.join(tag_string.strip().split())
	64
65	# Split the tag string into a list of tags
66	for tag in stripped_tag_string.split(
67	mg_globals.app_config['tags_delimiter']):
68
69	# Ignore empty or duplicate tags
70	if tag.strip() and tag.strip() not in [t['name'] for t in taglist]:
71
72	taglist.append({'name': tag.strip(),
73	'slug': url.slugify(tag.strip())})
74	return taglist
75
76	def media_tags_as_string(media_entry_tags):
77	"""
78	Generate a string from a media item's tags, stored as a list of dicts
79
80	This is the opposite of convert_to_tag_list_of_dicts
81	"""
82	media_tag_string = ''
83	if media_entry_tags:
84	media_tag_string = mg_globals.app_config['tags_delimiter'].join(
85	[tag['name'] for tag in media_entry_tags])
86	return media_tag_string
87
88	TOO_LONG_TAG_WARNING = \
89	u'Tags must be shorter than %s characters. Tags that are too long: %s'
90
91	def tag_length_validator(form, field):
92	"""
93	Make sure tags do not exceed the maximum tag length.
94	"""
95	tags = convert_to_tag_list_of_dicts(field.data)
96	too_long_tags = [
97	tag['name'] for tag in tags
98	if len(tag['name']) > mg_globals.app_config['tags_max_length']]
99
100	if too_long_tags:
101	raise wtforms.ValidationError(
102	TOO_LONG_TAG_WARNING % (mg_globals.app_config['tags_max_length'], \
103	', '.join(too_long_tags)))
104
105
106	MARKDOWN_INSTANCE = markdown.Markdown(safe_mode='escape')
107
108	def cleaned_markdown_conversion(text):
109	"""
110	Take a block of text, run it through MarkDown, and clean its HTML.
111	"""
112	# Markdown will do nothing with and clean_html can do nothing with
113	# an empty string :)
114	if not text:
115	return u''
116
117	return clean_html(MARKDOWN_INSTANCE.convert(text))