[mediagoblin.git] / mediagoblin / tools / text.py

# GNU MediaGoblin -- federated, autonomous media hosting
# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import wtforms
import markdown
from lxml.html.clean import Cleaner

from mediagoblin import mg_globals
from mediagoblin.tools import url


# Deliberately restrictive lxml.html Cleaner for user-supplied markup: only
# the short list of tags in ``allow_tags`` is permitted.
HTML_CLEANER = Cleaner(
    # Tag whitelist.
    allow_tags=[
        'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br'],
    # Must stay off: it can't be used together with allow_tags.
    remove_unknown_tags=False,

    # Scriptable, embedded and interactive content.
    scripts=True,
    javascript=True,
    embedded=True,
    frames=True,
    forms=True,

    # Document-level and presentational noise.
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    annoying_tags=True,

    # Attribute and link policy.
    safe_attrs_only=True,
    add_nofollow=True,  # for now

    # Nothing is whitelisted for embedding.
    host_whitelist=(),
    whitelist_tags=set())

# Separator between individual tags in a user-entered tag string.
TAGS_DELIMITER = ','

def clean_html(html):
    """
    Run ``html`` through HTML_CLEANER and return the cleaned markup.

    A false-y value (``None``, empty string) short-circuits to ``u''``,
    because the cleaner cannot cope with an empty string.
    """
    return HTML_CLEANER.clean_html(html) if html else u''


def convert_to_tag_list_of_dicts(tag_string):
    """
    Convert a delimited string of user-supplied tags into a list of dicts.

    Every run of whitespace in the input (leading, trailing or internal)
    is collapsed to a single space, the result is split on TAGS_DELIMITER
    and each piece is stripped of surrounding whitespace.  Empty pieces
    and repeated names are dropped; the first occurrence wins and input
    order is preserved.

    Returns a list of ``{'name': ..., 'slug': ...}`` dicts, where the slug
    is ``url.slugify(name)``.  A false-y ``tag_string`` (``None``, ``''``)
    yields an empty list.
    """
    taglist = []
    if not tag_string:
        return taglist

    # split() with no separator already discards leading/trailing
    # whitespace, so joining the pieces normalises all whitespace at once.
    normalized = u' '.join(tag_string.split())

    # Names emitted so far: constant-time duplicate checks instead of
    # rebuilding a list of names for every candidate tag.
    seen = set()
    for raw_tag in normalized.split(TAGS_DELIMITER):
        name = raw_tag.strip()

        # Ignore empty or duplicate tags
        if not name or name in seen:
            continue

        seen.add(name)
        taglist.append({'name': name,
                        'slug': url.slugify(name)})
    return taglist


def media_tags_as_string(media_entry_tags):
    """
    Render a media entry's tags (a list of dicts) as one editable string.

    The 'name' of each tag is joined with TAGS_DELIMITER followed by a
    space.  This is the inverse of convert_to_tag_list_of_dicts.  With no
    tags (``None`` or an empty list) the result is the empty string.
    """
    if not media_entry_tags:
        return ''

    separator = TAGS_DELIMITER + u' '
    names = [entry['name'] for entry in media_entry_tags]
    return separator.join(names)


# Validation message template: the first %s receives the maximum tag length,
# the second the comma-separated list of offending tags.
TOO_LONG_TAG_WARNING = (
    u'Tags must be shorter than %s characters.  Tags that are too long: %s')


def tag_length_validator(form, field):
    """
    WTForms validator rejecting tags longer than the configured maximum.

    ``field.data`` is parsed with convert_to_tag_list_of_dicts and every
    tag name longer than ``mg_globals.app_config['tags_max_length']`` is
    collected.  If any are found, a ``wtforms.ValidationError`` built from
    TOO_LONG_TAG_WARNING is raised, listing the limit and the offenders.
    ``form`` is not used by this validator.
    """
    oversized = []
    for tag in convert_to_tag_list_of_dicts(field.data):
        name = tag['name']
        if len(name) > mg_globals.app_config['tags_max_length']:
            oversized.append(name)

    # Nothing too long: the field is valid.
    if not oversized:
        return

    raise wtforms.ValidationError(
        TOO_LONG_TAG_WARNING % (
            mg_globals.app_config['tags_max_length'],
            ', '.join(oversized)))


# Shared, module-level Markdown converter used by cleaned_markdown_conversion.
# safe_mode='escape' asks Python-Markdown to escape raw HTML found in the
# input rather than pass it through.
# NOTE(review): safe_mode was deprecated in Python-Markdown 2.5 and dropped
# in 3.0 -- confirm which version is pinned before upgrading the dependency.
MARKDOWN_INSTANCE = markdown.Markdown(safe_mode='escape')


def cleaned_markdown_conversion(text):
    """
    Render ``text`` as Markdown and sanitise the resulting HTML.

    The text is converted with MARKDOWN_INSTANCE and the output is passed
    through clean_html.  A false-y ``text`` (``None``, empty string)
    returns ``u''`` without invoking either step, since neither has
    anything useful to do with an empty string.
    """
    if text:
        rendered = MARKDOWN_INSTANCE.convert(text)
        return clean_html(rendered)

    return u''
Commit	Line	Data
03ae172a	1	# GNU MediaGoblin -- federated, autonomous media hosting
cf29e8a8	2	# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
03ae172a AW	3	#
	4	# This program is free software: you can redistribute it and/or modify
	5	# it under the terms of the GNU Affero General Public License as published by
	6	# the Free Software Foundation, either version 3 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU Affero General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU Affero General Public License
	15	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	16
	17	import wtforms
	18	import markdown
	19	from lxml.html.clean import Cleaner
	20
	21	from mediagoblin import mg_globals
	22	from mediagoblin.tools import url
	23
ee91c2b8	24
03ae172a AW	25	# A super strict version of the lxml.html cleaner class
	26	HTML_CLEANER = Cleaner(
	27	scripts=True,
	28	javascript=True,
	29	comments=True,
	30	style=True,
	31	links=True,
	32	page_structure=True,
	33	processing_instructions=True,
	34	embedded=True,
	35	frames=True,
	36	forms=True,
	37	annoying_tags=True,
	38	allow_tags=[
	39	'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br'],
	40	remove_unknown_tags=False, # can't be used with allow_tags
	41	safe_attrs_only=True,
	42	add_nofollow=True, # for now
	43	host_whitelist=(),
	44	whitelist_tags=set([]))
	45
3038ba87	46	TAGS_DELIMITER=',';
ee91c2b8	47
03ae172a AW	48	def clean_html(html):
	49	# clean_html barfs on an empty string
	50	if not html:
	51	return u''
	52
	53	return HTML_CLEANER.clean_html(html)
	54
ee91c2b8	55
03ae172a AW	56	def convert_to_tag_list_of_dicts(tag_string):
	57	"""
	58	Filter input from incoming string containing user tags,
	59
	60	Strips trailing, leading, and internal whitespace, and also converts
	61	the "tags" text into an array of tags
	62	"""
	63	taglist = []
	64	if tag_string:
	65
	66	# Strip out internal, trailing, and leading whitespace
	67	stripped_tag_string = u' '.join(tag_string.strip().split())
	68
	69	# Split the tag string into a list of tags
	70	for tag in stripped_tag_string.split(
3038ba87	71	TAGS_DELIMITER):
03ae172a AW	72
	73	# Ignore empty or duplicate tags
	74	if tag.strip() and tag.strip() not in [t['name'] for t in taglist]:
	75
	76	taglist.append({'name': tag.strip(),
	77	'slug': url.slugify(tag.strip())})
	78	return taglist
	79
ee91c2b8	80
03ae172a AW	81	def media_tags_as_string(media_entry_tags):
	82	"""
	83	Generate a string from a media item's tags, stored as a list of dicts
	84
	85	This is the opposite of convert_to_tag_list_of_dicts
	86	"""
	87	media_tag_string = ''
	88	if media_entry_tags:
3038ba87	89	media_tag_string = (TAGS_DELIMITER+u' ').join(
03ae172a AW	90	[tag['name'] for tag in media_entry_tags])
	91	return media_tag_string
	92
ee91c2b8	93
03ae172a AW	94	TOO_LONG_TAG_WARNING = \
	95	u'Tags must be shorter than %s characters. Tags that are too long: %s'
	96
ee91c2b8	97
03ae172a AW	98	def tag_length_validator(form, field):
	99	"""
	100	Make sure tags do not exceed the maximum tag length.
	101	"""
	102	tags = convert_to_tag_list_of_dicts(field.data)
	103	too_long_tags = [
	104	tag['name'] for tag in tags
	105	if len(tag['name']) > mg_globals.app_config['tags_max_length']]
	106
	107	if too_long_tags:
	108	raise wtforms.ValidationError(
	109	TOO_LONG_TAG_WARNING % (mg_globals.app_config['tags_max_length'], \
	110	', '.join(too_long_tags)))
	111
	112
	113	MARKDOWN_INSTANCE = markdown.Markdown(safe_mode='escape')
	114
ee91c2b8	115
03ae172a AW	116	def cleaned_markdown_conversion(text):
	117	"""
	118	Take a block of text, run it through MarkDown, and clean its HTML.
	119	"""
	120	# Markdown will do nothing with and clean_html can do nothing with
	121	# an empty string :)
	122	if not text:
	123	return u''
	124
	125	return clean_html(MARKDOWN_INSTANCE.convert(text))