[mediagoblin.git] / mediagoblin / tools / text.py

# GNU MediaGoblin -- federated, autonomous media hosting
# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import collections
import wtforms
import markdown
from lxml.html.clean import Cleaner

from mediagoblin import mg_globals
from mediagoblin.tools import url


# Deliberately restrictive lxml.html Cleaner, shared by clean_html() below.
# Every "strip this kind of content" switch is turned on, and only a short
# whitelist of basic formatting tags is let through.
HTML_CLEANER = Cleaner(
    # Content categories to strip.
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,

    # The only tags that are allowed through.
    allow_tags=[
        'div',
        'b', 'i', 'em', 'strong',
        'p',
        'ul', 'ol', 'li',
        'a',
        'br',
        'pre', 'code',
    ],
    remove_unknown_tags=False,  # can't be used with allow_tags

    # Attribute / link handling.
    safe_attrs_only=True,
    add_nofollow=True,  # for now

    # Both whitelists are left empty (no hosts, no tags).
    host_whitelist=(),
    whitelist_tags=set(),
)


def clean_html(html):
    """
    Run an HTML string through HTML_CLEANER and return the result.

    Falsy input (empty string, None) short-circuits to an empty unicode
    string, because the underlying cleaner chokes on an empty string.
    """
    return HTML_CLEANER.clean_html(html) if html else u''


def convert_to_tag_list_of_dicts(tag_string):
    """
    Turn a user-supplied, comma-separated tag string into a list of dicts.

    Runs of whitespace are collapsed to single spaces, the string is split
    on commas, and each piece is stripped.  Empty pieces are dropped.

    Each resulting dict has a 'name' key (the cleaned tag text) and a
    'slug' key (url.slugify of that text).  Tags are de-duplicated by
    slug: when two tags share a slug, only one entry is produced, placed
    where the slug was first seen and carrying the name seen last.

    A falsy tag_string (empty string, None) yields an empty list.
    """
    if not tag_string:
        return []

    # split() with no argument already discards leading/trailing
    # whitespace and treats any whitespace run as a single separator.
    normalized = u' '.join(tag_string.split())

    # Maps slug -> name; insertion-ordered so output order is stable.
    names_by_slug = collections.OrderedDict()
    for piece in normalized.split(','):
        name = piece.strip()
        if not name:
            # Nothing between two commas (or only whitespace): skip it.
            continue
        names_by_slug[url.slugify(name)] = name

    return [
        {'name': name, 'slug': slug}
        for slug, name in names_by_slug.items()]


def media_tags_as_string(media_entry_tags):
    """
    Join the names of a media entry's tags into one ', '-separated string.

    media_entry_tags is a list of dicts, each with a 'name' key; this is
    the inverse of convert_to_tag_list_of_dicts.  A falsy value (empty
    list, None) produces an empty string.
    """
    if not media_entry_tags:
        return ''

    return u', '.join(entry['name'] for entry in media_entry_tags)


# Message template for over-long tags.  The first %s takes the maximum
# tag length, the second takes the offending tag names.
TOO_LONG_TAG_WARNING = (
    u'Tags must be shorter than %s characters.  Tags that are too long: %s')


def tag_length_validator(form, field):
    """
    WTForms-style validator rejecting tags longer than the configured limit.

    field.data is parsed with convert_to_tag_list_of_dicts and each tag
    name is measured against mg_globals.app_config['tags_max_length'].
    The form argument is not used.

    Raises wtforms.ValidationError, listing every offending tag, if any
    tag name is longer than the limit.
    """
    parsed_tags = convert_to_tag_list_of_dicts(field.data)
    if not parsed_tags:
        # No tags at all: nothing to measure, config is never consulted.
        return

    max_length = mg_globals.app_config['tags_max_length']

    # NOTE(review): a tag of exactly max_length characters passes this
    # check, although the message text says "shorter than" -- confirm
    # which of the two is intended.
    offenders = []
    for parsed in parsed_tags:
        if len(parsed['name']) > max_length:
            offenders.append(parsed['name'])

    if offenders:
        message = TOO_LONG_TAG_WARNING % (max_length, ', '.join(offenders))
        raise wtforms.ValidationError(message)


# Module-level Markdown converter, created without Markdown's own safe
# mode: lxml.html.clean is better and we are using it anyway (see
# cleaned_markdown_conversion, which passes the output through clean_html).
# The output of this instance on its own is therefore NOT sanitized.
UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()


def cleaned_markdown_conversion(text):
    """
    Take a block of text, run it through Markdown, and clean its HTML.

    Returns the sanitized HTML produced by clean_html.  Falsy input
    (empty string, None) returns an empty unicode string without
    invoking either Markdown or the cleaner.
    """
    # Markdown will do nothing with and clean_html can do nothing with
    # an empty string :)
    if not text:
        return u''

    # The Markdown instance is shared by every call.  Python-Markdown
    # keeps per-document state on the instance and documents that reset()
    # should be called between convert() calls, so clear that state first
    # to keep one document's leftovers out of the next conversion.
    UNSAFE_MARKDOWN_INSTANCE.reset()
    return clean_html(UNSAFE_MARKDOWN_INSTANCE.convert(text))
Commit	Line	Data
03ae172a	1	# GNU MediaGoblin -- federated, autonomous media hosting
cf29e8a8	2	# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
03ae172a AW	3	#
	4	# This program is free software: you can redistribute it and/or modify
	5	# it under the terms of the GNU Affero General Public License as published by
	6	# the Free Software Foundation, either version 3 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU Affero General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU Affero General Public License
	15	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	16
679f7292	17	import collections
03ae172a AW	18	import wtforms
	19	import markdown
	20	from lxml.html.clean import Cleaner
	21
	22	from mediagoblin import mg_globals
	23	from mediagoblin.tools import url
	24
ee91c2b8	25
03ae172a AW	26	# A super strict version of the lxml.html cleaner class
	27	HTML_CLEANER = Cleaner(
	28	scripts=True,
	29	javascript=True,
	30	comments=True,
	31	style=True,
	32	links=True,
	33	page_structure=True,
	34	processing_instructions=True,
	35	embedded=True,
	36	frames=True,
	37	forms=True,
	38	annoying_tags=True,
	39	allow_tags=[
cf29edcd JK	40	'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br',
cf29edcd JK	41	'pre', 'code'],
36c7d934	42	remove_unknown_tags=False, # can't be used with allow_tags
03ae172a	43	safe_attrs_only=True,
36c7d934	44	add_nofollow=True, # for now
03ae172a AW	45	host_whitelist=(),
	46	whitelist_tags=set([]))
	47
ee91c2b8	48
03ae172a AW	49	def clean_html(html):
	50	# clean_html barfs on an empty string
	51	if not html:
	52	return u''
	53
	54	return HTML_CLEANER.clean_html(html)
	55
ee91c2b8	56
03ae172a AW	57	def convert_to_tag_list_of_dicts(tag_string):
	58	"""
	59	Filter input from incoming string containing user tags,
	60
	61	Strips trailing, leading, and internal whitespace, and also converts
	62	the "tags" text into an array of tags
	63	"""
679f7292	64	slug_to_name = collections.OrderedDict()
03ae172a AW	65	if tag_string:
	66
	67	# Strip out internal, trailing, and leading whitespace
	68	stripped_tag_string = u' '.join(tag_string.strip().split())
	69
	70	# Split the tag string into a list of tags
36c7d934	71	for tag in stripped_tag_string.split(','):
9061383d	72	tag = tag.strip()
679f7292 LD	73	# Ignore empty tags or duplicate slugs
	74	if tag:
	75	slug_to_name[url.slugify(tag)] = tag
1eaad45f	76	return [{'name': v, 'slug': k} for (k,v) in slug_to_name.items()]
03ae172a	77
ee91c2b8	78
03ae172a AW	79	def media_tags_as_string(media_entry_tags):
	80	"""
	81	Generate a string from a media item's tags, stored as a list of dicts
	82
	83	This is the opposite of convert_to_tag_list_of_dicts
	84	"""
2a6a3b8c	85	tags_string = ''
03ae172a	86	if media_entry_tags:
2a6a3b8c SS	87	tags_string = u', '.join([tag['name'] for tag in media_entry_tags])
2a6a3b8c SS	88	return tags_string
03ae172a	89
ee91c2b8	90
03ae172a AW	91	TOO_LONG_TAG_WARNING = \
	92	u'Tags must be shorter than %s characters. Tags that are too long: %s'
	93
ee91c2b8	94
03ae172a AW	95	def tag_length_validator(form, field):
	96	"""
	97	Make sure tags do not exceed the maximum tag length.
	98	"""
	99	tags = convert_to_tag_list_of_dicts(field.data)
	100	too_long_tags = [
	101	tag['name'] for tag in tags
	102	if len(tag['name']) > mg_globals.app_config['tags_max_length']]
	103
	104	if too_long_tags:
	105	raise wtforms.ValidationError(
36c7d934	106	TOO_LONG_TAG_WARNING % (mg_globals.app_config['tags_max_length'],
03ae172a AW	107	', '.join(too_long_tags)))
	108
	109
c0428016 JK	110	# Don't use the safe mode, because lxml.html.clean is better and we are using
	111	# it anyway
	112	UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()
03ae172a	113
ee91c2b8	114
03ae172a AW	115	def cleaned_markdown_conversion(text):
	116	"""
	117	Take a block of text, run it through MarkDown, and clean its HTML.
	118	"""
	119	# Markdown will do nothing with and clean_html can do nothing with
	120	# an empty string :)
	121	if not text:
	122	return u''
	123
c0428016	124	return clean_html(UNSAFE_MARKDOWN_INSTANCE.convert(text))