[mediagoblin.git] / mediagoblin / tools / text.py

# GNU MediaGoblin -- federated, autonomous media hosting
# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import wtforms
import markdown
from lxml.html.clean import Cleaner

from mediagoblin import mg_globals
from mediagoblin.tools import url


# Very strict lxml.html Cleaner configuration: every removal option below is
# switched on and only the tags named in allow_tags are permitted.
HTML_CLEANER = Cleaner(
    # Tag whitelist.  remove_unknown_tags has to stay off here because it
    # can't be used together with allow_tags.
    allow_tags=[
        'div', 'b', 'i', 'em', 'strong',
        'p', 'ul', 'ol', 'li', 'a', 'br',
        'pre', 'code',
    ],
    remove_unknown_tags=False,
    # Attribute / link handling (add_nofollow: "for now")
    safe_attrs_only=True,
    add_nofollow=True,
    # Nothing is exempted through host or tag whitelists
    host_whitelist=(),
    whitelist_tags=set(),
    # Removal switches, all enabled
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True)


def clean_html(html):
    """
    Pass ``html`` through the strict HTML_CLEANER and return the result.

    Falsy input (e.g. an empty string) short-circuits to u'' without calling
    the cleaner, since clean_html barfs on an empty string.
    """
    return HTML_CLEANER.clean_html(html) if html else u''


def convert_to_tag_list_of_dicts(tag_string):
    """
    Turn a comma-separated string of user tags into a list of tag dicts.

    Leading and trailing whitespace is removed and every internal run of
    whitespace is collapsed to a single space; the string is then split on
    commas.  Empty tags and repeats of an already-seen tag name are dropped,
    keeping the order in which tags first appear.

    Returns a list of ``{'name': ..., 'slug': ...}`` dicts, where the slug is
    produced by ``url.slugify``.  A falsy ``tag_string`` gives an empty list.
    """
    taglist = []
    if not tag_string:
        return taglist

    # split() with no argument already ignores leading/trailing whitespace,
    # so joining on a single space normalizes all whitespace in one step
    normalized = u' '.join(tag_string.split())

    # Names already emitted; a set keeps the duplicate check constant-time
    # instead of rebuilding a list of names for every tag
    seen_names = set()
    for tag in normalized.split(','):
        tag = tag.strip()
        # Ignore empty or duplicate tags
        if not tag or tag in seen_names:
            continue
        seen_names.add(tag)
        taglist.append({'name': tag,
                        'slug': url.slugify(tag)})
    return taglist


def media_tags_as_string(media_entry_tags):
    """
    Join the names of a media item's tags into one ', '-separated string.

    ``media_entry_tags`` is a list of dicts each carrying a 'name' key; a
    falsy value (no tags) gives an empty string.

    This is the opposite of convert_to_tag_list_of_dicts
    """
    if not media_entry_tags:
        return ''
    return u', '.join(tag['name'] for tag in media_entry_tags)


# Message template for over-long tags; takes two %s values.
TOO_LONG_TAG_WARNING = (
    u'Tags must be shorter than %s characters.  Tags that are too long: %s')


def tag_length_validator(form, field):
    """
    Reject input in which any tag is longer than the configured maximum.

    ``field.data`` is parsed with convert_to_tag_list_of_dicts and every tag
    name is compared against ``mg_globals.app_config['tags_max_length']``.
    Raises wtforms.ValidationError listing the tags that are too long.
    """
    tags = convert_to_tag_list_of_dicts(field.data)
    if not tags:
        # No tags at all, so there is nothing to measure
        return

    max_length = mg_globals.app_config['tags_max_length']

    too_long_tags = []
    for tag in tags:
        name = tag['name']
        if len(name) > max_length:
            too_long_tags.append(name)

    if not too_long_tags:
        return

    message = TOO_LONG_TAG_WARNING % (max_length, ', '.join(too_long_tags))
    raise wtforms.ValidationError(message)


# Module-wide Markdown converter.  Markdown's own safe mode is deliberately
# not used, because lxml.html.clean is better and we are using it anyway.
# NOTE(review): this one instance is reused for every conversion; the
# Python-Markdown docs advise calling reset() between convert() calls on a
# shared instance -- confirm callers account for that.
UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()


def cleaned_markdown_conversion(text):
    """
    Take a block of text, run it through MarkDown, and clean its HTML.

    Returns the cleaned HTML produced by clean_html, or u'' when ``text`` is
    falsy.
    """
    # Markdown will do nothing with and clean_html can do nothing with
    # an empty string :)
    if not text:
        return u''

    # The Markdown instance is shared by every call and can retain parser
    # state (e.g. reference-style link definitions) from the previous
    # document.  Reset it first so one text cannot influence the next.
    UNSAFE_MARKDOWN_INSTANCE.reset()
    return clean_html(UNSAFE_MARKDOWN_INSTANCE.convert(text))
Commit	Line	Data
03ae172a	1	# GNU MediaGoblin -- federated, autonomous media hosting
cf29e8a8	2	# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
03ae172a AW	3	#
	4	# This program is free software: you can redistribute it and/or modify
	5	# it under the terms of the GNU Affero General Public License as published by
	6	# the Free Software Foundation, either version 3 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU Affero General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU Affero General Public License
	15	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	16
	17	import wtforms
	18	import markdown
	19	from lxml.html.clean import Cleaner
	20
	21	from mediagoblin import mg_globals
	22	from mediagoblin.tools import url
	23
ee91c2b8	24
03ae172a AW	25	# A super strict version of the lxml.html cleaner class
	26	HTML_CLEANER = Cleaner(
	27	scripts=True,
	28	javascript=True,
	29	comments=True,
	30	style=True,
	31	links=True,
	32	page_structure=True,
	33	processing_instructions=True,
	34	embedded=True,
	35	frames=True,
	36	forms=True,
	37	annoying_tags=True,
	38	allow_tags=[
cf29edcd JK	39	'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br',
cf29edcd JK	40	'pre', 'code'],
36c7d934	41	remove_unknown_tags=False, # can't be used with allow_tags
03ae172a	42	safe_attrs_only=True,
36c7d934	43	add_nofollow=True, # for now
03ae172a AW	44	host_whitelist=(),
	45	whitelist_tags=set([]))
	46
ee91c2b8	47
03ae172a AW	48	def clean_html(html):
	49	# clean_html barfs on an empty string
	50	if not html:
	51	return u''
	52
	53	return HTML_CLEANER.clean_html(html)
	54
ee91c2b8	55
03ae172a AW	56	def convert_to_tag_list_of_dicts(tag_string):
	57	"""
	58	Filter input from incoming string containing user tags,
	59
	60	Strips trailing, leading, and internal whitespace, and also converts
	61	the "tags" text into an array of tags
	62	"""
	63	taglist = []
	64	if tag_string:
	65
	66	# Strip out internal, trailing, and leading whitespace
	67	stripped_tag_string = u' '.join(tag_string.strip().split())
	68
	69	# Split the tag string into a list of tags
36c7d934	70	for tag in stripped_tag_string.split(','):
9061383d	71	tag = tag.strip()
03ae172a	72	# Ignore empty or duplicate tags
9061383d SS	73	if tag and tag not in [t['name'] for t in taglist]:
	74	taglist.append({'name': tag,
	75	'slug': url.slugify(tag)})
03ae172a AW	76	return taglist
03ae172a AW	77
ee91c2b8	78
03ae172a AW	79	def media_tags_as_string(media_entry_tags):
	80	"""
	81	Generate a string from a media item's tags, stored as a list of dicts
	82
	83	This is the opposite of convert_to_tag_list_of_dicts
	84	"""
2a6a3b8c	85	tags_string = ''
03ae172a	86	if media_entry_tags:
2a6a3b8c SS	87	tags_string = u', '.join([tag['name'] for tag in media_entry_tags])
2a6a3b8c SS	88	return tags_string
03ae172a	89
ee91c2b8	90
03ae172a AW	91	TOO_LONG_TAG_WARNING = \
	92	u'Tags must be shorter than %s characters. Tags that are too long: %s'
	93
ee91c2b8	94
03ae172a AW	95	def tag_length_validator(form, field):
	96	"""
	97	Make sure tags do not exceed the maximum tag length.
	98	"""
	99	tags = convert_to_tag_list_of_dicts(field.data)
	100	too_long_tags = [
	101	tag['name'] for tag in tags
	102	if len(tag['name']) > mg_globals.app_config['tags_max_length']]
	103
	104	if too_long_tags:
	105	raise wtforms.ValidationError(
36c7d934	106	TOO_LONG_TAG_WARNING % (mg_globals.app_config['tags_max_length'],
03ae172a AW	107	', '.join(too_long_tags)))
	108
	109
c0428016 JK	110	# Don't use the safe mode, because lxml.html.clean is better and we are using
	111	# it anyway
	112	UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()
03ae172a	113
ee91c2b8	114
03ae172a AW	115	def cleaned_markdown_conversion(text):
	116	"""
	117	Take a block of text, run it through MarkDown, and clean its HTML.
	118	"""
	119	# Markdown will do nothing with and clean_html can do nothing with
	120	# an empty string :)
	121	if not text:
	122	return u''
	123
c0428016	124	return clean_html(UNSAFE_MARKDOWN_INSTANCE.convert(text))