[mediagoblin.git] / mediagoblin / media_types / ascii / processing.py

# GNU MediaGoblin -- federated, autonomous media hosting
# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import chardet
import os
try:
    from PIL import Image
except ImportError:
    import Image
import logging

from mediagoblin import mg_globals as mgg
from mediagoblin.processing import create_pub_filepath
from mediagoblin.media_types.ascii import asciitoimage

_log = logging.getLogger(__name__)

SUPPORTED_EXTENSIONS = ['txt', 'asc', 'nfo']
MEDIA_TYPE = 'mediagoblin.media_types.ascii'


def sniff_handler(media_file, **kw):
    """Decide whether an upload belongs to the ASCII media type.

    Only the extension of ``kw['media'].filename`` is examined; the
    ``media_file`` argument itself is not inspected here.

    Returns MEDIA_TYPE when the lower-cased extension (without the leading
    dot) is listed in SUPPORTED_EXTENSIONS.  Returns None otherwise,
    including when no ``media`` keyword argument was supplied.
    """
    _log.info('Sniffing {0}'.format(MEDIA_TYPE))

    media = kw.get('media')
    if media is None:
        # Nothing to derive an extension from.
        return None

    extension = os.path.splitext(media.filename)[1]
    # splitext keeps the leading dot; drop it before comparing.
    if extension[1:].lower() in SUPPORTED_EXTENSIONS:
        return MEDIA_TYPE

    return None


def process_ascii(proc_state):
    """Code to process a txt file. Will be run by celery.

    Takes the queued text file referenced by ``proc_state.entry`` and
    produces three public files, which are recorded in the entry's
    ``media_files`` dict:

    * ``thumb``    -- a PNG thumbnail rendered from the text,
    * ``original`` -- a byte-for-byte copy of the uploaded file,
    * ``unicode``  -- the text re-encoded as pure ASCII, with every
      non-ASCII character replaced by an XML character reference.

    Afterwards the queued file and its directory are deleted from the
    queue store, ``entry.queued_media_file`` is cleared and the entry is
    saved.

    A Workbench() represents a local temporary dir. It is automatically
    cleaned up when this function exits.

    :param proc_state: processing state object; this function reads its
        ``entry`` and ``workbench`` attributes.
    """
    entry = proc_state.entry
    workbench = proc_state.workbench
    ascii_config = mgg.global_config['media_type:mediagoblin.media_types.ascii']
    # Conversions subdirectory to avoid collisions
    conversions_subdir = os.path.join(
        workbench.dir, 'conversions')
    os.mkdir(conversions_subdir)

    queued_filepath = entry.queued_media_file
    queued_filename = workbench.localized_file(
        mgg.queue_store, queued_filepath,
        'source')

    # Read the upload once and reuse the bytes for every step below; this
    # avoids re-reading/rewinding the file and lets us close it right away.
    with open(queued_filename, 'rb') as queued_file:
        queued_data = queued_file.read()

    queued_file_charset = chardet.detect(queued_data)

    # Only select a non-utf-8 charset if chardet is *really* sure;
    # below 0.9 confidence the data is interpreted as UTF-8 instead.
    # (This threshold was tested with "Feli\x0109an superjaron".)
    if queued_file_charset['confidence'] < 0.9:
        interpreted_charset = 'utf-8'
    else:
        interpreted_charset = queued_file_charset['encoding']

    _log.info('Charset detected: {0}\nWill interpret as: {1}'.format(
            queued_file_charset,
            interpreted_charset))

    thumb_filepath = create_pub_filepath(
        entry, 'thumbnail.png')

    tmp_thumb_filename = os.path.join(
        conversions_subdir, thumb_filepath[-1])

    ascii_converter_args = {}

    # Only override the converter's font when one is configured.
    if ascii_config['thumbnail_font']:
        ascii_converter_args['font'] = ascii_config['thumbnail_font']

    converter = asciitoimage.AsciiToImage(
        **ascii_converter_args)

    thumb = converter._create_image(queued_data)

    # PNG data is binary, so the file must be opened in 'wb' mode;
    # text mode would corrupt it on platforms that translate newlines.
    with open(tmp_thumb_filename, 'wb') as thumb_file:
        thumb.thumbnail(
            (mgg.global_config['media:thumb']['max_width'],
             mgg.global_config['media:thumb']['max_height']),
            Image.ANTIALIAS)
        thumb.save(thumb_file)

    _log.debug('Copying local file to public storage')
    mgg.public_store.copy_local_to_storage(
        tmp_thumb_filename, thumb_filepath)

    # Publish the untouched original under its queued file name.
    original_filepath = create_pub_filepath(entry, queued_filepath[-1])

    with mgg.public_store.get_file(original_filepath, 'wb') \
            as original_file:
        original_file.write(queued_data)

    unicode_filepath = create_pub_filepath(entry, 'ascii-portable.txt')

    with mgg.public_store.get_file(unicode_filepath, 'wb') \
            as unicode_file:
        # Decode the original file from its detected charset (or UTF-8),
        # then encode to ASCII, replacing every non-ASCII character with
        # an XML/HTML numeric character reference (&#NNNN;).
        unicode_file.write(
            queued_data.decode(interpreted_charset).encode(
                'ascii',
                'xmlcharrefreplace'))

    # Remove queued media file from storage and database.
    # queued_filepath is in the task_id directory which should
    # be removed too, but fail if the directory is not empty to be on
    # the super-safe side.
    mgg.queue_store.delete_file(queued_filepath)      # rm file
    mgg.queue_store.delete_dir(queued_filepath[:-1])  # rm dir
    entry.queued_media_file = []

    media_files_dict = entry.setdefault('media_files', {})
    media_files_dict['thumb'] = thumb_filepath
    media_files_dict['unicode'] = unicode_filepath
    media_files_dict['original'] = original_filepath

    entry.save()
Commit	Line	Data
a246ccca	1	# GNU MediaGoblin -- federated, autonomous media hosting
cf29e8a8	2	# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
a246ccca JW	3	#
	4	# This program is free software: you can redistribute it and/or modify
	5	# it under the terms of the GNU Affero General Public License as published by
	6	# the Free Software Foundation, either version 3 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU Affero General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU Affero General Public License
	15	# along with this program. If not, see <http://www.gnu.org/licenses/>.
a246ccca JW	16	import chardet
a246ccca JW	17	import os
d0e9f843 AL	18	try:
	19	from PIL import Image
	20	except ImportError:
	21	import Image
010d28b4	22	import logging
a246ccca JW	23
a246ccca JW	24	from mediagoblin import mg_globals as mgg
c56d4b55	25	from mediagoblin.processing import create_pub_filepath
c2dfe1dd	26	from mediagoblin.media_types.ascii import asciitoimage
a246ccca	27
010d28b4	28	_log = logging.getLogger(__name__)
a246ccca	29
10085b77	30	SUPPORTED_EXTENSIONS = ['txt', 'asc', 'nfo']
22930812	31	MEDIA_TYPE = 'mediagoblin.media_types.ascii'
10085b77	32
c56d4b55	33
ec4261a4	34	def sniff_handler(media_file, **kw):
22930812	35	_log.info('Sniffing {0}'.format(MEDIA_TYPE))
9743ce88	36	if kw.get('media') is not None:
10085b77 JW	37	name, ext = os.path.splitext(kw['media'].filename)
	38	clean_ext = ext[1:].lower()
	39
	40	if clean_ext in SUPPORTED_EXTENSIONS:
22930812	41	return MEDIA_TYPE
10085b77	42
22930812	43	return None
ec4261a4	44
c56d4b55	45
fb46fa66	46	def process_ascii(proc_state):
45ab3e07 SS	47	"""Code to process a txt file. Will be run by celery.
	48
	49	A Workbench() represents a local tempory dir. It is automatically
d0e9f843	50	cleaned up when this function exits.
45ab3e07	51	"""
fb46fa66 E	52	entry = proc_state.entry
fb46fa66 E	53	workbench = proc_state.workbench
196a5181	54	ascii_config = mgg.global_config['media_type:mediagoblin.media_types.ascii']
a246ccca JW	55	# Conversions subdirectory to avoid collisions
	56	conversions_subdir = os.path.join(
	57	workbench.dir, 'conversions')
	58	os.mkdir(conversions_subdir)
	59
6b45ec1b	60	queued_filepath = entry.queued_media_file
a246ccca JW	61	queued_filename = workbench.localized_file(
	62	mgg.queue_store, queued_filepath,
	63	'source')
	64
	65	queued_file = file(queued_filename, 'rb')
	66
	67	with queued_file:
	68	queued_file_charset = chardet.detect(queued_file.read())
	69
010d28b4 JW	70	# Only select a non-utf-8 charset if chardet is really sure
	71	# Tested with "Feli\x0109an superjaron", which was detecte
	72	if queued_file_charset['confidence'] < 0.9:
	73	interpreted_charset = 'utf-8'
	74	else:
	75	interpreted_charset = queued_file_charset['encoding']
	76
	77	_log.info('Charset detected: {0}\nWill interpret as: {1}'.format(
	78	queued_file_charset,
	79	interpreted_charset))
	80
a246ccca JW	81	queued_file.seek(0) # Rewind the queued file
	82
	83	thumb_filepath = create_pub_filepath(
	84	entry, 'thumbnail.png')
	85
	86	tmp_thumb_filename = os.path.join(
	87	conversions_subdir, thumb_filepath[-1])
	88
196a5181 JW	89	ascii_converter_args = {}
	90
	91	if ascii_config['thumbnail_font']:
	92	ascii_converter_args.update(
	93	{'font': ascii_config['thumbnail_font']})
	94
	95	converter = asciitoimage.AsciiToImage(
	96	**ascii_converter_args)
a246ccca JW	97
	98	thumb = converter._create_image(
	99	queued_file.read())
	100
	101	with file(tmp_thumb_filename, 'w') as thumb_file:
c56d4b55 JW	102	thumb.thumbnail(
	103	(mgg.global_config['media:thumb']['max_width'],
	104	mgg.global_config['media:thumb']['max_height']),
	105	Image.ANTIALIAS)
a246ccca JW	106	thumb.save(thumb_file)
a246ccca JW	107
64da09e8	108	_log.debug('Copying local file to public storage')
a246ccca JW	109	mgg.public_store.copy_local_to_storage(
	110	tmp_thumb_filename, thumb_filepath)
	111
	112	queued_file.seek(0)
	113
	114	original_filepath = create_pub_filepath(entry, queued_filepath[-1])
	115
	116	with mgg.public_store.get_file(original_filepath, 'wb') \
	117	as original_file:
	118	original_file.write(queued_file.read())
	119
a246ccca JW	120	queued_file.seek(0) # Rewind again
a246ccca JW	121
010d28b4	122	unicode_filepath = create_pub_filepath(entry, 'ascii-portable.txt')
a246ccca JW	123
	124	with mgg.public_store.get_file(unicode_filepath, 'wb') \
	125	as unicode_file:
010d28b4 JW	126	# Decode the original file from its detected charset (or UTF8)
	127	# Encode the unicode instance to ASCII and replace any non-ASCII
	128	# with an HTML entity (&#
a246ccca	129	unicode_file.write(
010d28b4 JW	130	unicode(queued_file.read().decode(
010d28b4 JW	131	interpreted_charset)).encode(
a246ccca JW	132	'ascii',
	133	'xmlcharrefreplace'))
	134
36ae6bcb SS	135	# Remove queued media file from storage and database.
	136	# queued_filepath is in the task_id directory which should
	137	# be removed too, but fail if the directory is not empty to be on
	138	# the super-safe side.
	139	mgg.queue_store.delete_file(queued_filepath) # rm file
	140	mgg.queue_store.delete_dir(queued_filepath[:-1]) # rm dir
6b45ec1b	141	entry.queued_media_file = []
36ae6bcb	142
a246ccca JW	143	media_files_dict = entry.setdefault('media_files', {})
	144	media_files_dict['thumb'] = thumb_filepath
	145	media_files_dict['unicode'] = unicode_filepath
	146	media_files_dict['original'] = original_filepath
	147
	148	entry.save()