[mediagoblin.git] / mediagoblin / media_types / pdf / processing.py

# GNU MediaGoblin -- federated, autonomous media hosting
# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import argparse
import logging
import os
from subprocess import PIPE, Popen

import dateutil.parser

from mediagoblin import mg_globals as mgg
from mediagoblin.processing import (create_pub_filepath,
                                    FilenameBuilder, BadMediaFail,
                                    MediaProcessor, ProcessingManager,
                                    request_from_args, get_orig_filename,
                                    store_public, copy_original)
from mediagoblin.tools.translate import fake_ugettext_passthrough as _

_log = logging.getLogger(__name__)

MEDIA_TYPE = 'mediagoblin.media_types.pdf'

# TODO - cache (memoize) util

# Format identifiers accepted in addition to 'pdf' when unoconv is usable.
# The list was created via `unoconv --show`, hand-removing the types that are
# already supported better by other media types; the removed entries are kept
# below as commented-out lines so the full unoconv listing stays visible.
unoconv_supported = [
  'bib', #      - BibTeX [.bib]
  #bmp      - Windows Bitmap [.bmp]
  'csv', #      - Text CSV [.csv]
  'dbf', #      - dBASE [.dbf]
  'dif', #      - Data Interchange Format [.dif]
  'doc6', #     - Microsoft Word 6.0 [.doc]
  'doc95', #    - Microsoft Word 95 [.doc]
  'docbook', #  - DocBook [.xml]
  'doc', #      - Microsoft Word 97/2000/XP [.doc]
  'docx7', #    - Microsoft Office Open XML [.docx]
  'docx', #     - Microsoft Office Open XML [.docx]
  #emf      - Enhanced Metafile [.emf]
  'eps', #      - Encapsulated PostScript [.eps]
  'fodp', #     - OpenDocument Presentation (Flat XML) [.fodp]
  'fods', #     - OpenDocument Spreadsheet (Flat XML) [.fods]
  'fodt', #     - OpenDocument Text (Flat XML) [.fodt]
  #gif      - Graphics Interchange Format [.gif]
  'html', #     - HTML Document (OpenOffice.org Writer) [.html]
  #jpg      - Joint Photographic Experts Group [.jpg]
  'latex', #    - LaTeX 2e [.ltx]
  'mediawiki', # - MediaWiki [.txt]
  'met', #      - OS/2 Metafile [.met]
  'odd', #      - OpenDocument Drawing [.odd]
  'odg', #      - ODF Drawing (Impress) [.odg]
  'odp', #      - ODF Presentation [.odp]
  'ods', #      - ODF Spreadsheet [.ods]
  'odt', #      - ODF Text Document [.odt]
  'ooxml', #    - Microsoft Office Open XML [.xml]
  'otg', #      - OpenDocument Drawing Template [.otg]
  'otp', #      - ODF Presentation Template [.otp]
  'ots', #      - ODF Spreadsheet Template [.ots]
  'ott', #      - Open Document Text [.ott]
  #pbm      - Portable Bitmap [.pbm]
  #pct      - Mac Pict [.pct]
  'pdb', #      - AportisDoc (Palm) [.pdb]
  #pdf      - Portable Document Format [.pdf]
  #pgm      - Portable Graymap [.pgm]
  #png      - Portable Network Graphic [.png]
  'pot', #      - Microsoft PowerPoint 97/2000/XP Template [.pot]
  'potm', #     - Microsoft PowerPoint 2007/2010 XML Template [.potm]
  #ppm      - Portable Pixelmap [.ppm]
  'pps', #      - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
  'ppt', #      - Microsoft PowerPoint 97/2000/XP [.ppt]
  'pptx', #     - Microsoft PowerPoint 2007/2010 XML [.pptx]
  'psw', #      - Pocket Word [.psw]
  'pwp', #      - PlaceWare [.pwp]
  'pxl', #      - Pocket Excel [.pxl]
  #ras      - Sun Raster Image [.ras]
  'rtf', #      - Rich Text Format [.rtf]
  'sda', #      - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
  'sdc3', #     - StarCalc 3.0 [.sdc]
  'sdc4', #     - StarCalc 4.0 [.sdc]
  'sdc', #      - StarCalc 5.0 [.sdc]
  'sdd3', #     - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
  'sdd4', #     - StarImpress 4.0 [.sdd]
  'sdd', #      - StarImpress 5.0 [.sdd]
  'sdw3', #     - StarWriter 3.0 [.sdw]
  'sdw4', #     - StarWriter 4.0 [.sdw]
  'sdw', #      - StarWriter 5.0 [.sdw]
  'slk', #      - SYLK [.slk]
  'stc', #      - OpenOffice.org 1.0 Spreadsheet Template [.stc]
  'std', #      - OpenOffice.org 1.0 Drawing Template [.std]
  'sti', #      - OpenOffice.org 1.0 Presentation Template [.sti]
  'stw', #      - Open Office.org 1.0 Text Document Template [.stw]
  #svg      - Scalable Vector Graphics [.svg]
  'svm', #      - StarView Metafile [.svm]
  'swf', #      - Macromedia Flash (SWF) [.swf]
  'sxc', #      - OpenOffice.org 1.0 Spreadsheet [.sxc]
  'sxd3', #     - StarDraw 3.0 [.sxd]
  'sxd5', #     - StarDraw 5.0 [.sxd]
  'sxd', #      - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
  'sxi', #      - OpenOffice.org 1.0 Presentation [.sxi]
  'sxw', #      - Open Office.org 1.0 Text Document [.sxw]
  #text     - Text Encoded [.txt]
  #tiff     - Tagged Image File Format [.tiff]
  #txt      - Text [.txt]
  'uop', #      - Unified Office Format presentation [.uop]
  'uos', #      - Unified Office Format spreadsheet [.uos]
  'uot', #      - Unified Office Format text [.uot]
  'vor3', #     - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
  'vor4', #     - StarWriter 4.0 Template [.vor]
  'vor5', #     - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
  'vor', #      - StarCalc 5.0 Template [.vor]
  #wmf      - Windows Metafile [.wmf]
  'xhtml', #    - XHTML Document [.html]
  'xls5', #     - Microsoft Excel 5.0 [.xls]
  'xls95', #    - Microsoft Excel 95 [.xls]
  'xls', #      - Microsoft Excel 97/2000/XP [.xls]
  'xlt5', #     - Microsoft Excel 5.0 Template [.xlt]
  'xlt95', #    - Microsoft Excel 95 Template [.xlt]
  'xlt', #      - Microsoft Excel 97/2000/XP Template [.xlt]
  #xpm      - X PixMap [.xpm]
]

def is_unoconv_working():
    """
    Report whether the ``unoconv`` converter can be found and started.

    ``unoconv --show`` is executed and whatever it writes to stderr is
    inspected: the tool is treated as broken when it cannot be located on
    PATH, cannot be launched, or prints a message containing ``ERROR``.

    Returns:
        True when unoconv looks usable, False otherwise.
    """
    # TODO: must have libreoffice-headless installed too, need to check for it
    unoconv = where('unoconv')
    if not unoconv:
        return False
    try:
        proc = Popen([unoconv, '--show'], stderr=PIPE)
        # communicate() drains stderr *and* waits for the child, so the
        # process is reaped instead of lingering as a zombie.
        output = proc.communicate()[1]
    except OSError:
        _log.warning(_('unoconv failing to run, check log file'))
        return False
    # The pipe yields a byte string (plain ``str`` on Python 2), so compare
    # against a bytes literal.
    return b'ERROR' not in output

def supported_extensions(cache=[None]):
    """
    Return the list of file extensions this media type accepts.

    The mutable default argument is intentional: it is the memo that keeps
    the result for the lifetime of the process, so the unoconv probe runs at
    most once. While the first slot is still ``None`` the list is filled with
    ``'pdf'`` and, when unoconv is usable, every entry of
    ``unoconv_supported``.

    Args:
        cache: list used as the memo; callers normally omit it. A list whose
            first element is not ``None`` is returned unchanged.

    Returns:
        The (shared) list of extensions, without leading dots.
    """
    if cache[0] is None:
        cache[0] = 'pdf'
        if is_unoconv_working():
            cache.extend(unoconv_supported)
    return cache

def where(name):
    """
    Locate an executable called ``name`` on the PATH.

    Each PATH entry is tried in order and the first candidate that is a
    regular file with execute permission wins; directories and
    non-executable files that merely share the name are skipped. When the
    PATH variable is unset, ``os.defpath`` is searched instead of raising
    ``KeyError``.

    Args:
        name: file name of the program to look for.

    Returns:
        The joined path of the first match, or None if there is none.
    """
    search_path = os.environ.get('PATH', os.defpath)
    for directory in search_path.split(os.pathsep):
        candidate = os.path.join(directory, name)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None

def check_prerequisites():
    """
    Check that the external tools needed for pdf handling are on the PATH.

    Both ``pdfinfo`` and ``pdftocairo`` are looked up with ``where``. The
    first one that cannot be found is logged as a warning (naming the tool
    that is actually missing) and the check stops there.

    Returns:
        True when both tools were found, False otherwise.
    """
    for tool in ('pdfinfo', 'pdftocairo'):
        if not where(tool):
            _log.warn('missing {0}'.format(tool))
            return False
    return True

def sniff_handler(media_file, **kw):
    """
    Decide whether an upload should be handled by the pdf media type.

    The decision is made purely from the extension of ``kw['media'].filename``
    (lower-cased, without the dot); ``media_file`` itself is not inspected.

    Returns:
        ``MEDIA_TYPE`` when the prerequisites are present and the extension
        is one of ``supported_extensions()``, otherwise None.
    """
    _log.info('Sniffing {0}'.format(MEDIA_TYPE))

    # Without the external tools nothing can be processed, so never claim.
    if not check_prerequisites():
        return None

    media = kw.get('media')
    if media is None:
        return None

    extension = os.path.splitext(media.filename)[1][1:].lower()
    return MEDIA_TYPE if extension in supported_extensions() else None

def create_pdf_thumb(original, thumb_filename, width, height):
    """
    Render a PNG image of ``original`` by running ``pdftocairo``.

    ``thumb_filename`` is expected to end in a four-character suffix such as
    '.png': pdftocairo appends '.png' to the output name on its own, so the
    last four characters are dropped before the name is handed over. The
    image is scaled using the smaller of ``width`` and ``height``. The exit
    status of the tool is not checked.
    """
    output_root = thumb_filename[:-4]
    pdftocairo = where('pdftocairo')
    scale = str(min(width, height))
    command = [pdftocairo, '-scale-to', scale,
               '-singlefile', '-png', original, output_root]
    _log.debug('calling {0}'.format(repr(' '.join(command))))
    proc = Popen(executable=pdftocairo, args=command)
    proc.wait()

def pdf_info(original):
    """
    Extract dictionary of pdf information. This could use a library instead
    of a process.

    ``pdfinfo`` is run on ``original`` and its ``Key: value`` output lines are
    turned into a dictionary with these keys:

    - ``pdf_mod_date`` / ``pdf_creation_date``: parsed dates, only present
      when pdfinfo reports them and they can be parsed
    - ``pdf_pages``: int, only present when pdfinfo reports it
    - ``pdf_page_size_width`` / ``pdf_page_size_height``: floats
    - ``pdf_keywords``, ``pdf_creator``, ``pdf_producer``, ``pdf_author``,
      ``pdf_title``: strings, or None when absent
    - ``pdf_version_major`` / ``pdf_version_minor``: ints

    Note: the mandatory fields ('Page size', 'PDF version') are assumed to be
    present and well formed; if they are not, a KeyError/ValueError/IndexError
    propagates to the caller.

    Raises:
        BadMediaFail: when the pdfinfo process cannot be started.
    """
    pdfinfo = where('pdfinfo')
    try:
        proc = Popen(executable=pdfinfo,
                     args=[pdfinfo, original], stdout=PIPE)
        lines = proc.stdout.readlines()
        # Reap the child once its output has been consumed.
        proc.wait()
    except OSError:
        _log.debug('pdfinfo could not read the pdf file.')
        raise BadMediaFail()

    # "Key:   value" -> {'Key': 'value'}; split on the first colon only, the
    # values (dates in particular) contain colons themselves.
    info_dict = dict(
        [part.strip() for part in line.split(':', 1)]
        for line in lines if ':' in line)

    ret_dict = {}

    # Each pair must be unpacked: testing the whole (db_key, pdfinfo_key)
    # tuple for membership in info_dict can never succeed.
    for db_key, date_key in [('pdf_mod_date', 'ModDate'),
                             ('pdf_creation_date', 'CreationDate')]:
        if date_key in info_dict:
            try:
                ret_dict[db_key] = dateutil.parser.parse(info_dict[date_key])
            except (ValueError, OverflowError):
                # Dates are optional metadata; an unparseable one should not
                # make the whole document fail.
                _log.debug('could not parse pdfinfo field {0}'.format(
                    date_key))
    for db_key, int_key in [('pdf_pages', 'Pages')]:
        if int_key in info_dict:
            ret_dict[db_key] = int(info_dict[int_key])

    # parse 'Page size' field: 595 x 842 pts (A4)
    page_size_parts = info_dict['Page size'].split()
    ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
    ret_dict['pdf_page_size_height'] = float(page_size_parts[2])

    for db_key, str_key in [('pdf_keywords', 'Keywords'),
                            ('pdf_creator', 'Creator'),
                            ('pdf_producer', 'Producer'),
                            ('pdf_author', 'Author'),
                            ('pdf_title', 'Title')]:
        ret_dict[db_key] = info_dict.get(str_key, None)

    # 'PDF version' looks like "1.4"
    ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
        map(int, info_dict['PDF version'].split('.'))

    return ret_dict


class CommonPdfProcessor(MediaProcessor):
    """
    Provides a base for various pdf processing steps

    Attributes set by ``common_setup``:
      orig_filename -- local path of the submitted file
      name_builder  -- FilenameBuilder for deriving output names
      pdf_filename  -- path of the pdf to work on (the original itself, or
                       the file unoconv is expected to produce from it)
    """
    def common_setup(self):
        """
        Set up common pdf processing steps
        """
        # Pull down and set up the original file
        self.orig_filename = get_orig_filename(
            self.entry, self.workbench)
        self.name_builder = FilenameBuilder(self.orig_filename)

        self._set_pdf_filename()

    def _is_pdf(self):
        """Tell whether the original already is a pdf, judging by extension."""
        # Tolerate the extension being stored with or without its leading
        # dot, and in any letter case.
        return self.name_builder.ext.lstrip('.').lower() == 'pdf'

    def _set_pdf_filename(self):
        """Work out which file the pdf-based steps should read."""
        if self._is_pdf():
            self.pdf_filename = self.orig_filename
        else:
            # unoconv is invoked without an output option, so it writes the
            # converted file next to its input, with the extension swapped.
            self.pdf_filename = (
                os.path.splitext(self.orig_filename)[0] + '.pdf')

    def _render_png(self, size, output_base):
        """
        Render the first page of the pdf as a PNG using pdftocairo.

        ``size`` is a (max_width, max_height) pair; the smaller value is
        passed to ``-scale-to`` so the result fits both limits. Returns the
        name of the file pdftocairo writes (``output_base`` + '.png').
        """
        executable = where('pdftocairo')
        args = [executable, '-scale-to', str(min(size)),
                '-singlefile', '-png', self.pdf_filename, output_base]

        _log.debug('calling {0}'.format(repr(' '.join(args))))
        Popen(executable=executable, args=args).wait()

        # pdftocairo adds '.png' itself, so the real file carries it
        return output_base + '.png'

    def copy_original(self):
        """Keep a copy of the submitted file under its original name."""
        copy_original(
            self.entry, self.orig_filename,
            self.name_builder.fill('{basename}{ext}'))

    def generate_thumb(self, thumb_size=None):
        """
        Create and store the thumbnail image.

        ``thumb_size`` is a (max_width, max_height) pair; when omitted the
        'media:thumb' values of the global config are used.
        """
        if not thumb_size:
            thumb_size = (mgg.global_config['media:thumb']['max_width'],
                          mgg.global_config['media:thumb']['max_height'])

        # Note: pdftocairo adds '.png', so don't include an ext
        thumb_filename = self._render_png(
            thumb_size, self.name_builder.fill('{basename}.thumbnail'))

        store_public(self.entry, 'thumb', thumb_filename,
                     self.name_builder.fill('{basename}.thumbnail.png'))

    def generate_pdf(self):
        """
        Store the pdf. If the file is not a pdf, make it a pdf

        Raises BadMediaFail when unoconv did not produce the expected file.
        """
        if not self._is_pdf():
            unoconv = where('unoconv')
            Popen(executable=unoconv,
                  args=[unoconv, '-v', '-f', 'pdf', self.orig_filename]).wait()

            if not os.path.exists(self.pdf_filename):
                _log.debug('unoconv failed to convert file to pdf')
                raise BadMediaFail()

        store_public(self.entry, 'pdf', self.pdf_filename,
                     self.name_builder.fill('{basename}.pdf'))

    def extract_pdf_info(self):
        """Read the pdf metadata and record it on the entry."""
        pdf_info_dict = pdf_info(self.pdf_filename)
        self.entry.media_data_init(**pdf_info_dict)

    def generate_medium(self, size=None):
        """
        Create and store the medium-sized image.

        ``size`` is a (max_width, max_height) pair; when omitted the
        'media:medium' values of the global config are used.
        """
        if not size:
            size = (mgg.global_config['media:medium']['max_width'],
                    mgg.global_config['media:medium']['max_height'])

        # Note: pdftocairo adds '.png', so don't include an ext
        filename = self._render_png(
            size, self.name_builder.fill('{basename}.medium'))

        # Stored under its own key so generate_thumb does not overwrite it
        store_public(self.entry, 'medium', filename,
                     self.name_builder.fill('{basename}.medium.png'))

class InitialProcessor(CommonPdfProcessor):
    """
    First processing pass, run on newly submitted pdfs
    """
    name = "initial"
    description = "Initial processing"

    @classmethod
    def media_is_eligible(cls, entry=None, state=None):
        """
        Tell whether this processor may run.

        An explicitly given (truthy) ``state`` wins; otherwise the state of
        ``entry`` is consulted. Only "unprocessed" and "failed" qualify.
        """
        current_state = state or entry.state
        return current_state in ("unprocessed", "failed")

    @classmethod
    def generate_parser(cls):
        """
        Build the command line parser: ``--size`` and ``--thumb-size``, each
        taking two integers (max_width, max_height).
        """
        parser = argparse.ArgumentParser(
            prog=cls.name,
            description=cls.description)

        dimensions = ('max_width', 'max_height')
        for option in ('--size', '--thumb-size'):
            parser.add_argument(
                option,
                nargs=2,
                metavar=dimensions,
                type=int)

        return parser

    @classmethod
    def args_to_request(cls, args):
        """Turn parsed arguments into a request for 'size' and 'thumb_size'."""
        wanted = ['size', 'thumb_size']
        return request_from_args(args, wanted)

    def process(self, size=None, thumb_size=None):
        """
        Run every step of the initial processing, in order.
        """
        self.common_setup()

        # The pdf has to exist before anything can be derived from it
        self.generate_pdf()
        self.extract_pdf_info()
        self.copy_original()

        self.generate_medium(size=size)
        self.generate_thumb(thumb_size=thumb_size)

        self.delete_queue_file()


class PdfProcessingManager(ProcessingManager):
    """
    Processing manager for the pdf media type; registers InitialProcessor.
    """
    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) would call
        # this very __init__ again (infinite recursion) as soon as the
        # manager is subclassed.
        super(PdfProcessingManager, self).__init__()
        self.add_processor(InitialProcessor)
Commit	Line	Data
a80ebf3b AL	1	# GNU MediaGoblin -- federated, autonomous media hosting
	2	# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
	3	#
	4	# This program is free software: you can redistribute it and/or modify
	5	# it under the terms of the GNU Affero General Public License as published by
	6	# the Free Software Foundation, either version 3 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU Affero General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU Affero General Public License
	15	# along with this program. If not, see <http://www.gnu.org/licenses/>.
a80ebf3b	16	import os
a80ebf3b AL	17	import logging
a80ebf3b AL	18	import dateutil.parser
519bcfb0	19	from subprocess import PIPE, Popen
a80ebf3b AL	20
	21	from mediagoblin import mg_globals as mgg
	22	from mediagoblin.processing import (create_pub_filepath,
	23	FilenameBuilder, BadMediaFail)
	24	from mediagoblin.tools.translate import fake_ugettext_passthrough as _
	25
	26	_log = logging.getLogger(__name__)
	27
51e4e435 RE	28	MEDIA_TYPE = 'mediagoblin.media_types.pdf'
51e4e435 RE	29
a80ebf3b AL	30	# TODO - cache (memoize) util
	31
	32	# This is a list created via uniconv --show and hand removing some types that
	33	# we already support via other media types better.
	34	unoconv_supported = [
	35	'bib', # - BibTeX [.bib]
	36	#bmp - Windows Bitmap [.bmp]
	37	'csv', # - Text CSV [.csv]
	38	'dbf', # - dBASE [.dbf]
	39	'dif', # - Data Interchange Format [.dif]
	40	'doc6', # - Microsoft Word 6.0 [.doc]
	41	'doc95', # - Microsoft Word 95 [.doc]
	42	'docbook', # - DocBook [.xml]
	43	'doc', # - Microsoft Word 97/2000/XP [.doc]
	44	'docx7', # - Microsoft Office Open XML [.docx]
	45	'docx', # - Microsoft Office Open XML [.docx]
	46	#emf - Enhanced Metafile [.emf]
	47	'eps', # - Encapsulated PostScript [.eps]
	48	'fodp', # - OpenDocument Presentation (Flat XML) [.fodp]
	49	'fods', # - OpenDocument Spreadsheet (Flat XML) [.fods]
	50	'fodt', # - OpenDocument Text (Flat XML) [.fodt]
	51	#gif - Graphics Interchange Format [.gif]
	52	'html', # - HTML Document (OpenOffice.org Writer) [.html]
	53	#jpg - Joint Photographic Experts Group [.jpg]
	54	'latex', # - LaTeX 2e [.ltx]
	55	'mediawiki', # - MediaWiki [.txt]
	56	'met', # - OS/2 Metafile [.met]
	57	'odd', # - OpenDocument Drawing [.odd]
	58	'odg', # - ODF Drawing (Impress) [.odg]
	59	'odp', # - ODF Presentation [.odp]
	60	'ods', # - ODF Spreadsheet [.ods]
	61	'odt', # - ODF Text Document [.odt]
	62	'ooxml', # - Microsoft Office Open XML [.xml]
	63	'otg', # - OpenDocument Drawing Template [.otg]
	64	'otp', # - ODF Presentation Template [.otp]
	65	'ots', # - ODF Spreadsheet Template [.ots]
	66	'ott', # - Open Document Text [.ott]
	67	#pbm - Portable Bitmap [.pbm]
	68	#pct - Mac Pict [.pct]
	69	'pdb', # - AportisDoc (Palm) [.pdb]
	70	#pdf - Portable Document Format [.pdf]
	71	#pgm - Portable Graymap [.pgm]
	72	#png - Portable Network Graphic [.png]
	73	'pot', # - Microsoft PowerPoint 97/2000/XP Template [.pot]
	74	'potm', # - Microsoft PowerPoint 2007/2010 XML Template [.potm]
	75	#ppm - Portable Pixelmap [.ppm]
	76	'pps', # - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
	77	'ppt', # - Microsoft PowerPoint 97/2000/XP [.ppt]
	78	'pptx', # - Microsoft PowerPoint 2007/2010 XML [.pptx]
	79	'psw', # - Pocket Word [.psw]
	80	'pwp', # - PlaceWare [.pwp]
	81	'pxl', # - Pocket Excel [.pxl]
	82	#ras - Sun Raster Image [.ras]
	83	'rtf', # - Rich Text Format [.rtf]
	84	'sda', # - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
	85	'sdc3', # - StarCalc 3.0 [.sdc]
	86	'sdc4', # - StarCalc 4.0 [.sdc]
	87	'sdc', # - StarCalc 5.0 [.sdc]
	88	'sdd3', # - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
	89	'sdd4', # - StarImpress 4.0 [.sdd]
	90	'sdd', # - StarImpress 5.0 [.sdd]
	91	'sdw3', # - StarWriter 3.0 [.sdw]
	92	'sdw4', # - StarWriter 4.0 [.sdw]
	93	'sdw', # - StarWriter 5.0 [.sdw]
94	'slk', # - SYLK [.slk]
95	'stc', # - OpenOffice.org 1.0 Spreadsheet Template [.stc]
96	'std', # - OpenOffice.org 1.0 Drawing Template [.std]
97	'sti', # - OpenOffice.org 1.0 Presentation Template [.sti]
98	'stw', # - Open Office.org 1.0 Text Document Template [.stw]
99	#svg - Scalable Vector Graphics [.svg]
100	'svm', # - StarView Metafile [.svm]
101	'swf', # - Macromedia Flash (SWF) [.swf]
102	'sxc', # - OpenOffice.org 1.0 Spreadsheet [.sxc]
103	'sxd3', # - StarDraw 3.0 [.sxd]
104	'sxd5', # - StarDraw 5.0 [.sxd]
105	'sxd', # - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
106	'sxi', # - OpenOffice.org 1.0 Presentation [.sxi]
107	'sxw', # - Open Office.org 1.0 Text Document [.sxw]
108	#text - Text Encoded [.txt]
109	#tiff - Tagged Image File Format [.tiff]
110	#txt - Text [.txt]
111	'uop', # - Unified Office Format presentation [.uop]
112	'uos', # - Unified Office Format spreadsheet [.uos]
113	'uot', # - Unified Office Format text [.uot]
114	'vor3', # - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
115	'vor4', # - StarWriter 4.0 Template [.vor]
116	'vor5', # - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
117	'vor', # - StarCalc 5.0 Template [.vor]
118	#wmf - Windows Metafile [.wmf]
119	'xhtml', # - XHTML Document [.html]
120	'xls5', # - Microsoft Excel 5.0 [.xls]
121	'xls95', # - Microsoft Excel 95 [.xls]
122	'xls', # - Microsoft Excel 97/2000/XP [.xls]
123	'xlt5', # - Microsoft Excel 5.0 Template [.xlt]
124	'xlt95', # - Microsoft Excel 95 Template [.xlt]
125	'xlt', # - Microsoft Excel 97/2000/XP Template [.xlt]
126	#xpm - X PixMap [.xpm]
127	]
128
129	def is_unoconv_working():
519bcfb0 AL	130	# TODO: must have libreoffice-headless installed too, need to check for it
	131	unoconv = where('unoconv')
	132	if not unoconv:
	133	return False
a80ebf3b	134	try:
519bcfb0 AL	135	proc = Popen([unoconv, '--show'], stderr=PIPE)
	136	output = proc.stderr.read()
	137	except OSError, e:
a80ebf3b AL	138	_log.warn(_('unoconv failing to run, check log file'))
	139	return False
	140	if 'ERROR' in output:
	141	return False
	142	return True
	143
	144	def supported_extensions(cache=[None]):
	145	if cache[0] == None:
	146	cache[0] = 'pdf'
519bcfb0	147	if is_unoconv_working():
a80ebf3b AL	148	cache.extend(unoconv_supported)
	149	return cache
	150
	151	def where(name):
	152	for p in os.environ['PATH'].split(os.pathsep):
	153	fullpath = os.path.join(p, name)
	154	if os.path.exists(fullpath):
	155	return fullpath
	156	return None
	157
	158	def check_prerequisites():
	159	if not where('pdfinfo'):
	160	_log.warn('missing pdfinfo')
	161	return False
	162	if not where('pdftocairo'):
	163	_log.warn('missing pdfcairo')
	164	return False
	165	return True
	166
	167	def sniff_handler(media_file, **kw):
51e4e435	168	_log.info('Sniffing {0}'.format(MEDIA_TYPE))
a80ebf3b	169	if not check_prerequisites():
51e4e435	170	return None
a80ebf3b AL	171	if kw.get('media') is not None:
	172	name, ext = os.path.splitext(kw['media'].filename)
	173	clean_ext = ext[1:].lower()
	174
	175	if clean_ext in supported_extensions():
51e4e435	176	return MEDIA_TYPE
a80ebf3b	177
51e4e435	178	return None
a80ebf3b AL	179
	180	def create_pdf_thumb(original, thumb_filename, width, height):
	181	# Note: pdftocairo adds '.png', remove it
	182	thumb_filename = thumb_filename[:-4]
	183	executable = where('pdftocairo')
	184	args = [executable, '-scale-to', str(min(width, height)),
	185	'-singlefile', '-png', original, thumb_filename]
	186	_log.debug('calling {0}'.format(repr(' '.join(args))))
519bcfb0	187	Popen(executable=executable, args=args).wait()
a80ebf3b AL	188
	189	def pdf_info(original):
	190	"""
	191	Extract dictionary of pdf information. This could use a library instead
	192	of a process.
	193
	194	Note: I'm assuming pdfinfo output is sanitized (integers where integers are
	195	expected, etc.) - if this is wrong then an exception will be raised and caught
	196	leading to the dreaded error page. It seems a safe assumption.
	197	"""
	198	ret_dict = {}
	199	pdfinfo = where('pdfinfo')
	200	try:
519bcfb0 AL	201	proc = Popen(executable=pdfinfo,
	202	args=[pdfinfo, original], stdout=PIPE)
	203	lines = proc.stdout.readlines()
	204	except OSError:
a80ebf3b AL	205	_log.debug('pdfinfo could not read the pdf file.')
	206	raise BadMediaFail()
	207
	208	info_dict = dict([[part.strip() for part in l.strip().split(':', 1)]
	209	for l in lines if ':' in l])
	210
	211	for date_key in [('pdf_mod_date', 'ModDate'),
	212	('pdf_creation_date', 'CreationDate')]:
	213	if date_key in info_dict:
	214	ret_dict[date_key] = dateutil.parser.parse(info_dict[date_key])
	215	for db_key, int_key in [('pdf_pages', 'Pages')]:
	216	if int_key in info_dict:
	217	ret_dict[db_key] = int(info_dict[int_key])
	218
	219	# parse 'PageSize' field: 595 x 842 pts (A4)
	220	page_size_parts = info_dict['Page size'].split()
	221	ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
	222	ret_dict['pdf_page_size_height'] = float(page_size_parts[2])
	223
	224	for db_key, str_key in [('pdf_keywords', 'Keywords'),
	225	('pdf_creator', 'Creator'), ('pdf_producer', 'Producer'),
	226	('pdf_author', 'Author'), ('pdf_title', 'Title')]:
	227	ret_dict[db_key] = info_dict.get(str_key, None)
	228	ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
	229	map(int, info_dict['PDF version'].split('.'))
	230
	231	return ret_dict
	232
a80ebf3b	233
5fabbcc4	234	class CommonPdfProcessor(MediaProcessor):
a80ebf3b	235	"""
5fabbcc4 RE	236	Provides a base for various pdf processing steps
	237	"""
	238	def common_setup(self):
	239	"""
	240	Set up common pdf processing steps
	241	"""
	242	# Pull down and set up the original file
	243	self.orig_filename = get_orig_filename(
	244	self.entry, self.workbench)
	245	self.name_builder = FilenameBuilder(self.orig_filename)
	246
	247	self._set_pdf_filename()
	248
	249	def _set_pdf_filename(self):
	250	if self.name_builder.ext == 'pdf':
	251	self.pdf_filename = self.orig_filename
	252	else:
	253	self.pdf_filename = self.name_builder.fill('{basename}.pdf')
	254
	255	def copy_original(self):
	256	copy_original(
	257	self.entry, self.orig_filename,
	258	self.name_builder.fill('{basename}{ext}'))
	259
	260	def generate_thumb(self, thumb_size=None):
	261	if not thumb_size:
	262	thumb_size = (mgg.global_config['media:thumb']['max_width'],
	263	mgg.global_config['media:thumb']['max_height'])
	264
	265	# Note: pdftocairo adds '.png', so don't include an ext
	266	thumb_filename = self.name_builder.fill('{basename}.thumbnail')
	267
	268	executable = where('pdftocairo')
	269	args = [executable, '-scale-to', str(thumb_size),
	270	'-singlefile', '-png', self.pdf_filename, thumb_filename]
	271
	272	_log.debug('calling {0}'.format(repr(' '.join(args))))
	273	Popen(executable=executable, args=args).wait()
	274
	275	store_public(self.entry, 'thumb', thumb_filename,
	276	self.name_builder.fill('{basename}.thumbnail.png'))
	277
	278	def generate_pdf(self):
	279	"""
	280	Store the pdf. If the file is not a pdf, make it a pdf
	281	"""
	282	if self.name_builder.ext != 'pdf':
	283	unoconv = where('unoconv')
	284	Popen(executable=unoconv,
	285	args=[unoconv, '-v', '-f', 'pdf', self.orig_filename]).wait()
	286
	287	if not os.path.exists(self.pdf_filename):
	288	_log.debug('unoconv failed to convert file to pdf')
	289	raise BadMediaFail()
	290
	291	store_public(self.entry, 'pdf', self.pdf_filename,
	292	self.name_builder.fill('{basename}.pdf'))
	293
	294	def extract_pdf_info(self):
	295	pdf_info_dict = pdf_info(self.pdf_filename)
	296	entry.media_data_init(**pdf_info_dict)
	297
	298	def generate_medium(self, size=None):
	299	if not size:
300	size = (mgg.global_config['media:medium']['max_width'],
301	mgg.global_config['media:medium']['max_height'])
302
303	# Note: pdftocairo adds '.png', so don't include an ext
304	filename = self.name_builder.fill('{basename}.medium')
305
306	executable = where('pdftocairo')
307	args = [executable, '-scale-to', str(size),
308	'-singlefile', '-png', self.pdf_filename, filename]
309
310	_log.debug('calling {0}'.format(repr(' '.join(args))))
311	Popen(executable=executable, args=args).wait()
312
313	store_public(self.entry, 'thumb', filename,
314	self.name_builder.fill('{basename}.medium.png'))
315
316	class InitialProcessor(CommonPdfProcessor):
317	"""
318	Initial processing step for new pdfs
319	"""
320	name = "initial"
321	description = "Initial processing"
322
323	@classmethod
324	def media_is_eligible(cls, entry=None, state=None):
325	"""
326	Determine if this media type is eligible for processing
327	"""
328	if not state:
329	state = entry.state
330	return state in (
331	"unprocessed", "failed")
332
333	@classmethod
334	def generate_parser(cls):
335	parser = argparse.ArgumentParser(
336	description=cls.description,
337	prog=cls.name)
338
339	parser.add_argument(
340	'--size',
341	nargs=2,
342	metavar=('max_width', 'max_height'),
343	type=int)
344
345	parser.add_argument(
346	'--thumb-size',
347	nargs=2,
348	metavar=('max_width', 'max_height'),
349	type=int)
350
351	return parser
352
353	@classmethod
354	def args_to_request(cls, args):
355	return request_from_args(
356	args, ['size', 'thumb_size'])
357
358	def process(self, size=None, thumb_size=None):
359	self.common_setup()
360	self.generate_pdf()
361	self.extract_pdf_info()
362	self.copy_original()
363	self.generate_medium(size=size)
364	self.generate_thumb(thumb_size=thumb_size)
365	self.delete_queue_file()
366
367
368	class PdfProcessingManager(ProcessingManager):
369	def __init__(self):
370	super(self.__class__, self).__init__()
371	self.add_processor(InitialProcessor)