mediagoblin/media_types/pdf/processing.py

   1 # GNU MediaGoblin -- federated, autonomous media hosting
   2 # Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU Affero General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU Affero General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16 import argparse
  17 import os
  18 import logging
  19 import dateutil.parser
  20 from subprocess import PIPE, Popen
  21
  22 from mediagoblin import mg_globals as mgg
  23 from mediagoblin.processing import (
  24     FilenameBuilder, BadMediaFail,
  25     MediaProcessor, ProcessingManager,
  26     request_from_args, get_process_filename,
  27     store_public, copy_original)
  28 from mediagoblin.tools.translate import fake_ugettext_passthrough as _
  29
# Module-level logger, named after this module's import path.
_log = logging.getLogger(__name__)

# Identifier that sniff_handler returns when it claims a file for this
# media type.
MEDIA_TYPE = 'mediagoblin.media_types.pdf'
  33
  34 # TODO - cache (memoize) util
  35
# File extensions (without the leading dot) that are accepted in addition to
# 'pdf' when unoconv is working. The list was created from the output of
# `unoconv --show`; the types that other media types already support better
# were removed by hand and are left commented out below.
unoconv_supported = [
  'bib', #      - BibTeX [.bib]
  #bmp      - Windows Bitmap [.bmp]
  'csv', #      - Text CSV [.csv]
  'dbf', #      - dBASE [.dbf]
  'dif', #      - Data Interchange Format [.dif]
  'doc6', #     - Microsoft Word 6.0 [.doc]
  'doc95', #    - Microsoft Word 95 [.doc]
  'docbook', #  - DocBook [.xml]
  'doc', #      - Microsoft Word 97/2000/XP [.doc]
  'docx7', #    - Microsoft Office Open XML [.docx]
  'docx', #     - Microsoft Office Open XML [.docx]
  #emf      - Enhanced Metafile [.emf]
  'eps', #      - Encapsulated PostScript [.eps]
  'fodp', #     - OpenDocument Presentation (Flat XML) [.fodp]
  'fods', #     - OpenDocument Spreadsheet (Flat XML) [.fods]
  'fodt', #     - OpenDocument Text (Flat XML) [.fodt]
  #gif      - Graphics Interchange Format [.gif]
  'html', #     - HTML Document (OpenOffice.org Writer) [.html]
  #jpg      - Joint Photographic Experts Group [.jpg]
  'latex', #    - LaTeX 2e [.ltx]
  'mediawiki', # - MediaWiki [.txt]
  'met', #      - OS/2 Metafile [.met]
  'odd', #      - OpenDocument Drawing [.odd]
  'odg', #      - ODF Drawing (Impress) [.odg]
  'odp', #      - ODF Presentation [.odp]
  'ods', #      - ODF Spreadsheet [.ods]
  'odt', #      - ODF Text Document [.odt]
  'ooxml', #    - Microsoft Office Open XML [.xml]
  'otg', #      - OpenDocument Drawing Template [.otg]
  'otp', #      - ODF Presentation Template [.otp]
  'ots', #      - ODF Spreadsheet Template [.ots]
  'ott', #      - Open Document Text [.ott]
  #pbm      - Portable Bitmap [.pbm]
  #pct      - Mac Pict [.pct]
  'pdb', #      - AportisDoc (Palm) [.pdb]
  #pdf      - Portable Document Format [.pdf]
  #pgm      - Portable Graymap [.pgm]
  #png      - Portable Network Graphic [.png]
  'pot', #      - Microsoft PowerPoint 97/2000/XP Template [.pot]
  'potm', #     - Microsoft PowerPoint 2007/2010 XML Template [.potm]
  #ppm      - Portable Pixelmap [.ppm]
  'pps', #      - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
  'ppt', #      - Microsoft PowerPoint 97/2000/XP [.ppt]
  'pptx', #     - Microsoft PowerPoint 2007/2010 XML [.pptx]
  'psw', #      - Pocket Word [.psw]
  'pwp', #      - PlaceWare [.pwp]
  'pxl', #      - Pocket Excel [.pxl]
  #ras      - Sun Raster Image [.ras]
  'rtf', #      - Rich Text Format [.rtf]
  'sda', #      - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
  'sdc3', #     - StarCalc 3.0 [.sdc]
  'sdc4', #     - StarCalc 4.0 [.sdc]
  'sdc', #      - StarCalc 5.0 [.sdc]
  'sdd3', #     - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
  'sdd4', #     - StarImpress 4.0 [.sdd]
  'sdd', #      - StarImpress 5.0 [.sdd]
  'sdw3', #     - StarWriter 3.0 [.sdw]
  'sdw4', #     - StarWriter 4.0 [.sdw]
  'sdw', #      - StarWriter 5.0 [.sdw]
  'slk', #      - SYLK [.slk]
  'stc', #      - OpenOffice.org 1.0 Spreadsheet Template [.stc]
  'std', #      - OpenOffice.org 1.0 Drawing Template [.std]
  'sti', #      - OpenOffice.org 1.0 Presentation Template [.sti]
  'stw', #      - Open Office.org 1.0 Text Document Template [.stw]
  #svg      - Scalable Vector Graphics [.svg]
  'svm', #      - StarView Metafile [.svm]
  'swf', #      - Macromedia Flash (SWF) [.swf]
  'sxc', #      - OpenOffice.org 1.0 Spreadsheet [.sxc]
  'sxd3', #     - StarDraw 3.0 [.sxd]
  'sxd5', #     - StarDraw 5.0 [.sxd]
  'sxd', #      - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
  'sxi', #      - OpenOffice.org 1.0 Presentation [.sxi]
  'sxw', #      - Open Office.org 1.0 Text Document [.sxw]
  #text     - Text Encoded [.txt]
  #tiff     - Tagged Image File Format [.tiff]
  #txt      - Text [.txt]
  'uop', #      - Unified Office Format presentation [.uop]
  'uos', #      - Unified Office Format spreadsheet [.uos]
  'uot', #      - Unified Office Format text [.uot]
  'vor3', #     - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
  'vor4', #     - StarWriter 4.0 Template [.vor]
  'vor5', #     - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
  'vor', #      - StarCalc 5.0 Template [.vor]
  #wmf      - Windows Metafile [.wmf]
  'xhtml', #    - XHTML Document [.html]
  'xls5', #     - Microsoft Excel 5.0 [.xls]
  'xls95', #    - Microsoft Excel 95 [.xls]
  'xls', #      - Microsoft Excel 97/2000/XP [.xls]
  'xlt5', #     - Microsoft Excel 5.0 Template [.xlt]
  'xlt95', #    - Microsoft Excel 95 Template [.xlt]
  'xlt', #      - Microsoft Excel 97/2000/XP Template [.xlt]
  #xpm      - X PixMap [.xpm]
]
 132
 133 def is_unoconv_working():
 134     # TODO: must have libreoffice-headless installed too, need to check for it
 135     unoconv = where('unoconv')
 136     if not unoconv:
 137         return False
 138     try:
 139         proc = Popen([unoconv, '--show'], stderr=PIPE)
 140         output = proc.stderr.read()
 141     except OSError, e:
 142         _log.warn(_('unoconv failing to run, check log file'))
 143         return False
 144     if 'ERROR' in output:
 145         return False
 146     return True
 147
 148 def supported_extensions(cache=[None]):
 149     if cache[0] == None:
 150         cache[0] = 'pdf'
 151         if is_unoconv_working():
 152             cache.extend(unoconv_supported)
 153     return cache
 154
 155 def where(name):
 156     for p in os.environ['PATH'].split(os.pathsep):
 157         fullpath = os.path.join(p, name)
 158         if os.path.exists(fullpath):
 159             return fullpath
 160     return None
 161
 162 def check_prerequisites():
 163     if not where('pdfinfo'):
 164         _log.warn('missing pdfinfo')
 165         return False
 166     if not where('pdftocairo'):
 167         _log.warn('missing pdfcairo')
 168         return False
 169     return True
 170
 171 def sniff_handler(media_file, **kw):
 172     _log.info('Sniffing {0}'.format(MEDIA_TYPE))
 173     if not check_prerequisites():
 174         return None
 175     if kw.get('media') is not None:
 176         name, ext = os.path.splitext(kw['media'].filename)
 177         clean_ext = ext[1:].lower()
 178
 179         if clean_ext in supported_extensions():
 180             return MEDIA_TYPE
 181
 182     return None
 183
 184 def create_pdf_thumb(original, thumb_filename, width, height):
 185     # Note: pdftocairo adds '.png', remove it
 186     thumb_filename = thumb_filename[:-4]
 187     executable = where('pdftocairo')
 188     args = [executable, '-scale-to', str(min(width, height)),
 189             '-singlefile', '-png', original, thumb_filename]
 190     _log.debug('calling {0}'.format(repr(' '.join(args))))
 191     Popen(executable=executable, args=args).wait()
 192
 193 def pdf_info(original):
 194     """
 195     Extract dictionary of pdf information. This could use a library instead
 196     of a process.
 197
 198     Note: I'm assuming pdfinfo output is sanitized (integers where integers are
 199     expected, etc.) - if this is wrong then an exception will be raised and caught
 200     leading to the dreaded error page. It seems a safe assumption.
 201     """
 202     ret_dict = {}
 203     pdfinfo = where('pdfinfo')
 204     try:
 205         proc = Popen(executable=pdfinfo,
 206                      args=[pdfinfo, original], stdout=PIPE)
 207         lines = proc.stdout.readlines()
 208     except OSError:
 209         _log.debug('pdfinfo could not read the pdf file.')
 210         raise BadMediaFail()
 211
 212     info_dict = dict([[part.strip() for part in l.strip().split(':', 1)]
 213                       for l in lines if ':' in l])
 214
 215     for date_key in [('pdf_mod_date', 'ModDate'),
 216                      ('pdf_creation_date', 'CreationDate')]:
 217         if date_key in info_dict:
 218             ret_dict[date_key] = dateutil.parser.parse(info_dict[date_key])
 219     for db_key, int_key in [('pdf_pages', 'Pages')]:
 220         if int_key in info_dict:
 221             ret_dict[db_key] = int(info_dict[int_key])
 222
 223     # parse 'PageSize' field: 595 x 842 pts (A4)
 224     page_size_parts = info_dict['Page size'].split()
 225     ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
 226     ret_dict['pdf_page_size_height'] = float(page_size_parts[2])
 227
 228     for db_key, str_key in [('pdf_keywords', 'Keywords'),
 229         ('pdf_creator', 'Creator'), ('pdf_producer', 'Producer'),
 230         ('pdf_author', 'Author'), ('pdf_title', 'Title')]:
 231         ret_dict[db_key] = info_dict.get(str_key, None)
 232     ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
 233         map(int, info_dict['PDF version'].split('.'))
 234
 235     return ret_dict
 236
 237
 238 class CommonPdfProcessor(MediaProcessor):
 239     """
 240     Provides a base for various pdf processing steps
 241     """
 242     acceptable_files = ['original', 'pdf']
 243
 244     def common_setup(self):
 245         """
 246         Set up common pdf processing steps
 247         """
 248         # Pull down and set up the processing file
 249         self.process_filename = get_process_filename(
 250             self.entry, self.workbench, self.acceptable_files)
 251         self.name_builder = FilenameBuilder(self.process_filename)
 252
 253         self._set_pdf_filename()
 254
 255     def _set_pdf_filename(self):
 256         if self.name_builder.ext == '.pdf':
 257             self.pdf_filename = self.process_filename
 258         elif self.entry.media_files.get('pdf'):
 259             self.pdf_filename = self.workbench.localized_file(
 260                 mgg.public_store, self.entry.media_files['pdf'])
 261         else:
 262             self.pdf_filename = self._generate_pdf()
 263
 264     def copy_original(self):
 265         copy_original(
 266             self.entry, self.process_filename,
 267             self.name_builder.fill('{basename}{ext}'))
 268
 269     def generate_thumb(self, thumb_size=None):
 270         if not thumb_size:
 271             thumb_size = (mgg.global_config['media:thumb']['max_width'],
 272                           mgg.global_config['media:thumb']['max_height'])
 273
 274         # Note: pdftocairo adds '.png', so don't include an ext
 275         thumb_filename = os.path.join(self.workbench.dir,
 276                                       self.name_builder.fill(
 277                                           '{basename}.thumbnail'))
 278
 279         executable = where('pdftocairo')
 280         args = [executable, '-scale-to', str(min(thumb_size)),
 281                 '-singlefile', '-png', self.pdf_filename, thumb_filename]
 282
 283         _log.debug('calling {0}'.format(repr(' '.join(args))))
 284         Popen(executable=executable, args=args).wait()
 285
 286         # since pdftocairo added '.png', we need to include it with the
 287         # filename
 288         store_public(self.entry, 'thumb', thumb_filename + '.png',
 289                      self.name_builder.fill('{basename}.thumbnail.png'))
 290
 291     def _generate_pdf(self):
 292         """
 293         Store the pdf. If the file is not a pdf, make it a pdf
 294         """
 295         tmp_pdf = self.process_filename
 296
 297         unoconv = where('unoconv')
 298         Popen(executable=unoconv,
 299               args=[unoconv, '-v', '-f', 'pdf', self.process_filename]).wait()
 300
 301         if not os.path.exists(tmp_pdf):
 302             _log.debug('unoconv failed to convert file to pdf')
 303             raise BadMediaFail()
 304
 305         store_public(self.entry, 'pdf', tmp_pdf,
 306                      self.name_builder.fill('{basename}.pdf'))
 307
 308         return self.workbench.localized_file(
 309             mgg.public_store, self.entry.media_files['pdf'])
 310
 311     def extract_pdf_info(self):
 312         pdf_info_dict = pdf_info(self.pdf_filename)
 313         self.entry.media_data_init(**pdf_info_dict)
 314
 315     def generate_medium(self, size=None):
 316         if not size:
 317             size = (mgg.global_config['media:medium']['max_width'],
 318                     mgg.global_config['media:medium']['max_height'])
 319
 320         # Note: pdftocairo adds '.png', so don't include an ext
 321         filename = os.path.join(self.workbench.dir,
 322                                 self.name_builder.fill('{basename}.medium'))
 323
 324         executable = where('pdftocairo')
 325         args = [executable, '-scale-to', str(min(size)),
 326                 '-singlefile', '-png', self.pdf_filename, filename]
 327
 328         _log.debug('calling {0}'.format(repr(' '.join(args))))
 329         Popen(executable=executable, args=args).wait()
 330
 331         # since pdftocairo added '.png', we need to include it with the
 332         # filename
 333         store_public(self.entry, 'medium', filename + '.png',
 334                      self.name_builder.fill('{basename}.medium.png'))
 335
 336
 337 class InitialProcessor(CommonPdfProcessor):
 338     """
 339     Initial processing step for new pdfs
 340     """
 341     name = "initial"
 342     description = "Initial processing"
 343
 344     @classmethod
 345     def media_is_eligible(cls, entry=None, state=None):
 346         """
 347         Determine if this media type is eligible for processing
 348         """
 349         if not state:
 350             state = entry.state
 351         return state in (
 352             "unprocessed", "failed")
 353
 354     @classmethod
 355     def generate_parser(cls):
 356         parser = argparse.ArgumentParser(
 357             description=cls.description,
 358             prog=cls.name)
 359
 360         parser.add_argument(
 361             '--size',
 362             nargs=2,
 363             metavar=('max_width', 'max_height'),
 364             type=int)
 365
 366         parser.add_argument(
 367             '--thumb-size',
 368             nargs=2,
 369             metavar=('max_width', 'max_height'),
 370             type=int)
 371
 372         return parser
 373
 374     @classmethod
 375     def args_to_request(cls, args):
 376         return request_from_args(
 377             args, ['size', 'thumb_size'])
 378
 379     def process(self, size=None, thumb_size=None):
 380         self.common_setup()
 381         self.extract_pdf_info()
 382         self.copy_original()
 383         self.generate_medium(size=size)
 384         self.generate_thumb(thumb_size=thumb_size)
 385         self.delete_queue_file()
 386
 387
 388 class Resizer(CommonPdfProcessor):
 389     """
 390     Resizing process steps for processed pdfs
 391     """
 392     name = 'resize'
 393     description = 'Resize thumbnail and medium'
 394     thumb_size = 'size'
 395
 396     @classmethod
 397     def media_is_eligible(cls, entry=None, state=None):
 398         """
 399         Determine if this media type is eligible for processing
 400         """
 401         if not state:
 402             state = entry.state
 403         return state in 'processed'
 404
 405     @classmethod
 406     def generate_parser(cls):
 407         parser = argparse.ArgumentParser(
 408             description=cls.description,
 409             prog=cls.name)
 410
 411         parser.add_argument(
 412             '--size',
 413             nargs=2,
 414             metavar=('max_width', 'max_height'),
 415             type=int)
 416
 417         parser.add_argument(
 418             'file',
 419             choices=['medium', 'thumb'])
 420
 421         return parser
 422
 423     @classmethod
 424     def args_to_request(cls, args):
 425         return request_from_args(
 426             args, ['size', 'file'])
 427
 428     def process(self, file, size=None):
 429         self.common_setup()
 430         if file == 'medium':
 431             self.generate_medium(size=size)
 432         elif file == 'thumb':
 433             self.generate_thumb(thumb_size=size)
 434
 435
 436 class PdfProcessingManager(ProcessingManager):
 437     def __init__(self):
 438         super(self.__class__, self).__init__()
 439         self.add_processor(InitialProcessor)
 440         self.add_processor(Resizer)