mediagoblin/media_types/pdf/processing.py

   1 # GNU MediaGoblin -- federated, autonomous media hosting
   2 # Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU Affero General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU Affero General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16 import argparse
  17 import os
  18 import logging
  19 import dateutil.parser
  20 from subprocess import PIPE, Popen
  21
  22 from mediagoblin import mg_globals as mgg
  23 from mediagoblin.processing import (
  24     FilenameBuilder, BadMediaFail,
  25     MediaProcessor, ProcessingManager,
  26     request_from_args, get_process_filename,
  27     store_public, copy_original)
  28 from mediagoblin.tools.translate import fake_ugettext_passthrough as _
  29
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Media type identifier; sniff_handler() returns it for supported files.
MEDIA_TYPE = 'mediagoblin.media_types.pdf'

# TODO - cache (memoize) util
  35
# Document formats that unoconv can convert to pdf.  The list was created from
# the output of `unoconv --show`; formats that other media types already
# handle better were removed by hand and are kept below as commented-out
# entries.
unoconv_supported = [
  'bib', #      - BibTeX [.bib]
  #bmp      - Windows Bitmap [.bmp]
  'csv', #      - Text CSV [.csv]
  'dbf', #      - dBASE [.dbf]
  'dif', #      - Data Interchange Format [.dif]
  'doc6', #     - Microsoft Word 6.0 [.doc]
  'doc95', #    - Microsoft Word 95 [.doc]
  'docbook', #  - DocBook [.xml]
  'doc', #      - Microsoft Word 97/2000/XP [.doc]
  'docx7', #    - Microsoft Office Open XML [.docx]
  'docx', #     - Microsoft Office Open XML [.docx]
  #emf      - Enhanced Metafile [.emf]
  'eps', #      - Encapsulated PostScript [.eps]
  'fodp', #     - OpenDocument Presentation (Flat XML) [.fodp]
  'fods', #     - OpenDocument Spreadsheet (Flat XML) [.fods]
  'fodt', #     - OpenDocument Text (Flat XML) [.fodt]
  #gif      - Graphics Interchange Format [.gif]
  'html', #     - HTML Document (OpenOffice.org Writer) [.html]
  #jpg      - Joint Photographic Experts Group [.jpg]
  'latex', #    - LaTeX 2e [.ltx]
  'mediawiki', # - MediaWiki [.txt]
  'met', #      - OS/2 Metafile [.met]
  'odd', #      - OpenDocument Drawing [.odd]
  'odg', #      - ODF Drawing (Impress) [.odg]
  'odp', #      - ODF Presentation [.odp]
  'ods', #      - ODF Spreadsheet [.ods]
  'odt', #      - ODF Text Document [.odt]
  'ooxml', #    - Microsoft Office Open XML [.xml]
  'otg', #      - OpenDocument Drawing Template [.otg]
  'otp', #      - ODF Presentation Template [.otp]
  'ots', #      - ODF Spreadsheet Template [.ots]
  'ott', #      - Open Document Text [.ott]
  #pbm      - Portable Bitmap [.pbm]
  #pct      - Mac Pict [.pct]
  'pdb', #      - AportisDoc (Palm) [.pdb]
  #pdf      - Portable Document Format [.pdf]
  #pgm      - Portable Graymap [.pgm]
  #png      - Portable Network Graphic [.png]
  'pot', #      - Microsoft PowerPoint 97/2000/XP Template [.pot]
  'potm', #     - Microsoft PowerPoint 2007/2010 XML Template [.potm]
  #ppm      - Portable Pixelmap [.ppm]
  'pps', #      - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
  'ppt', #      - Microsoft PowerPoint 97/2000/XP [.ppt]
  'pptx', #     - Microsoft PowerPoint 2007/2010 XML [.pptx]
  'psw', #      - Pocket Word [.psw]
  'pwp', #      - PlaceWare [.pwp]
  'pxl', #      - Pocket Excel [.pxl]
  #ras      - Sun Raster Image [.ras]
  'rtf', #      - Rich Text Format [.rtf]
  'sda', #      - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
  'sdc3', #     - StarCalc 3.0 [.sdc]
  'sdc4', #     - StarCalc 4.0 [.sdc]
  'sdc', #      - StarCalc 5.0 [.sdc]
  'sdd3', #     - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
  'sdd4', #     - StarImpress 4.0 [.sdd]
  'sdd', #      - StarImpress 5.0 [.sdd]
  'sdw3', #     - StarWriter 3.0 [.sdw]
  'sdw4', #     - StarWriter 4.0 [.sdw]
  'sdw', #      - StarWriter 5.0 [.sdw]
  'slk', #      - SYLK [.slk]
  'stc', #      - OpenOffice.org 1.0 Spreadsheet Template [.stc]
  'std', #      - OpenOffice.org 1.0 Drawing Template [.std]
  'sti', #      - OpenOffice.org 1.0 Presentation Template [.sti]
  'stw', #      - Open Office.org 1.0 Text Document Template [.stw]
  #svg      - Scalable Vector Graphics [.svg]
  'svm', #      - StarView Metafile [.svm]
  'swf', #      - Macromedia Flash (SWF) [.swf]
  'sxc', #      - OpenOffice.org 1.0 Spreadsheet [.sxc]
  'sxd3', #     - StarDraw 3.0 [.sxd]
  'sxd5', #     - StarDraw 5.0 [.sxd]
  'sxd', #      - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
  'sxi', #      - OpenOffice.org 1.0 Presentation [.sxi]
  'sxw', #      - Open Office.org 1.0 Text Document [.sxw]
  #text     - Text Encoded [.txt]
  #tiff     - Tagged Image File Format [.tiff]
  #txt      - Text [.txt]
  'uop', #      - Unified Office Format presentation [.uop]
  'uos', #      - Unified Office Format spreadsheet [.uos]
  'uot', #      - Unified Office Format text [.uot]
  'vor3', #     - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
  'vor4', #     - StarWriter 4.0 Template [.vor]
  'vor5', #     - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
  'vor', #      - StarCalc 5.0 Template [.vor]
  #wmf      - Windows Metafile [.wmf]
  'xhtml', #    - XHTML Document [.html]
  'xls5', #     - Microsoft Excel 5.0 [.xls]
  'xls95', #    - Microsoft Excel 95 [.xls]
  'xls', #      - Microsoft Excel 97/2000/XP [.xls]
  'xlt5', #     - Microsoft Excel 5.0 Template [.xlt]
  'xlt95', #    - Microsoft Excel 95 Template [.xlt]
  'xlt', #      - Microsoft Excel 97/2000/XP Template [.xlt]
  #xpm      - X PixMap [.xpm]
]
 132
 133 def is_unoconv_working():
 134     # TODO: must have libreoffice-headless installed too, need to check for it
 135     unoconv = where('unoconv')
 136     if not unoconv:
 137         return False
 138     try:
 139         proc = Popen([unoconv, '--show'], stderr=PIPE)
 140         output = proc.stderr.read()
 141     except OSError:
 142         _log.warn(_('unoconv failing to run, check log file'))
 143         return False
 144     if b'ERROR' in output:
 145         return False
 146     return True
 147
 148 def supported_extensions(cache=[None]):
 149     if cache[0] == None:
 150         cache[0] = 'pdf'
 151         if is_unoconv_working():
 152             cache.extend(unoconv_supported)
 153     return cache
 154
 155 def where(name):
 156     for p in os.environ['PATH'].split(os.pathsep):
 157         fullpath = os.path.join(p, name)
 158         if os.path.exists(fullpath):
 159             return fullpath
 160     return None
 161
 162 def check_prerequisites():
 163     if not where('pdfinfo'):
 164         _log.warn('missing pdfinfo')
 165         return False
 166     if not where('pdftocairo'):
 167         _log.warn('missing pdfcairo')
 168         return False
 169     return True
 170
 171 def sniff_handler(media_file, filename):
 172     _log.info('Sniffing {0}'.format(MEDIA_TYPE))
 173     if not check_prerequisites():
 174         return None
 175
 176     name, ext = os.path.splitext(filename)
 177     clean_ext = ext[1:].lower()
 178
 179     if clean_ext in supported_extensions():
 180         return MEDIA_TYPE
 181
 182 def create_pdf_thumb(original, thumb_filename, width, height):
 183     # Note: pdftocairo adds '.png', remove it
 184     thumb_filename = thumb_filename[:-4]
 185     executable = where('pdftocairo')
 186     args = [executable, '-scale-to', str(min(width, height)),
 187             '-singlefile', '-png', original, thumb_filename]
 188     _log.debug('calling {0}'.format(repr(' '.join(args))))
 189     Popen(executable=executable, args=args).wait()
 190
 191 def pdf_info(original):
 192     """
 193     Extract dictionary of pdf information. This could use a library instead
 194     of a process.
 195
 196     Note: I'm assuming pdfinfo output is sanitized (integers where integers are
 197     expected, etc.) - if this is wrong then an exception will be raised and caught
 198     leading to the dreaded error page. It seems a safe assumption.
 199     """
 200     ret_dict = {}
 201     pdfinfo = where('pdfinfo')
 202     try:
 203         proc = Popen(executable=pdfinfo,
 204                      args=[pdfinfo, original], stdout=PIPE)
 205         lines = proc.stdout.readlines()
 206     except OSError:
 207         _log.debug('pdfinfo could not read the pdf file.')
 208         raise BadMediaFail()
 209
 210     lines = [l.decode('utf-8', 'replace') for l in lines]
 211     info_dict = dict([[part.strip() for part in l.strip().split(':', 1)]
 212                       for l in lines if ':' in l])
 213
 214     if 'Page size' not in info_dict.keys():
 215         # TODO - message is for the user, not debug, but BadMediaFail not taking an argument, fix that.
 216         _log.debug('Missing "Page size" key in returned pdf - conversion failed?')
 217         raise BadMediaFail()
 218
 219     for date_key in [('pdf_mod_date', 'ModDate'),
 220                      ('pdf_creation_date', 'CreationDate')]:
 221         if date_key in info_dict:
 222             ret_dict[date_key] = dateutil.parser.parse(info_dict[date_key])
 223     for db_key, int_key in [('pdf_pages', 'Pages')]:
 224         if int_key in info_dict:
 225             ret_dict[db_key] = int(info_dict[int_key])
 226
 227     # parse 'PageSize' field: 595 x 842 pts (A4)
 228     page_size_parts = info_dict['Page size'].split()
 229     ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
 230     ret_dict['pdf_page_size_height'] = float(page_size_parts[2])
 231
 232     for db_key, str_key in [('pdf_keywords', 'Keywords'),
 233         ('pdf_creator', 'Creator'), ('pdf_producer', 'Producer'),
 234         ('pdf_author', 'Author'), ('pdf_title', 'Title')]:
 235         ret_dict[db_key] = info_dict.get(str_key, None)
 236     ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
 237         map(int, info_dict['PDF version'].split('.'))
 238
 239     return ret_dict
 240
 241
 242 class CommonPdfProcessor(MediaProcessor):
 243     """
 244     Provides a base for various pdf processing steps
 245     """
 246     acceptable_files = ['original', 'pdf']
 247
 248     def common_setup(self):
 249         """
 250         Set up common pdf processing steps
 251         """
 252         # Pull down and set up the processing file
 253         self.process_filename = get_process_filename(
 254             self.entry, self.workbench, self.acceptable_files)
 255         self.name_builder = FilenameBuilder(self.process_filename)
 256
 257         self._set_pdf_filename()
 258
 259     def _set_pdf_filename(self):
 260         if self.name_builder.ext == '.pdf':
 261             self.pdf_filename = self.process_filename
 262         elif self.entry.media_files.get('pdf'):
 263             self.pdf_filename = self.workbench.localized_file(
 264                 mgg.public_store, self.entry.media_files['pdf'])
 265         else:
 266             self.pdf_filename = self._generate_pdf()
 267
 268     def _skip_processing(self, keyname, **kwargs):
 269         file_metadata = self.entry.get_file_metadata(keyname)
 270         skip = True
 271
 272         if not file_metadata:
 273             return False
 274
 275         if keyname == 'thumb':
 276             if kwargs.get('thumb_size') != file_metadata.get('thumb_size'):
 277                 skip = False
 278         elif keyname == 'medium':
 279             if kwargs.get('size') != file_metadata.get('size'):
 280                 skip = False
 281
 282         return skip
 283
 284     def copy_original(self):
 285         copy_original(
 286             self.entry, self.process_filename,
 287             self.name_builder.fill('{basename}{ext}'))
 288
 289     def generate_thumb(self, thumb_size=None):
 290         if not thumb_size:
 291             thumb_size = (mgg.global_config['media:thumb']['max_width'],
 292                           mgg.global_config['media:thumb']['max_height'])
 293
 294         if self._skip_processing('thumb', thumb_size=thumb_size):
 295             return
 296
 297         # Note: pdftocairo adds '.png', so don't include an ext
 298         thumb_filename = os.path.join(self.workbench.dir,
 299                                       self.name_builder.fill(
 300                                           '{basename}.thumbnail'))
 301
 302         executable = where('pdftocairo')
 303         args = [executable, '-scale-to', str(min(thumb_size)),
 304                 '-singlefile', '-png', self.pdf_filename, thumb_filename]
 305
 306         _log.debug('calling {0}'.format(repr(' '.join(args))))
 307         Popen(executable=executable, args=args).wait()
 308
 309         # since pdftocairo added '.png', we need to include it with the
 310         # filename
 311         store_public(self.entry, 'thumb', thumb_filename + '.png',
 312                      self.name_builder.fill('{basename}.thumbnail.png'))
 313
 314         self.entry.set_file_metadata('thumb', thumb_size=thumb_size)
 315
 316     def _generate_pdf(self):
 317         """
 318         Store the pdf. If the file is not a pdf, make it a pdf
 319         """
 320         tmp_pdf = os.path.splitext(self.process_filename)[0] + '.pdf'
 321
 322         unoconv = where('unoconv')
 323         args = [unoconv, '-v', '-f', 'pdf', self.process_filename]
 324         _log.debug('calling %s' % repr(args))
 325         Popen(executable=unoconv,
 326               args=args).wait()
 327
 328         if not os.path.exists(tmp_pdf):
 329             _log.debug('unoconv failed to convert file to pdf')
 330             raise BadMediaFail()
 331
 332         store_public(self.entry, 'pdf', tmp_pdf,
 333                      self.name_builder.fill('{basename}.pdf'))
 334
 335         return self.workbench.localized_file(
 336             mgg.public_store, self.entry.media_files['pdf'])
 337
 338     def extract_pdf_info(self):
 339         pdf_info_dict = pdf_info(self.pdf_filename)
 340         self.entry.media_data_init(**pdf_info_dict)
 341
 342     def generate_medium(self, size=None):
 343         if not size:
 344             size = (mgg.global_config['media:medium']['max_width'],
 345                     mgg.global_config['media:medium']['max_height'])
 346
 347         if self._skip_processing('medium', size=size):
 348             return
 349
 350         # Note: pdftocairo adds '.png', so don't include an ext
 351         filename = os.path.join(self.workbench.dir,
 352                                 self.name_builder.fill('{basename}.medium'))
 353
 354         executable = where('pdftocairo')
 355         args = [executable, '-scale-to', str(min(size)),
 356                 '-singlefile', '-png', self.pdf_filename, filename]
 357
 358         _log.debug('calling {0}'.format(repr(' '.join(args))))
 359         Popen(executable=executable, args=args).wait()
 360
 361         # since pdftocairo added '.png', we need to include it with the
 362         # filename
 363         store_public(self.entry, 'medium', filename + '.png',
 364                      self.name_builder.fill('{basename}.medium.png'))
 365
 366         self.entry.set_file_metadata('medium', size=size)
 367
 368
 369 class InitialProcessor(CommonPdfProcessor):
 370     """
 371     Initial processing step for new pdfs
 372     """
 373     name = "initial"
 374     description = "Initial processing"
 375
 376     @classmethod
 377     def media_is_eligible(cls, entry=None, state=None):
 378         """
 379         Determine if this media type is eligible for processing
 380         """
 381         if not state:
 382             state = entry.state
 383         return state in (
 384             "unprocessed", "failed")
 385
 386     @classmethod
 387     def generate_parser(cls):
 388         parser = argparse.ArgumentParser(
 389             description=cls.description,
 390             prog=cls.name)
 391
 392         parser.add_argument(
 393             '--size',
 394             nargs=2,
 395             metavar=('max_width', 'max_height'),
 396             type=int)
 397
 398         parser.add_argument(
 399             '--thumb-size',
 400             nargs=2,
 401             metavar=('max_width', 'max_height'),
 402             type=int)
 403
 404         return parser
 405
 406     @classmethod
 407     def args_to_request(cls, args):
 408         return request_from_args(
 409             args, ['size', 'thumb_size'])
 410
 411     def process(self, size=None, thumb_size=None):
 412         self.common_setup()
 413         self.extract_pdf_info()
 414         self.copy_original()
 415         self.generate_medium(size=size)
 416         self.generate_thumb(thumb_size=thumb_size)
 417         self.delete_queue_file()
 418
 419
 420 class Resizer(CommonPdfProcessor):
 421     """
 422     Resizing process steps for processed pdfs
 423     """
 424     name = 'resize'
 425     description = 'Resize thumbnail and medium'
 426     thumb_size = 'size'
 427
 428     @classmethod
 429     def media_is_eligible(cls, entry=None, state=None):
 430         """
 431         Determine if this media type is eligible for processing
 432         """
 433         if not state:
 434             state = entry.state
 435         return state in 'processed'
 436
 437     @classmethod
 438     def generate_parser(cls):
 439         parser = argparse.ArgumentParser(
 440             description=cls.description,
 441             prog=cls.name)
 442
 443         parser.add_argument(
 444             '--size',
 445             nargs=2,
 446             metavar=('max_width', 'max_height'),
 447             type=int)
 448
 449         parser.add_argument(
 450             'file',
 451             choices=['medium', 'thumb'])
 452
 453         return parser
 454
 455     @classmethod
 456     def args_to_request(cls, args):
 457         return request_from_args(
 458             args, ['size', 'file'])
 459
 460     def process(self, file, size=None):
 461         self.common_setup()
 462         if file == 'medium':
 463             self.generate_medium(size=size)
 464         elif file == 'thumb':
 465             self.generate_thumb(thumb_size=size)
 466
 467
 468 class PdfProcessingManager(ProcessingManager):
 469     def __init__(self):
 470         super(PdfProcessingManager, self).__init__()
 471         self.add_processor(InitialProcessor)
 472         self.add_processor(Resizer)
 473
 474     def workflow(self, entry, manager, feed_url, reprocess_action,
 475                  reprocess_info=None):
 476         ProcessMedia().apply_async(
 477             [entry.id, feed_url, reprocess_action, reprocess_info], {},
 478             task_id=entry.queued_task_id)