mediagoblin/media_types/pdf/processing.py

   1 # GNU MediaGoblin -- federated, autonomous media hosting
   2 # Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU Affero General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU Affero General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16 import os
  17 import logging
  18 import dateutil.parser
  19 from subprocess import PIPE, Popen
  20
  21 from mediagoblin import mg_globals as mgg
  22 from mediagoblin.processing import (create_pub_filepath,
  23                                     FilenameBuilder, BadMediaFail)
  24 from mediagoblin.tools.translate import fake_ugettext_passthrough as _
  25
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Identifier of this media type; sniff_handler() returns it for accepted files.
MEDIA_TYPE = 'mediagoblin.media_types.pdf'
  29
  30 # TODO - cache (memoize) util
  31
# This list was created from the output of ``unoconv --show``, with the types
# that other media types already handle better removed by hand.
  34 unoconv_supported = [
  35   'bib', #      - BibTeX [.bib]
  36   #bmp      - Windows Bitmap [.bmp]
  37   'csv', #      - Text CSV [.csv]
  38   'dbf', #      - dBASE [.dbf]
  39   'dif', #      - Data Interchange Format [.dif]
  40   'doc6', #     - Microsoft Word 6.0 [.doc]
  41   'doc95', #    - Microsoft Word 95 [.doc]
  42   'docbook', #  - DocBook [.xml]
  43   'doc', #      - Microsoft Word 97/2000/XP [.doc]
  44   'docx7', #    - Microsoft Office Open XML [.docx]
  45   'docx', #     - Microsoft Office Open XML [.docx]
  46   #emf      - Enhanced Metafile [.emf]
  47   'eps', #      - Encapsulated PostScript [.eps]
  48   'fodp', #     - OpenDocument Presentation (Flat XML) [.fodp]
  49   'fods', #     - OpenDocument Spreadsheet (Flat XML) [.fods]
  50   'fodt', #     - OpenDocument Text (Flat XML) [.fodt]
  51   #gif      - Graphics Interchange Format [.gif]
  52   'html', #     - HTML Document (OpenOffice.org Writer) [.html]
  53   #jpg      - Joint Photographic Experts Group [.jpg]
  54   'latex', #    - LaTeX 2e [.ltx]
  55   'mediawiki', # - MediaWiki [.txt]
  56   'met', #      - OS/2 Metafile [.met]
  57   'odd', #      - OpenDocument Drawing [.odd]
  58   'odg', #      - ODF Drawing (Impress) [.odg]
  59   'odp', #      - ODF Presentation [.odp]
  60   'ods', #      - ODF Spreadsheet [.ods]
  61   'odt', #      - ODF Text Document [.odt]
  62   'ooxml', #    - Microsoft Office Open XML [.xml]
  63   'otg', #      - OpenDocument Drawing Template [.otg]
  64   'otp', #      - ODF Presentation Template [.otp]
  65   'ots', #      - ODF Spreadsheet Template [.ots]
  66   'ott', #      - Open Document Text [.ott]
  67   #pbm      - Portable Bitmap [.pbm]
  68   #pct      - Mac Pict [.pct]
  69   'pdb', #      - AportisDoc (Palm) [.pdb]
  70   #pdf      - Portable Document Format [.pdf]
  71   #pgm      - Portable Graymap [.pgm]
  72   #png      - Portable Network Graphic [.png]
  73   'pot', #      - Microsoft PowerPoint 97/2000/XP Template [.pot]
  74   'potm', #     - Microsoft PowerPoint 2007/2010 XML Template [.potm]
  75   #ppm      - Portable Pixelmap [.ppm]
  76   'pps', #      - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
  77   'ppt', #      - Microsoft PowerPoint 97/2000/XP [.ppt]
  78   'pptx', #     - Microsoft PowerPoint 2007/2010 XML [.pptx]
  79   'psw', #      - Pocket Word [.psw]
  80   'pwp', #      - PlaceWare [.pwp]
  81   'pxl', #      - Pocket Excel [.pxl]
  82   #ras      - Sun Raster Image [.ras]
  83   'rtf', #      - Rich Text Format [.rtf]
  84   'sda', #      - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
  85   'sdc3', #     - StarCalc 3.0 [.sdc]
  86   'sdc4', #     - StarCalc 4.0 [.sdc]
  87   'sdc', #      - StarCalc 5.0 [.sdc]
  88   'sdd3', #     - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
  89   'sdd4', #     - StarImpress 4.0 [.sdd]
  90   'sdd', #      - StarImpress 5.0 [.sdd]
  91   'sdw3', #     - StarWriter 3.0 [.sdw]
  92   'sdw4', #     - StarWriter 4.0 [.sdw]
  93   'sdw', #      - StarWriter 5.0 [.sdw]
  94   'slk', #      - SYLK [.slk]
  95   'stc', #      - OpenOffice.org 1.0 Spreadsheet Template [.stc]
  96   'std', #      - OpenOffice.org 1.0 Drawing Template [.std]
  97   'sti', #      - OpenOffice.org 1.0 Presentation Template [.sti]
  98   'stw', #      - Open Office.org 1.0 Text Document Template [.stw]
  99   #svg      - Scalable Vector Graphics [.svg]
 100   'svm', #      - StarView Metafile [.svm]
 101   'swf', #      - Macromedia Flash (SWF) [.swf]
 102   'sxc', #      - OpenOffice.org 1.0 Spreadsheet [.sxc]
 103   'sxd3', #     - StarDraw 3.0 [.sxd]
 104   'sxd5', #     - StarDraw 5.0 [.sxd]
 105   'sxd', #      - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
 106   'sxi', #      - OpenOffice.org 1.0 Presentation [.sxi]
 107   'sxw', #      - Open Office.org 1.0 Text Document [.sxw]
 108   #text     - Text Encoded [.txt]
 109   #tiff     - Tagged Image File Format [.tiff]
 110   #txt      - Text [.txt]
 111   'uop', #      - Unified Office Format presentation [.uop]
 112   'uos', #      - Unified Office Format spreadsheet [.uos]
 113   'uot', #      - Unified Office Format text [.uot]
 114   'vor3', #     - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
 115   'vor4', #     - StarWriter 4.0 Template [.vor]
 116   'vor5', #     - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
 117   'vor', #      - StarCalc 5.0 Template [.vor]
 118   #wmf      - Windows Metafile [.wmf]
 119   'xhtml', #    - XHTML Document [.html]
 120   'xls5', #     - Microsoft Excel 5.0 [.xls]
 121   'xls95', #    - Microsoft Excel 95 [.xls]
 122   'xls', #      - Microsoft Excel 97/2000/XP [.xls]
 123   'xlt5', #     - Microsoft Excel 5.0 Template [.xlt]
 124   'xlt95', #    - Microsoft Excel 95 Template [.xlt]
 125   'xlt', #      - Microsoft Excel 97/2000/XP Template [.xlt]
 126   #xpm      - X PixMap [.xpm]
 127 ]
 128
 129 def is_unoconv_working():
 130     # TODO: must have libreoffice-headless installed too, need to check for it
 131     unoconv = where('unoconv')
 132     if not unoconv:
 133         return False
 134     try:
 135         proc = Popen([unoconv, '--show'], stderr=PIPE)
 136         output = proc.stderr.read()
 137     except OSError, e:
 138         _log.warn(_('unoconv failing to run, check log file'))
 139         return False
 140     if 'ERROR' in output:
 141         return False
 142     return True
 143
 144 def supported_extensions(cache=[None]):
 145     if cache[0] == None:
 146         cache[0] = 'pdf'
 147         if is_unoconv_working():
 148             cache.extend(unoconv_supported)
 149     return cache
 150
 151 def where(name):
 152     for p in os.environ['PATH'].split(os.pathsep):
 153         fullpath = os.path.join(p, name)
 154         if os.path.exists(fullpath):
 155             return fullpath
 156     return None
 157
 158 def check_prerequisites():
 159     if not where('pdfinfo'):
 160         _log.warn('missing pdfinfo')
 161         return False
 162     if not where('pdftocairo'):
 163         _log.warn('missing pdfcairo')
 164         return False
 165     return True
 166
 167 def sniff_handler(media_file, **kw):
 168     _log.info('Sniffing {0}'.format(MEDIA_TYPE))
 169     if not check_prerequisites():
 170         return None
 171     if kw.get('media') is not None:
 172         name, ext = os.path.splitext(kw['media'].filename)
 173         clean_ext = ext[1:].lower()
 174
 175         if clean_ext in supported_extensions():
 176             return MEDIA_TYPE
 177
 178     return None
 179
 180 def create_pdf_thumb(original, thumb_filename, width, height):
 181     # Note: pdftocairo adds '.png', remove it
 182     thumb_filename = thumb_filename[:-4]
 183     executable = where('pdftocairo')
 184     args = [executable, '-scale-to', str(min(width, height)),
 185             '-singlefile', '-png', original, thumb_filename]
 186     _log.debug('calling {0}'.format(repr(' '.join(args))))
 187     Popen(executable=executable, args=args).wait()
 188
 189 def pdf_info(original):
 190     """
 191     Extract dictionary of pdf information. This could use a library instead
 192     of a process.
 193
 194     Note: I'm assuming pdfinfo output is sanitized (integers where integers are
 195     expected, etc.) - if this is wrong then an exception will be raised and caught
 196     leading to the dreaded error page. It seems a safe assumption.
 197     """
 198     ret_dict = {}
 199     pdfinfo = where('pdfinfo')
 200     try:
 201         proc = Popen(executable=pdfinfo,
 202                      args=[pdfinfo, original], stdout=PIPE)
 203         lines = proc.stdout.readlines()
 204     except OSError:
 205         _log.debug('pdfinfo could not read the pdf file.')
 206         raise BadMediaFail()
 207
 208     info_dict = dict([[part.strip() for part in l.strip().split(':', 1)]
 209                       for l in lines if ':' in l])
 210
 211     for date_key in [('pdf_mod_date', 'ModDate'),
 212                      ('pdf_creation_date', 'CreationDate')]:
 213         if date_key in info_dict:
 214             ret_dict[date_key] = dateutil.parser.parse(info_dict[date_key])
 215     for db_key, int_key in [('pdf_pages', 'Pages')]:
 216         if int_key in info_dict:
 217             ret_dict[db_key] = int(info_dict[int_key])
 218
 219     # parse 'PageSize' field: 595 x 842 pts (A4)
 220     page_size_parts = info_dict['Page size'].split()
 221     ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
 222     ret_dict['pdf_page_size_height'] = float(page_size_parts[2])
 223
 224     for db_key, str_key in [('pdf_keywords', 'Keywords'),
 225         ('pdf_creator', 'Creator'), ('pdf_producer', 'Producer'),
 226         ('pdf_author', 'Author'), ('pdf_title', 'Title')]:
 227         ret_dict[db_key] = info_dict.get(str_key, None)
 228     ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
 229         map(int, info_dict['PDF version'].split('.'))
 230
 231     return ret_dict
 232
 233 def process_pdf(proc_state):
 234     """Code to process a pdf file. Will be run by celery.
 235
 236     A Workbench() represents a local tempory dir. It is automatically
 237     cleaned up when this function exits.
 238     """
 239     entry = proc_state.entry
 240     workbench = proc_state.workbench
 241
 242     queued_filename = proc_state.get_queued_filename()
 243     name_builder = FilenameBuilder(queued_filename)
 244
 245     # Copy our queued local workbench to its final destination
 246     original_dest = name_builder.fill('{basename}{ext}')
 247     proc_state.copy_original(original_dest)
 248
 249     # Create a pdf if this is a different doc, store pdf for viewer
 250     ext = queued_filename.rsplit('.', 1)[-1].lower()
 251     if ext == 'pdf':
 252         pdf_filename = queued_filename
 253     else:
 254         pdf_filename = queued_filename.rsplit('.', 1)[0] + '.pdf'
 255         unoconv = where('unoconv')
 256         Popen(executable=unoconv,
 257               args=[unoconv, '-v', '-f', 'pdf', queued_filename]).wait()
 258         if not os.path.exists(pdf_filename):
 259             _log.debug('unoconv failed to convert file to pdf')
 260             raise BadMediaFail()
 261         proc_state.store_public(keyname=u'pdf', local_file=pdf_filename)
 262
 263     pdf_info_dict = pdf_info(pdf_filename)
 264
 265     for name, width, height in [
 266         (u'thumb', mgg.global_config['media:thumb']['max_width'],
 267                    mgg.global_config['media:thumb']['max_height']),
 268         (u'medium', mgg.global_config['media:medium']['max_width'],
 269                    mgg.global_config['media:medium']['max_height']),
 270         ]:
 271         filename = name_builder.fill('{basename}.%s.png' % name)
 272         path = workbench.joinpath(filename)
 273         create_pdf_thumb(pdf_filename, path, width, height)
 274         assert(os.path.exists(path))
 275         proc_state.store_public(keyname=name, local_file=path)
 276
 277     proc_state.delete_queue_file()
 278
 279     entry.media_data_init(**pdf_info_dict)
 280     entry.save()