6a13c8e3656f5e017ad0ade6d2a21817d51e7440
[mediagoblin.git] / mediagoblin / media_types / pdf / processing.py
1 # GNU MediaGoblin -- federated, autonomous media hosting
2 # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 import argparse
17 import os
18 import logging
19 import dateutil.parser
20 from subprocess import PIPE, Popen
21
22 from mediagoblin import mg_globals as mgg
23 from mediagoblin.processing import (
24 FilenameBuilder, BadMediaFail,
25 MediaProcessor, ProcessingManager,
26 request_from_args, get_process_filename,
27 store_public, copy_original)
28 from mediagoblin.tools.translate import fake_ugettext_passthrough as _
29
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Media type identifier; sniff_handler() returns it when it accepts a file.
MEDIA_TYPE = 'mediagoblin.media_types.pdf'

# TODO - cache (memoize) util

# This is a list created via unoconv --show and hand removing some types that
# we already support via other media types better.
#
# Each entry is compared against the lower-cased file extension (without the
# leading dot) in sniff_handler(). Entries left commented out are the formats
# that were removed by hand, kept here for reference.
unoconv_supported = [
    'bib',  # - BibTeX [.bib]
    #bmp - Windows Bitmap [.bmp]
    'csv',  # - Text CSV [.csv]
    'dbf',  # - dBASE [.dbf]
    'dif',  # - Data Interchange Format [.dif]
    'doc6',  # - Microsoft Word 6.0 [.doc]
    'doc95',  # - Microsoft Word 95 [.doc]
    'docbook',  # - DocBook [.xml]
    'doc',  # - Microsoft Word 97/2000/XP [.doc]
    'docx7',  # - Microsoft Office Open XML [.docx]
    'docx',  # - Microsoft Office Open XML [.docx]
    #emf - Enhanced Metafile [.emf]
    'eps',  # - Encapsulated PostScript [.eps]
    'fodp',  # - OpenDocument Presentation (Flat XML) [.fodp]
    'fods',  # - OpenDocument Spreadsheet (Flat XML) [.fods]
    'fodt',  # - OpenDocument Text (Flat XML) [.fodt]
    #gif - Graphics Interchange Format [.gif]
    'html',  # - HTML Document (OpenOffice.org Writer) [.html]
    #jpg - Joint Photographic Experts Group [.jpg]
    'latex',  # - LaTeX 2e [.ltx]
    'mediawiki',  # - MediaWiki [.txt]
    'met',  # - OS/2 Metafile [.met]
    'odd',  # - OpenDocument Drawing [.odd]
    'odg',  # - ODF Drawing (Impress) [.odg]
    'odp',  # - ODF Presentation [.odp]
    'ods',  # - ODF Spreadsheet [.ods]
    'odt',  # - ODF Text Document [.odt]
    'ooxml',  # - Microsoft Office Open XML [.xml]
    'otg',  # - OpenDocument Drawing Template [.otg]
    'otp',  # - ODF Presentation Template [.otp]
    'ots',  # - ODF Spreadsheet Template [.ots]
    'ott',  # - Open Document Text [.ott]
    #pbm - Portable Bitmap [.pbm]
    #pct - Mac Pict [.pct]
    'pdb',  # - AportisDoc (Palm) [.pdb]
    #pdf - Portable Document Format [.pdf]
    #pgm - Portable Graymap [.pgm]
    #png - Portable Network Graphic [.png]
    'pot',  # - Microsoft PowerPoint 97/2000/XP Template [.pot]
    'potm',  # - Microsoft PowerPoint 2007/2010 XML Template [.potm]
    #ppm - Portable Pixelmap [.ppm]
    'pps',  # - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
    'ppt',  # - Microsoft PowerPoint 97/2000/XP [.ppt]
    'pptx',  # - Microsoft PowerPoint 2007/2010 XML [.pptx]
    'psw',  # - Pocket Word [.psw]
    'pwp',  # - PlaceWare [.pwp]
    'pxl',  # - Pocket Excel [.pxl]
    #ras - Sun Raster Image [.ras]
    'rtf',  # - Rich Text Format [.rtf]
    'sda',  # - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
    'sdc3',  # - StarCalc 3.0 [.sdc]
    'sdc4',  # - StarCalc 4.0 [.sdc]
    'sdc',  # - StarCalc 5.0 [.sdc]
    'sdd3',  # - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
    'sdd4',  # - StarImpress 4.0 [.sdd]
    'sdd',  # - StarImpress 5.0 [.sdd]
    'sdw3',  # - StarWriter 3.0 [.sdw]
    'sdw4',  # - StarWriter 4.0 [.sdw]
    'sdw',  # - StarWriter 5.0 [.sdw]
    'slk',  # - SYLK [.slk]
    'stc',  # - OpenOffice.org 1.0 Spreadsheet Template [.stc]
    'std',  # - OpenOffice.org 1.0 Drawing Template [.std]
    'sti',  # - OpenOffice.org 1.0 Presentation Template [.sti]
    'stw',  # - Open Office.org 1.0 Text Document Template [.stw]
    #svg - Scalable Vector Graphics [.svg]
    'svm',  # - StarView Metafile [.svm]
    'swf',  # - Macromedia Flash (SWF) [.swf]
    'sxc',  # - OpenOffice.org 1.0 Spreadsheet [.sxc]
    'sxd3',  # - StarDraw 3.0 [.sxd]
    'sxd5',  # - StarDraw 5.0 [.sxd]
    'sxd',  # - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
    'sxi',  # - OpenOffice.org 1.0 Presentation [.sxi]
    'sxw',  # - Open Office.org 1.0 Text Document [.sxw]
    #text - Text Encoded [.txt]
    #tiff - Tagged Image File Format [.tiff]
    #txt - Text [.txt]
    'uop',  # - Unified Office Format presentation [.uop]
    'uos',  # - Unified Office Format spreadsheet [.uos]
    'uot',  # - Unified Office Format text [.uot]
    'vor3',  # - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
    'vor4',  # - StarWriter 4.0 Template [.vor]
    'vor5',  # - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
    'vor',  # - StarCalc 5.0 Template [.vor]
    #wmf - Windows Metafile [.wmf]
    'xhtml',  # - XHTML Document [.html]
    'xls5',  # - Microsoft Excel 5.0 [.xls]
    'xls95',  # - Microsoft Excel 95 [.xls]
    'xls',  # - Microsoft Excel 97/2000/XP [.xls]
    'xlt5',  # - Microsoft Excel 5.0 Template [.xlt]
    'xlt95',  # - Microsoft Excel 95 Template [.xlt]
    'xlt',  # - Microsoft Excel 97/2000/XP Template [.xlt]
    #xpm - X PixMap [.xpm]
    ]
132
def is_unoconv_working():
    """
    Check whether the ``unoconv`` converter can be found and run.

    Looks up ``unoconv`` via where(), runs ``unoconv --show`` and inspects
    what it wrote to stderr.

    Returns True when the executable was found, could be started, and its
    stderr output does not contain ``ERROR``; False otherwise.
    """
    # TODO: must have libreoffice-headless installed too, need to check for it
    unoconv = where('unoconv')
    if not unoconv:
        return False
    try:
        proc = Popen([unoconv, '--show'], stderr=PIPE)
        # communicate() drains stderr *and* waits for the child, so the
        # process is reaped instead of being left behind as a zombie.
        output = proc.communicate()[1]
    except OSError:
        _log.warning(_('unoconv failing to run, check log file'))
        return False
    return b'ERROR' not in output
147
def supported_extensions(cache=[None]):
    """
    Return the list of file extensions this media type accepts.

    The list always starts with ``'pdf'``; the entries of
    ``unoconv_supported`` are appended when is_unoconv_working() reports
    that unoconv is usable.

    The mutable default argument is deliberate: it is the memoization store
    shared by every call, so the unoconv probe runs only the first time.
    A ``cache`` whose first element is ``None`` is treated as "not yet
    computed" and is filled in place; any other list is returned untouched.
    """
    if cache[0] is None:
        cache[0] = 'pdf'
        if is_unoconv_working():
            cache.extend(unoconv_supported)
    return cache
154
def where(name):
    """
    Locate an executable called ``name`` on the search path.

    Each directory of the ``PATH`` environment variable is tried in order
    (``os.defpath`` is used when ``PATH`` is not set at all, instead of
    failing with KeyError).

    Returns the full path of the first regular, executable file found, or
    None when there is no match. Directories and non-executable files that
    merely share the name are skipped, since callers hand the result
    straight to Popen.
    """
    search_path = os.environ.get('PATH', os.defpath)
    for directory in search_path.split(os.pathsep):
        candidate = os.path.join(directory, name)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None
161
def check_prerequisites():
    """
    Verify that the external tools this media type relies on are available.

    Both ``pdfinfo`` and ``pdftocairo`` must be found by where(). The first
    missing tool is logged as a warning.

    Returns True when both tools were found, False otherwise.
    """
    if not where('pdfinfo'):
        _log.warning('missing pdfinfo')
        return False
    if not where('pdftocairo'):
        # The tool looked up is pdftocairo; name it correctly in the log.
        _log.warning('missing pdftocairo')
        return False
    return True
170
def sniff_handler(media_file, filename):
    """
    Decide whether this media type should handle an uploaded file.

    The decision is based only on the extension of ``filename`` (compared
    case-insensitively, without the leading dot) against
    supported_extensions(); ``media_file`` is not inspected.

    Returns MEDIA_TYPE when the prerequisites are present and the extension
    is supported, None otherwise.
    """
    _log.info('Sniffing {0}'.format(MEDIA_TYPE))
    if check_prerequisites():
        extension = os.path.splitext(filename)[1]
        if extension[1:].lower() in supported_extensions():
            return MEDIA_TYPE
    return None
181
def create_pdf_thumb(original, thumb_filename, width, height):
    """
    Render the pdf ``original`` to a single png with ``pdftocairo``.

    ``thumb_filename`` is the desired png path. pdftocairo appends '.png' to
    the output name it is given, so a trailing '.png' is removed before the
    call. The image is scaled with ``-scale-to`` using the smaller of
    ``width`` and ``height``.

    A non-zero exit status of pdftocairo is logged as a warning; nothing is
    returned.
    """
    # Only strip a real '.png' suffix; blindly dropping the last four
    # characters would mangle any other file name.
    if thumb_filename.lower().endswith('.png'):
        thumb_filename = thumb_filename[:-4]
    executable = where('pdftocairo')
    args = [executable, '-scale-to', str(min(width, height)),
            '-singlefile', '-png', original, thumb_filename]
    _log.debug('calling {0}'.format(repr(' '.join(args))))
    returncode = Popen(executable=executable, args=args).wait()
    if returncode:
        _log.warning('pdftocairo exited with status {0}'.format(returncode))
190
def pdf_info(original):
    """
    Extract dictionary of pdf information. This could use a library instead
    of a process.

    Runs ``pdfinfo`` on the file ``original`` and parses its
    ``Key: value`` output lines.

    Returns a dict with the keys ``pdf_page_size_width``,
    ``pdf_page_size_height``, ``pdf_version_major``, ``pdf_version_minor``,
    ``pdf_keywords``, ``pdf_creator``, ``pdf_producer``, ``pdf_author`` and
    ``pdf_title`` (the five string fields are None when pdfinfo did not
    report them), plus ``pdf_pages``, ``pdf_mod_date`` and
    ``pdf_creation_date`` when the corresponding pdfinfo field is present
    (and, for the dates, parseable).

    Raises BadMediaFail when pdfinfo cannot be started or its output has no
    'Page size' field.

    Note: pdfinfo output is otherwise assumed to be sanitized (integers
    where integers are expected, a 'PDF version' field being present,
    etc.) - if this is wrong then an exception (ValueError, KeyError, ...)
    propagates to the caller.
    """
    ret_dict = {}
    pdfinfo = where('pdfinfo')
    try:
        proc = Popen(executable=pdfinfo,
                     args=[pdfinfo, original], stdout=PIPE)
        # communicate() reads all of stdout and waits for the child, so the
        # process is reaped rather than left behind.
        raw_output = proc.communicate()[0]
    except OSError:
        _log.debug('pdfinfo could not read the pdf file.')
        raise BadMediaFail()

    # Build {field name: value} from every "Key: value" line; only the
    # first colon separates key from value (values such as dates contain
    # colons themselves).
    info_dict = {}
    for raw_line in raw_output.split(b'\n'):
        line = raw_line.decode('utf-8', 'replace')
        if ':' not in line:
            continue
        key, value = line.strip().split(':', 1)
        info_dict[key.strip()] = value.strip()

    if 'Page size' not in info_dict:
        # TODO - message is for the user, not debug, but BadMediaFail not
        # taking an argument, fix that.
        _log.debug('Missing "Page size" key in returned pdf - conversion failed?')
        raise BadMediaFail()

    # The pairs must be unpacked: testing the whole tuple against info_dict
    # never matches, which silently dropped both dates.
    # NOTE(review): the result keys are taken verbatim from the original
    # list - confirm they match what media_data_init() accepts.
    for db_key, date_key in [('pdf_mod_date', 'ModDate'),
                             ('pdf_creation_date', 'CreationDate')]:
        if date_key in info_dict:
            try:
                ret_dict[db_key] = dateutil.parser.parse(info_dict[date_key])
            except (ValueError, OverflowError):
                # A malformed date is optional metadata; skip it rather than
                # failing the whole upload.
                _log.debug('Could not parse pdf date field {0}: {1!r}'.format(
                    date_key, info_dict[date_key]))
    for db_key, int_key in [('pdf_pages', 'Pages')]:
        if int_key in info_dict:
            ret_dict[db_key] = int(info_dict[int_key])

    # parse 'PageSize' field: 595 x 842 pts (A4)
    page_size_parts = info_dict['Page size'].split()
    ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
    ret_dict['pdf_page_size_height'] = float(page_size_parts[2])

    for db_key, str_key in [('pdf_keywords', 'Keywords'),
                            ('pdf_creator', 'Creator'),
                            ('pdf_producer', 'Producer'),
                            ('pdf_author', 'Author'),
                            ('pdf_title', 'Title')]:
        ret_dict[db_key] = info_dict.get(str_key, None)
    ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
        map(int, info_dict['PDF version'].split('.'))

    return ret_dict
240
241
class CommonPdfProcessor(MediaProcessor):
    """
    Provides a base for various pdf processing steps

    Subclasses call common_setup() first; it populates
    ``self.process_filename``, ``self.name_builder`` and
    ``self.pdf_filename``, which the other methods rely on.
    """
    # Media file keys handed to get_process_filename() in common_setup().
    acceptable_files = ['original', 'pdf']

    def common_setup(self):
        """
        Set up common pdf processing steps
        """
        # Pull down and set up the processing file
        self.process_filename = get_process_filename(
            self.entry, self.workbench, self.acceptable_files)
        self.name_builder = FilenameBuilder(self.process_filename)

        self._set_pdf_filename()

    def _set_pdf_filename(self):
        """
        Set ``self.pdf_filename`` to a local pdf version of the media.

        Uses the processing file itself when it already is a pdf, otherwise
        a previously stored 'pdf' media file, otherwise a freshly converted
        one from _generate_pdf().
        """
        if self.name_builder.ext == '.pdf':
            self.pdf_filename = self.process_filename
        elif self.entry.media_files.get('pdf'):
            self.pdf_filename = self.workbench.localized_file(
                mgg.public_store, self.entry.media_files['pdf'])
        else:
            self.pdf_filename = self._generate_pdf()

    def _skip_processing(self, keyname, **kwargs):
        """
        Tell whether the file ``keyname`` already exists with the same size.

        Returns False when the entry has no file metadata for ``keyname``.
        For 'thumb' the ``thumb_size`` keyword, and for 'medium' the
        ``size`` keyword, is compared with the stored metadata; the result
        is True only when they are equal. Any other ``keyname`` that has
        metadata yields True.
        """
        file_metadata = self.entry.get_file_metadata(keyname)

        if not file_metadata:
            return False

        if keyname == 'thumb':
            return kwargs.get('thumb_size') == file_metadata.get('thumb_size')
        if keyname == 'medium':
            return kwargs.get('size') == file_metadata.get('size')

        return True

    def copy_original(self):
        """
        Store the processing file as the entry's original, keeping its
        base name and extension.
        """
        copy_original(
            self.entry, self.process_filename,
            self.name_builder.fill('{basename}{ext}'))

    def _render_png(self, keyname, label, size):
        """
        Render ``self.pdf_filename`` to a png and store it publicly.

        ``keyname`` is the media file key passed to store_public(),
        ``label`` the file name part between the base name and '.png', and
        ``size`` a (max_width, max_height) pair whose smaller value is given
        to pdftocairo's ``-scale-to``.
        """
        # Note: pdftocairo adds '.png', so don't include an ext
        output_base = os.path.join(
            self.workbench.dir,
            self.name_builder.fill('{basename}.' + label))

        executable = where('pdftocairo')
        args = [executable, '-scale-to', str(min(size)),
                '-singlefile', '-png', self.pdf_filename, output_base]

        _log.debug('calling {0}'.format(repr(' '.join(args))))
        returncode = Popen(executable=executable, args=args).wait()
        if returncode:
            # Surface the failure in the log; store_public() below is still
            # attempted exactly as before.
            _log.warning(
                'pdftocairo exited with status {0}'.format(returncode))

        # since pdftocairo added '.png', we need to include it with the
        # filename
        store_public(self.entry, keyname, output_base + '.png',
                     self.name_builder.fill('{basename}.' + label + '.png'))

    def generate_thumb(self, thumb_size=None):
        """
        Create and store the thumbnail png.

        ``thumb_size`` is a (max_width, max_height) pair; when omitted the
        values from the 'media:thumb' section of the global config are
        used. Nothing is done when a thumbnail with the same size is already
        recorded in the entry's file metadata.
        """
        if not thumb_size:
            thumb_size = (mgg.global_config['media:thumb']['max_width'],
                          mgg.global_config['media:thumb']['max_height'])

        if self._skip_processing('thumb', thumb_size=thumb_size):
            return

        self._render_png('thumb', 'thumbnail', thumb_size)

        self.entry.set_file_metadata('thumb', thumb_size=thumb_size)

    def _generate_pdf(self):
        """
        Store the pdf. If the file is not a pdf, make it a pdf

        Converts ``self.process_filename`` with unoconv, stores the result
        under the 'pdf' key and returns a local path to the stored file.
        Raises BadMediaFail when the converted file does not exist
        afterwards.
        """
        # Path where the converted pdf is expected: same location and base
        # name as the input, with a '.pdf' extension.
        tmp_pdf = os.path.splitext(self.process_filename)[0] + '.pdf'

        unoconv = where('unoconv')
        args = [unoconv, '-v', '-f', 'pdf', self.process_filename]
        _log.debug('calling %s' % repr(args))
        Popen(executable=unoconv,
              args=args).wait()

        if not os.path.exists(tmp_pdf):
            _log.debug('unoconv failed to convert file to pdf')
            raise BadMediaFail()

        store_public(self.entry, 'pdf', tmp_pdf,
                     self.name_builder.fill('{basename}.pdf'))

        return self.workbench.localized_file(
            mgg.public_store, self.entry.media_files['pdf'])

    def extract_pdf_info(self):
        """
        Read the pdf's metadata with pdf_info() and hand it to the entry's
        media_data_init().
        """
        pdf_info_dict = pdf_info(self.pdf_filename)
        self.entry.media_data_init(**pdf_info_dict)

    def generate_medium(self, size=None):
        """
        Create and store the medium-sized png.

        ``size`` is a (max_width, max_height) pair; when omitted the values
        from the 'media:medium' section of the global config are used.
        Nothing is done when a medium file with the same size is already
        recorded in the entry's file metadata.
        """
        if not size:
            size = (mgg.global_config['media:medium']['max_width'],
                    mgg.global_config['media:medium']['max_height'])

        if self._skip_processing('medium', size=size):
            return

        self._render_png('medium', 'medium', size)

        self.entry.set_file_metadata('medium', size=size)
367
368
class InitialProcessor(CommonPdfProcessor):
    """
    First processing pass, run on newly submitted pdfs
    """
    name = "initial"
    description = "Initial processing"

    @classmethod
    def media_is_eligible(cls, entry=None, state=None):
        """
        Report whether the media may go through this processor.

        ``state`` wins when given; otherwise the state is read from
        ``entry``. Eligible states are "unprocessed" and "failed".
        """
        current_state = state or entry.state
        return current_state in ("unprocessed", "failed")

    @classmethod
    def generate_parser(cls):
        """
        Build the argument parser for this processor: optional ``--size``
        and ``--thumb-size``, each taking two integers.
        """
        parser = argparse.ArgumentParser(
            description=cls.description,
            prog=cls.name)

        # Both options share the same shape: max_width then max_height.
        for option in ('--size', '--thumb-size'):
            parser.add_argument(
                option,
                nargs=2,
                metavar=('max_width', 'max_height'),
                type=int)

        return parser

    @classmethod
    def args_to_request(cls, args):
        """
        Turn parsed arguments into a request via request_from_args(),
        picking the 'size' and 'thumb_size' values.
        """
        wanted = ['size', 'thumb_size']
        return request_from_args(args, wanted)

    def process(self, size=None, thumb_size=None):
        """
        Run the full pipeline: set up, extract pdf metadata, store the
        original, render medium and thumbnail, then drop the queue file.
        """
        self.common_setup()
        self.extract_pdf_info()
        self.copy_original()
        self.generate_medium(size=size)
        self.generate_thumb(thumb_size=thumb_size)
        self.delete_queue_file()
418
419
class Resizer(CommonPdfProcessor):
    """
    Resizing process steps for processed pdfs
    """
    name = 'resize'
    description = 'Resize thumbnail and medium'
    # NOTE(review): not referenced anywhere in this file; purpose unclear -
    # confirm before changing or removing.
    thumb_size = 'size'

    @classmethod
    def media_is_eligible(cls, entry=None, state=None):
        """
        Determine if this media type is eligible for processing

        ``state`` is used when given, otherwise ``entry.state``. Only the
        exact state 'processed' is eligible.
        """
        if not state:
            state = entry.state
        # Compare for equality: ``state in 'processed'`` is a substring
        # test and would also accept values such as 'process' or 'ed'.
        return state == 'processed'

    @classmethod
    def generate_parser(cls):
        """
        Build the argument parser for this processor: optional ``--size``
        (two integers) and a positional ``file`` that must be 'medium' or
        'thumb'.
        """
        parser = argparse.ArgumentParser(
            description=cls.description,
            prog=cls.name)

        parser.add_argument(
            '--size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        parser.add_argument(
            'file',
            choices=['medium', 'thumb'])

        return parser

    @classmethod
    def args_to_request(cls, args):
        """
        Turn parsed arguments into a request via request_from_args(),
        picking the 'size' and 'file' values.
        """
        return request_from_args(
            args, ['size', 'file'])

    def process(self, file, size=None):
        """
        Regenerate one derived image.

        ``file`` selects which one ('medium' or 'thumb'); ``size`` is the
        optional (max_width, max_height) pair passed on to the generator.
        Any other ``file`` value results in no work being done.
        """
        self.common_setup()
        if file == 'medium':
            self.generate_medium(size=size)
        elif file == 'thumb':
            self.generate_thumb(thumb_size=size)
466
467
class PdfProcessingManager(ProcessingManager):
    """
    Processing manager for the pdf media type; registers InitialProcessor
    and Resizer.
    """
    def __init__(self):
        super(PdfProcessingManager, self).__init__()
        self.add_processor(InitialProcessor)
        self.add_processor(Resizer)

    def workflow(self, entry, manager, feed_url, reprocess_action,
                 reprocess_info=None):
        """
        Queue processing of ``entry`` as an asynchronous ProcessMedia task.

        The task receives ``entry.id``, ``feed_url``, ``reprocess_action``
        and ``reprocess_info`` as positional arguments and is submitted
        under the task id ``entry.queued_task_id``. ``manager`` is not used
        here.
        """
        # ProcessMedia was referenced without ever being imported, which
        # raises NameError on the first call. Imported at call time so the
        # module's import order is not affected.
        # NOTE(review): import path assumed to be
        # mediagoblin.processing.task - confirm.
        from mediagoblin.processing.task import ProcessMedia

        ProcessMedia().apply_async(
            [entry.id, feed_url, reprocess_action, reprocess_info], {},
            task_id=entry.queued_task_id)