Added pdf Initial processor
[mediagoblin.git] / mediagoblin / media_types / pdf / processing.py
CommitLineData
a80ebf3b
AL
1# GNU MediaGoblin -- federated, autonomous media hosting
2# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
3#
4# This program is free software: you can redistribute it and/or modify
5# it under the terms of the GNU Affero General Public License as published by
6# the Free Software Foundation, either version 3 of the License, or
7# (at your option) any later version.
8#
9# This program is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12# GNU Affero General Public License for more details.
13#
14# You should have received a copy of the GNU Affero General Public License
15# along with this program. If not, see <http://www.gnu.org/licenses/>.
a80ebf3b 16import os
a80ebf3b
AL
17import logging
18import dateutil.parser
519bcfb0 19from subprocess import PIPE, Popen
a80ebf3b
AL
20
21from mediagoblin import mg_globals as mgg
22from mediagoblin.processing import (create_pub_filepath,
23 FilenameBuilder, BadMediaFail)
24from mediagoblin.tools.translate import fake_ugettext_passthrough as _
25
26_log = logging.getLogger(__name__)
27
51e4e435
RE
28MEDIA_TYPE = 'mediagoblin.media_types.pdf'
29
a80ebf3b
AL
30# TODO - cache (memoize) util
31
# This is a list created via unoconv --show, hand-removing some types that
# we already support better via other media types.
34unoconv_supported = [
35 'bib', # - BibTeX [.bib]
36 #bmp - Windows Bitmap [.bmp]
37 'csv', # - Text CSV [.csv]
38 'dbf', # - dBASE [.dbf]
39 'dif', # - Data Interchange Format [.dif]
40 'doc6', # - Microsoft Word 6.0 [.doc]
41 'doc95', # - Microsoft Word 95 [.doc]
42 'docbook', # - DocBook [.xml]
43 'doc', # - Microsoft Word 97/2000/XP [.doc]
44 'docx7', # - Microsoft Office Open XML [.docx]
45 'docx', # - Microsoft Office Open XML [.docx]
46 #emf - Enhanced Metafile [.emf]
47 'eps', # - Encapsulated PostScript [.eps]
48 'fodp', # - OpenDocument Presentation (Flat XML) [.fodp]
49 'fods', # - OpenDocument Spreadsheet (Flat XML) [.fods]
50 'fodt', # - OpenDocument Text (Flat XML) [.fodt]
51 #gif - Graphics Interchange Format [.gif]
52 'html', # - HTML Document (OpenOffice.org Writer) [.html]
53 #jpg - Joint Photographic Experts Group [.jpg]
54 'latex', # - LaTeX 2e [.ltx]
55 'mediawiki', # - MediaWiki [.txt]
56 'met', # - OS/2 Metafile [.met]
57 'odd', # - OpenDocument Drawing [.odd]
58 'odg', # - ODF Drawing (Impress) [.odg]
59 'odp', # - ODF Presentation [.odp]
60 'ods', # - ODF Spreadsheet [.ods]
61 'odt', # - ODF Text Document [.odt]
62 'ooxml', # - Microsoft Office Open XML [.xml]
63 'otg', # - OpenDocument Drawing Template [.otg]
64 'otp', # - ODF Presentation Template [.otp]
65 'ots', # - ODF Spreadsheet Template [.ots]
66 'ott', # - Open Document Text [.ott]
67 #pbm - Portable Bitmap [.pbm]
68 #pct - Mac Pict [.pct]
69 'pdb', # - AportisDoc (Palm) [.pdb]
70 #pdf - Portable Document Format [.pdf]
71 #pgm - Portable Graymap [.pgm]
72 #png - Portable Network Graphic [.png]
73 'pot', # - Microsoft PowerPoint 97/2000/XP Template [.pot]
74 'potm', # - Microsoft PowerPoint 2007/2010 XML Template [.potm]
75 #ppm - Portable Pixelmap [.ppm]
76 'pps', # - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
77 'ppt', # - Microsoft PowerPoint 97/2000/XP [.ppt]
78 'pptx', # - Microsoft PowerPoint 2007/2010 XML [.pptx]
79 'psw', # - Pocket Word [.psw]
80 'pwp', # - PlaceWare [.pwp]
81 'pxl', # - Pocket Excel [.pxl]
82 #ras - Sun Raster Image [.ras]
83 'rtf', # - Rich Text Format [.rtf]
84 'sda', # - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
85 'sdc3', # - StarCalc 3.0 [.sdc]
86 'sdc4', # - StarCalc 4.0 [.sdc]
87 'sdc', # - StarCalc 5.0 [.sdc]
88 'sdd3', # - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
89 'sdd4', # - StarImpress 4.0 [.sdd]
90 'sdd', # - StarImpress 5.0 [.sdd]
91 'sdw3', # - StarWriter 3.0 [.sdw]
92 'sdw4', # - StarWriter 4.0 [.sdw]
93 'sdw', # - StarWriter 5.0 [.sdw]
94 'slk', # - SYLK [.slk]
95 'stc', # - OpenOffice.org 1.0 Spreadsheet Template [.stc]
96 'std', # - OpenOffice.org 1.0 Drawing Template [.std]
97 'sti', # - OpenOffice.org 1.0 Presentation Template [.sti]
98 'stw', # - Open Office.org 1.0 Text Document Template [.stw]
99 #svg - Scalable Vector Graphics [.svg]
100 'svm', # - StarView Metafile [.svm]
101 'swf', # - Macromedia Flash (SWF) [.swf]
102 'sxc', # - OpenOffice.org 1.0 Spreadsheet [.sxc]
103 'sxd3', # - StarDraw 3.0 [.sxd]
104 'sxd5', # - StarDraw 5.0 [.sxd]
105 'sxd', # - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
106 'sxi', # - OpenOffice.org 1.0 Presentation [.sxi]
107 'sxw', # - Open Office.org 1.0 Text Document [.sxw]
108 #text - Text Encoded [.txt]
109 #tiff - Tagged Image File Format [.tiff]
110 #txt - Text [.txt]
111 'uop', # - Unified Office Format presentation [.uop]
112 'uos', # - Unified Office Format spreadsheet [.uos]
113 'uot', # - Unified Office Format text [.uot]
114 'vor3', # - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
115 'vor4', # - StarWriter 4.0 Template [.vor]
116 'vor5', # - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
117 'vor', # - StarCalc 5.0 Template [.vor]
118 #wmf - Windows Metafile [.wmf]
119 'xhtml', # - XHTML Document [.html]
120 'xls5', # - Microsoft Excel 5.0 [.xls]
121 'xls95', # - Microsoft Excel 95 [.xls]
122 'xls', # - Microsoft Excel 97/2000/XP [.xls]
123 'xlt5', # - Microsoft Excel 5.0 Template [.xlt]
124 'xlt95', # - Microsoft Excel 95 Template [.xlt]
125 'xlt', # - Microsoft Excel 97/2000/XP Template [.xlt]
126 #xpm - X PixMap [.xpm]
127]
128
def is_unoconv_working():
    """
    Report whether a usable ``unoconv`` executable is available.

    Runs ``unoconv --show`` and inspects what it writes to stderr.

    Returns False when unoconv is not found on the PATH, cannot be
    executed, or prints 'ERROR' to stderr; True otherwise.
    """
    # TODO: must have libreoffice-headless installed too, need to check for it
    unoconv = where('unoconv')
    if not unoconv:
        return False
    try:
        proc = Popen([unoconv, '--show'], stderr=PIPE)
        # communicate() drains stderr *and* waits for the child, so no
        # zombie process is left behind.
        output = proc.communicate()[1]
    except OSError:
        _log.warn(_('unoconv failing to run, check log file'))
        return False
    # The pipe yields bytes; a bytes literal keeps this check valid on both
    # Python 2 (where it is a plain str) and Python 3.
    return b'ERROR' not in output
143
def supported_extensions(cache=[None]):
    """
    Return the list of file extensions this media type accepts.

    The mutable default argument is deliberate: it is the memo that keeps
    the result between calls, so the unoconv probe runs at most once per
    process.  A list whose first element is None means "not computed yet".

    Returns the cache list itself: 'pdf', followed by the unoconv formats
    when unoconv is working.
    """
    if cache[0] is None:
        cache[0] = 'pdf'
        if is_unoconv_working():
            cache.extend(unoconv_supported)
    return cache
150
def where(name):
    """
    Locate the executable called `name` on the PATH.

    Each PATH entry is tried in order; the first entry containing a regular
    file named `name` that the current user may execute wins.

    Returns the joined path of the match, or None when PATH is unset/empty
    or no entry holds such a file.
    """
    search_path = os.environ.get('PATH')
    if not search_path:
        # Nothing to search; previously an unset PATH raised KeyError.
        return None
    for directory in search_path.split(os.pathsep):
        candidate = os.path.join(directory, name)
        # A same-named directory or a non-executable file cannot be run, so
        # it must not be reported as found.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None
157
def check_prerequisites():
    """
    Check that the external tools this media type shells out to exist.

    Looks up 'pdfinfo' and 'pdftocairo' with where(); the first one that is
    missing is logged as a warning.

    Returns True when both are found, False otherwise.
    """
    for tool in ('pdfinfo', 'pdftocairo'):
        if not where(tool):
            # Name the tool that was actually looked up (the old message
            # said 'pdfcairo' for the pdftocairo check).
            _log.warn('missing {0}'.format(tool))
            return False
    return True
166
def sniff_handler(media_file, **kw):
    """
    Decide whether an upload should be handled by the pdf media type.

    `media_file` is not inspected; the decision is based solely on the
    extension of ``kw['media'].filename``.

    Returns MEDIA_TYPE when the prerequisites are present and the lowercased
    extension is one of supported_extensions(); None in every other case.
    """
    _log.info('Sniffing {0}'.format(MEDIA_TYPE))

    if not check_prerequisites():
        return None

    media = kw.get('media')
    if media is None:
        return None

    # splitext keeps the leading dot on the extension; drop it before
    # comparing.
    extension = os.path.splitext(media.filename)[1]
    if extension[1:].lower() in supported_extensions():
        return MEDIA_TYPE

    return None
a80ebf3b
AL
179
def create_pdf_thumb(original, thumb_filename, width, height):
    """
    Render a PNG thumbnail of the pdf `original` with pdftocairo.

    The last four characters of `thumb_filename` are dropped before it is
    handed to pdftocairo, because pdftocairo adds '.png' itself.  The
    smaller of `width` and `height` is passed as the '-scale-to' value.
    Blocks until pdftocairo exits; its exit status is not checked.
    """
    # Note: pdftocairo adds '.png', remove it
    output_root = thumb_filename[:-4]
    pdftocairo = where('pdftocairo')
    scale = str(min(width, height))
    command = [pdftocairo, '-scale-to', scale,
               '-singlefile', '-png', original, output_root]
    _log.debug('calling {0}'.format(repr(' '.join(command))))
    Popen(executable=pdftocairo, args=command).wait()
a80ebf3b
AL
188
def pdf_info(original):
    """
    Extract dictionary of pdf information by running ``pdfinfo``. This could
    use a library instead of a process.

    Note: pdfinfo output is assumed to be sanitized (integers where integers
    are expected, etc.) - if this is wrong then an exception will be raised
    and caught leading to the dreaded error page. It seems a safe assumption.

    Returns a dict with the keys 'pdf_page_size_width',
    'pdf_page_size_height', 'pdf_version_major', 'pdf_version_minor',
    'pdf_keywords', 'pdf_creator', 'pdf_producer', 'pdf_author' and
    'pdf_title' (the last five are None when pdfinfo did not report them),
    plus 'pdf_pages', 'pdf_mod_date' and 'pdf_creation_date' when available.

    Raises BadMediaFail when pdfinfo cannot be executed, and KeyError when
    its output lacks the 'Page size' or 'PDF version' field.
    """
    ret_dict = {}
    pdfinfo = where('pdfinfo')
    try:
        proc = Popen(executable=pdfinfo,
                     args=[pdfinfo, original], stdout=PIPE)
        # communicate() reads all of stdout and waits for the child, so the
        # process is reaped instead of lingering as a zombie.
        lines = proc.communicate()[0].splitlines()
    except OSError:
        _log.debug('pdfinfo could not read the pdf file.')
        raise BadMediaFail()

    # Build {field name: value} from the "Name:   value" lines.
    info_dict = {}
    for line in lines:
        if not isinstance(line, str):
            # Python 3 pipes yield bytes; Python 2 already gives str and is
            # left untouched.
            line = line.decode('utf-8', 'replace')
        if ':' in line:
            key, value = line.strip().split(':', 1)
            info_dict[key.strip()] = value.strip()

    # The pairs must be unpacked: testing the whole tuple against info_dict
    # never matched, so the dates were silently dropped.
    for db_key, date_key in [('pdf_mod_date', 'ModDate'),
                             ('pdf_creation_date', 'CreationDate')]:
        if date_key in info_dict:
            try:
                ret_dict[db_key] = dateutil.parser.parse(info_dict[date_key])
            except (ValueError, OverflowError):
                # A date that cannot be parsed is optional metadata; skip
                # it rather than fail the whole upload.
                _log.debug('pdfinfo reported an unparseable {0}: {1!r}'.format(
                    date_key, info_dict[date_key]))
    for db_key, int_key in [('pdf_pages', 'Pages')]:
        if int_key in info_dict:
            ret_dict[db_key] = int(info_dict[int_key])

    # parse 'Page size' field: 595 x 842 pts (A4)
    page_size_parts = info_dict['Page size'].split()
    ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
    ret_dict['pdf_page_size_height'] = float(page_size_parts[2])

    for db_key, str_key in [('pdf_keywords', 'Keywords'),
                            ('pdf_creator', 'Creator'),
                            ('pdf_producer', 'Producer'),
                            ('pdf_author', 'Author'),
                            ('pdf_title', 'Title')]:
        ret_dict[db_key] = info_dict.get(str_key, None)

    # 'PDF version' looks like "1.4": split it into major and minor ints.
    ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
        map(int, info_dict['PDF version'].split('.'))

    return ret_dict
232
a80ebf3b 233
class CommonPdfProcessor(MediaProcessor):
    """
    Provides a base for various pdf processing steps
    """
    def common_setup(self):
        """
        Set up common pdf processing steps
        """
        # Pull down and set up the original file
        self.orig_filename = get_orig_filename(
            self.entry, self.workbench)
        self.name_builder = FilenameBuilder(self.orig_filename)

        self._set_pdf_filename()

    def _set_pdf_filename(self):
        """
        Choose the pdf to work on: the original itself when it already is a
        pdf, otherwise the '{basename}.pdf' that generate_pdf() must create.
        """
        # NOTE(review): copy_original() fills '{basename}{ext}', which
        # suggests ext carries its leading dot; if so this comparison (and
        # the one in generate_pdf) should be against '.pdf' - confirm
        # against FilenameBuilder.
        if self.name_builder.ext == 'pdf':
            self.pdf_filename = self.orig_filename
        else:
            self.pdf_filename = self.name_builder.fill('{basename}.pdf')

    def copy_original(self):
        """
        Keep the uploaded file, stored under '{basename}{ext}'.
        """
        copy_original(
            self.entry, self.orig_filename,
            self.name_builder.fill('{basename}{ext}'))

    def _render_png(self, size, output_root):
        """
        Run pdftocairo on self.pdf_filename to produce a png.

        `size` is a (max_width, max_height) pair; the smaller value is
        passed as '-scale-to', matching create_pdf_thumb().  `output_root`
        is the output name without extension, because pdftocairo adds
        '.png' itself.  The exit status of pdftocairo is not checked.

        Returns output_root + '.png', the name pdftocairo writes to.
        """
        executable = where('pdftocairo')
        args = [executable, '-scale-to', str(min(size)),
                '-singlefile', '-png', self.pdf_filename, output_root]

        _log.debug('calling {0}'.format(repr(' '.join(args))))
        Popen(executable=executable, args=args).wait()

        return output_root + '.png'

    def generate_thumb(self, thumb_size=None):
        """
        Render the thumbnail and store it under the 'thumb' key.

        `thumb_size` is a (max_width, max_height) pair; when omitted the
        'media:thumb' section of the global config supplies it.
        """
        if not thumb_size:
            thumb_size = (mgg.global_config['media:thumb']['max_width'],
                          mgg.global_config['media:thumb']['max_height'])

        # Note: pdftocairo adds '.png', so don't include an ext
        thumb_png = self._render_png(
            thumb_size, self.name_builder.fill('{basename}.thumbnail'))

        # Store the file pdftocairo actually wrote (with '.png').
        store_public(self.entry, 'thumb', thumb_png,
                     self.name_builder.fill('{basename}.thumbnail.png'))

    def generate_pdf(self):
        """
        Store the pdf. If the file is not a pdf, make it a pdf

        Raises BadMediaFail when no file exists at self.pdf_filename after
        the conversion attempt.
        """
        if self.name_builder.ext != 'pdf':
            unoconv = where('unoconv')
            Popen(executable=unoconv,
                  args=[unoconv, '-v', '-f', 'pdf',
                        self.orig_filename]).wait()

        if not os.path.exists(self.pdf_filename):
            _log.debug('unoconv failed to convert file to pdf')
            raise BadMediaFail()

        store_public(self.entry, 'pdf', self.pdf_filename,
                     self.name_builder.fill('{basename}.pdf'))

    def extract_pdf_info(self):
        """
        Read the pdf's metadata with pdf_info() and pass it to the entry's
        media_data_init().
        """
        pdf_info_dict = pdf_info(self.pdf_filename)
        # The entry lives on the processor; a bare `entry` is undefined here.
        self.entry.media_data_init(**pdf_info_dict)

    def generate_medium(self, size=None):
        """
        Render the medium-sized preview and store it under the 'medium' key.

        `size` is a (max_width, max_height) pair; when omitted the
        'media:medium' section of the global config supplies it.
        """
        if not size:
            size = (mgg.global_config['media:medium']['max_width'],
                    mgg.global_config['media:medium']['max_height'])

        # Note: pdftocairo adds '.png', so don't include an ext
        medium_png = self._render_png(
            size, self.name_builder.fill('{basename}.medium'))

        # Use the 'medium' key: storing under 'thumb' would be overwritten
        # by generate_thumb().
        store_public(self.entry, 'medium', medium_png,
                     self.name_builder.fill('{basename}.medium.png'))
315
class InitialProcessor(CommonPdfProcessor):
    """
    Initial processing step for new pdfs
    """
    name = "initial"
    description = "Initial processing"

    @classmethod
    def media_is_eligible(cls, entry=None, state=None):
        """
        Determine if this media type is eligible for processing

        `state` wins when given; otherwise the state is read from `entry`.
        Returns True for the states "unprocessed" and "failed".
        """
        if not state:
            state = entry.state
        return state in (
            "unprocessed", "failed")

    @classmethod
    def generate_parser(cls):
        """
        Build the argparse parser for this processor, accepting the optional
        '--size' and '--thumb-size' options (two integers each).
        """
        # Imported here because the module's import block does not pull in
        # argparse.
        import argparse

        parser = argparse.ArgumentParser(
            description=cls.description,
            prog=cls.name)

        parser.add_argument(
            '--size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        parser.add_argument(
            '--thumb-size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        return parser

    @classmethod
    def args_to_request(cls, args):
        """
        Turn parsed arguments into a request carrying 'size' and
        'thumb_size', via request_from_args().
        """
        return request_from_args(
            args, ['size', 'thumb_size'])

    def process(self, size=None, thumb_size=None):
        """
        Run the full initial pipeline: set up, produce/store the pdf, read
        its metadata, keep the original, render medium and thumbnail images,
        then delete the queue file.
        """
        self.common_setup()
        self.generate_pdf()
        self.extract_pdf_info()
        self.copy_original()
        self.generate_medium(size=size)
        self.generate_thumb(thumb_size=thumb_size)
        self.delete_queue_file()
366
367
class PdfProcessingManager(ProcessingManager):
    """
    Processing manager for the pdf media type; registers InitialProcessor.
    """
    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) resolves to
        # the subclass when this class is inherited from, which makes
        # __init__ call itself forever.
        super(PdfProcessingManager, self).__init__()
        self.add_processor(InitialProcessor)