Merge remote-tracking branch 'refs/remotes/brandoninvergo/pyconfigure' into merge...
[mediagoblin.git] / mediagoblin / media_types / pdf / processing.py
1 # GNU MediaGoblin -- federated, autonomous media hosting
2 # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 import os
17 import logging
18 import dateutil.parser
19 from subprocess import PIPE, Popen
20
21 from mediagoblin import mg_globals as mgg
22 from mediagoblin.processing import (create_pub_filepath,
23 FilenameBuilder, BadMediaFail)
24 from mediagoblin.tools.translate import fake_ugettext_passthrough as _
25
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Identifier returned by sniff_handler() when a file is accepted.
MEDIA_TYPE = 'mediagoblin.media_types.pdf'

# TODO - cache (memoize) util

# Formats that are handed to unoconv for conversion to PDF.
# This is a list created via `unoconv --show` and hand removing some types that
# we already support via other media types better.  The removed types are kept
# as commented-out entries so the list can be diffed against unoconv's output.
unoconv_supported = [
    'bib', # - BibTeX [.bib]
    #bmp - Windows Bitmap [.bmp]
    'csv', # - Text CSV [.csv]
    'dbf', # - dBASE [.dbf]
    'dif', # - Data Interchange Format [.dif]
    'doc6', # - Microsoft Word 6.0 [.doc]
    'doc95', # - Microsoft Word 95 [.doc]
    'docbook', # - DocBook [.xml]
    'doc', # - Microsoft Word 97/2000/XP [.doc]
    'docx7', # - Microsoft Office Open XML [.docx]
    'docx', # - Microsoft Office Open XML [.docx]
    #emf - Enhanced Metafile [.emf]
    'eps', # - Encapsulated PostScript [.eps]
    'fodp', # - OpenDocument Presentation (Flat XML) [.fodp]
    'fods', # - OpenDocument Spreadsheet (Flat XML) [.fods]
    'fodt', # - OpenDocument Text (Flat XML) [.fodt]
    #gif - Graphics Interchange Format [.gif]
    'html', # - HTML Document (OpenOffice.org Writer) [.html]
    #jpg - Joint Photographic Experts Group [.jpg]
    'latex', # - LaTeX 2e [.ltx]
    'mediawiki', # - MediaWiki [.txt]
    'met', # - OS/2 Metafile [.met]
    'odd', # - OpenDocument Drawing [.odd]
    'odg', # - ODF Drawing (Impress) [.odg]
    'odp', # - ODF Presentation [.odp]
    'ods', # - ODF Spreadsheet [.ods]
    'odt', # - ODF Text Document [.odt]
    'ooxml', # - Microsoft Office Open XML [.xml]
    'otg', # - OpenDocument Drawing Template [.otg]
    'otp', # - ODF Presentation Template [.otp]
    'ots', # - ODF Spreadsheet Template [.ots]
    'ott', # - Open Document Text [.ott]
    #pbm - Portable Bitmap [.pbm]
    #pct - Mac Pict [.pct]
    'pdb', # - AportisDoc (Palm) [.pdb]
    #pdf - Portable Document Format [.pdf]
    #pgm - Portable Graymap [.pgm]
    #png - Portable Network Graphic [.png]
    'pot', # - Microsoft PowerPoint 97/2000/XP Template [.pot]
    'potm', # - Microsoft PowerPoint 2007/2010 XML Template [.potm]
    #ppm - Portable Pixelmap [.ppm]
    'pps', # - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
    'ppt', # - Microsoft PowerPoint 97/2000/XP [.ppt]
    'pptx', # - Microsoft PowerPoint 2007/2010 XML [.pptx]
    'psw', # - Pocket Word [.psw]
    'pwp', # - PlaceWare [.pwp]
    'pxl', # - Pocket Excel [.pxl]
    #ras - Sun Raster Image [.ras]
    'rtf', # - Rich Text Format [.rtf]
    'sda', # - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
    'sdc3', # - StarCalc 3.0 [.sdc]
    'sdc4', # - StarCalc 4.0 [.sdc]
    'sdc', # - StarCalc 5.0 [.sdc]
    'sdd3', # - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
    'sdd4', # - StarImpress 4.0 [.sdd]
    'sdd', # - StarImpress 5.0 [.sdd]
    'sdw3', # - StarWriter 3.0 [.sdw]
    'sdw4', # - StarWriter 4.0 [.sdw]
    'sdw', # - StarWriter 5.0 [.sdw]
    'slk', # - SYLK [.slk]
    'stc', # - OpenOffice.org 1.0 Spreadsheet Template [.stc]
    'std', # - OpenOffice.org 1.0 Drawing Template [.std]
    'sti', # - OpenOffice.org 1.0 Presentation Template [.sti]
    'stw', # - Open Office.org 1.0 Text Document Template [.stw]
    #svg - Scalable Vector Graphics [.svg]
    'svm', # - StarView Metafile [.svm]
    'swf', # - Macromedia Flash (SWF) [.swf]
    'sxc', # - OpenOffice.org 1.0 Spreadsheet [.sxc]
    'sxd3', # - StarDraw 3.0 [.sxd]
    'sxd5', # - StarDraw 5.0 [.sxd]
    'sxd', # - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
    'sxi', # - OpenOffice.org 1.0 Presentation [.sxi]
    'sxw', # - Open Office.org 1.0 Text Document [.sxw]
    #text - Text Encoded [.txt]
    #tiff - Tagged Image File Format [.tiff]
    #txt - Text [.txt]
    'uop', # - Unified Office Format presentation [.uop]
    'uos', # - Unified Office Format spreadsheet [.uos]
    'uot', # - Unified Office Format text [.uot]
    'vor3', # - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
    'vor4', # - StarWriter 4.0 Template [.vor]
    'vor5', # - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
    'vor', # - StarCalc 5.0 Template [.vor]
    #wmf - Windows Metafile [.wmf]
    'xhtml', # - XHTML Document [.html]
    'xls5', # - Microsoft Excel 5.0 [.xls]
    'xls95', # - Microsoft Excel 95 [.xls]
    'xls', # - Microsoft Excel 97/2000/XP [.xls]
    'xlt5', # - Microsoft Excel 5.0 Template [.xlt]
    'xlt95', # - Microsoft Excel 95 Template [.xlt]
    'xlt', # - Microsoft Excel 97/2000/XP Template [.xlt]
    #xpm - X PixMap [.xpm]
    ]
128
def is_unoconv_working():
    """Report whether a usable ``unoconv`` executable is available.

    Looks ``unoconv`` up via ``where()`` and runs ``unoconv --show``.  The
    tool is treated as broken when it cannot be started or when its stderr
    output contains the word ``ERROR``.

    Returns:
        True if ``unoconv --show`` ran without reporting an error, otherwise
        False.
    """
    # TODO: must have libreoffice-headless installed too, need to check for it
    unoconv = where('unoconv')
    if not unoconv:
        return False
    try:
        proc = Popen([unoconv, '--show'], stderr=PIPE)
        # communicate() drains the pipe and also waits for the child, so the
        # process is reaped instead of lingering as a zombie.
        output = proc.communicate()[1]
    except OSError:
        _log.warning(_('unoconv failing to run, check log file'))
        return False
    # The pipe yields bytes on Python 3; normalise so the substring test
    # below works on both Python 2 and Python 3.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')
    return 'ERROR' not in output
143
def supported_extensions(cache=[None]):
    """Return the list of file extensions this media type accepts.

    ``'pdf'`` is always included; the entries of ``unoconv_supported`` are
    added when ``is_unoconv_working()`` returns True.

    The mutable default argument is deliberate: it memoizes the result so
    that unoconv is probed only once per process.  The returned list *is*
    that memo, so callers should treat it as read-only.

    Args:
        cache: List used as the memo.  A first element of ``None`` means
            "not computed yet"; any other first element means the list is
            already populated and is returned unchanged.

    Returns:
        The (shared) list of supported extensions.
    """
    if cache[0] is None:
        # Probe first and only then mark the cache as initialised, so an
        # unexpected exception from the probe cannot leave a permanent,
        # half-filled ['pdf'] memo behind.
        extras = unoconv_supported if is_unoconv_working() else []
        cache[0] = 'pdf'
        cache.extend(extras)
    return cache
150
def where(name):
    """Locate the executable *name* on the search path.

    Each directory of the ``PATH`` environment variable is tried in order.
    When ``PATH`` is not set at all, ``os.defpath`` is searched instead of
    raising ``KeyError``.

    Args:
        name: File name of the program to look for, e.g. ``'pdfinfo'``.

    Returns:
        The joined path of the first match, or None when nothing matches.
    """
    search_path = os.environ.get('PATH', os.defpath)
    for directory in search_path.split(os.pathsep):
        candidate = os.path.join(directory, name)
        # A bare existence test would also accept directories and files
        # without execute permission, which cannot be launched by Popen.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None
157
def check_prerequisites():
    """Check that the external tools this module shells out to are present.

    Both ``pdfinfo`` and ``pdftocairo`` must be found by ``where()``.  The
    first missing tool is logged as a warning.

    Returns:
        True when both tools were found, False otherwise.
    """
    for tool in ('pdfinfo', 'pdftocairo'):
        if not where(tool):
            # Log the real tool name; the old message said "pdfcairo",
            # which is not the name of the program being looked up.
            _log.warning('missing {0}'.format(tool))
            return False
    return True
166
def sniff_handler(media_file, **kw):
    """Decide whether an upload should be handled by this media type.

    The decision is based purely on the file extension of ``kw['media']``;
    ``media_file`` itself is not inspected.

    Args:
        media_file: Unused here.
        **kw: May contain ``media``, an object with a ``filename`` attribute.

    Returns:
        ``MEDIA_TYPE`` when the prerequisites are installed and the
        extension is one of ``supported_extensions()``, otherwise None.
    """
    _log.info('Sniffing {0}'.format(MEDIA_TYPE))

    if not check_prerequisites():
        return None

    media = kw.get('media')
    if media is None:
        return None

    # splitext keeps the leading dot on the extension; drop it and
    # compare case-insensitively.
    extension = os.path.splitext(media.filename)[1][1:].lower()
    if extension in supported_extensions():
        return MEDIA_TYPE
    return None
179
def create_pdf_thumb(original, thumb_filename, width, height):
    """Render a PNG preview of a PDF by running ``pdftocairo``.

    Args:
        original: Path of the PDF to render.
        thumb_filename: Path of the PNG to produce.  It is expected to end
            in ``.png``; if it does not, pdftocairo will still append
            ``.png`` to it, so the file ends up at ``thumb_filename + '.png'``.
        width: Maximum width; together with *height* it determines the
            value passed to ``-scale-to``.
        height: Maximum height.

    Returns:
        None.  Callers should check that the output file exists; a non-zero
        exit status of pdftocairo is only logged.
    """
    # Note: pdftocairo adds '.png' itself, so pass the name without it.
    # Strip only when the suffix is really there rather than blindly
    # chopping the last four characters of an arbitrary name.
    if thumb_filename.lower().endswith('.png'):
        output_root = thumb_filename[:-4]
    else:
        output_root = thumb_filename
    executable = where('pdftocairo')
    args = [executable, '-scale-to', str(min(width, height)),
            '-singlefile', '-png', original, output_root]
    _log.debug('calling {0}'.format(repr(' '.join(args))))
    returncode = Popen(executable=executable, args=args).wait()
    if returncode != 0:
        # Previously a failure here was completely silent.
        _log.warning(
            'pdftocairo exited with status {0}'.format(returncode))
188
def pdf_info(original):
    """
    Extract dictionary of pdf information by running ``pdfinfo``.  This could
    use a library instead of a process.

    The ``Key: value`` lines printed by pdfinfo are turned into these keys:

    - ``pdf_mod_date`` / ``pdf_creation_date``: parsed with dateutil, only
      present when pdfinfo reports them and they can be parsed
    - ``pdf_pages``: int, only present when pdfinfo reports ``Pages``
    - ``pdf_page_size_width`` / ``pdf_page_size_height``: floats
    - ``pdf_keywords``, ``pdf_creator``, ``pdf_producer``, ``pdf_author``,
      ``pdf_title``: strings, or None when absent
    - ``pdf_version_major`` / ``pdf_version_minor``: ints

    Args:
        original: Path of the PDF file to inspect.

    Returns:
        The dictionary described above.

    Raises:
        BadMediaFail: pdfinfo is missing or cannot be started, or its output
            lacks a usable ``Page size`` or ``PDF version`` field.
    """
    pdfinfo = where('pdfinfo')
    if not pdfinfo:
        _log.debug('pdfinfo executable not found.')
        raise BadMediaFail()
    try:
        proc = Popen(executable=pdfinfo,
                     args=[pdfinfo, original], stdout=PIPE)
        # communicate() reads everything and waits for the child to exit.
        raw_output = proc.communicate()[0]
    except OSError:
        _log.debug('pdfinfo could not read the pdf file.')
        raise BadMediaFail()
    if proc.returncode != 0:
        _log.debug('pdfinfo exited with status {0}'.format(proc.returncode))

    # The pipe yields bytes on Python 3; on Python 2 it is already a str
    # and is left untouched.
    if not isinstance(raw_output, str):
        raw_output = raw_output.decode('utf-8', 'replace')

    info_dict = {}
    for line in raw_output.split('\n'):
        if ':' not in line:
            continue
        # Split on the first colon only: values such as dates contain colons.
        key, value = line.split(':', 1)
        info_dict[key.strip()] = value.strip()

    ret_dict = {}

    # The loop used to bind the whole (db_key, pdfinfo_key) tuple to a single
    # name, so the membership test never matched and dates were never stored.
    # NOTE(review): now that dates are actually returned, confirm that
    # 'pdf_mod_date' and 'pdf_creation_date' are names the media data
    # accepts -- TODO verify against the model.
    for db_key, date_key in [('pdf_mod_date', 'ModDate'),
                             ('pdf_creation_date', 'CreationDate')]:
        if date_key not in info_dict:
            continue
        try:
            ret_dict[db_key] = dateutil.parser.parse(info_dict[date_key])
        except (ValueError, OverflowError):
            # Dates are optional metadata; an odd date string should not
            # make the whole upload fail.
            _log.debug('could not parse {0}: {1!r}'.format(
                date_key, info_dict[date_key]))

    for db_key, int_key in [('pdf_pages', 'Pages')]:
        if int_key in info_dict:
            ret_dict[db_key] = int(info_dict[int_key])

    try:
        # parse 'Page size' field: 595 x 842 pts (A4)
        page_size_parts = info_dict['Page size'].split()
        ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
        ret_dict['pdf_page_size_height'] = float(page_size_parts[2])

        # 'PDF version' looks like "1.4"
        version_major, version_minor = [
            int(part) for part in info_dict['PDF version'].split('.')]
    except (KeyError, IndexError, ValueError):
        # pdfinfo prints nothing useful for files it cannot read, which
        # used to surface as a bare KeyError.
        _log.debug('pdfinfo output lacks a usable page size or version.')
        raise BadMediaFail()
    ret_dict['pdf_version_major'] = version_major
    ret_dict['pdf_version_minor'] = version_minor

    for db_key, str_key in [('pdf_keywords', 'Keywords'),
                            ('pdf_creator', 'Creator'),
                            ('pdf_producer', 'Producer'),
                            ('pdf_author', 'Author'),
                            ('pdf_title', 'Title')]:
        ret_dict[db_key] = info_dict.get(str_key, None)

    return ret_dict
232
def process_pdf(proc_state):
    """Code to process a pdf file. Will be run by celery.

    A Workbench() represents a local temporary dir. It is automatically
    cleaned up when this function exits.

    Steps, in order: copy the original, convert it to PDF with unoconv when
    it is not already a PDF, read the PDF metadata, render the 'thumb' and
    'medium' PNGs, delete the queued file and save the metadata on the entry.

    Args:
        proc_state: Processing state object; this function uses its
            ``entry``, ``workbench``, ``get_queued_filename()``,
            ``copy_original()``, ``store_public()`` and
            ``delete_queue_file()`` members.

    Raises:
        BadMediaFail: unoconv is unavailable or produced no PDF, a preview
            image was not produced, or ``pdf_info()`` rejected the file.
    """
    entry = proc_state.entry
    workbench = proc_state.workbench

    queued_filename = proc_state.get_queued_filename()
    name_builder = FilenameBuilder(queued_filename)

    # Copy our queued local workbench to its final destination
    original_dest = name_builder.fill('{basename}{ext}')
    proc_state.copy_original(original_dest)

    # Create a pdf if this is a different doc, store pdf for viewer.
    # splitext (unlike rsplit('.')) only looks at the last path component,
    # so a dot in a directory name cannot be mistaken for an extension.
    root, ext = os.path.splitext(queued_filename)
    if ext[1:].lower() == 'pdf':
        pdf_filename = queued_filename
    else:
        pdf_filename = root + '.pdf'
        unoconv = where('unoconv')
        if not unoconv:
            _log.debug('unoconv executable not found')
            raise BadMediaFail()
        Popen(executable=unoconv,
              args=[unoconv, '-v', '-f', 'pdf', queued_filename]).wait()
        if not os.path.exists(pdf_filename):
            _log.debug('unoconv failed to convert file to pdf')
            raise BadMediaFail()
        # NOTE(review): the converted PDF is stored only in this branch
        # (for a PDF upload the copied original already is the PDF) --
        # confirm this nesting against the upstream file.
        proc_state.store_public(keyname=u'pdf', local_file=pdf_filename)

    pdf_info_dict = pdf_info(pdf_filename)

    for name, width, height in [
            (u'thumb', mgg.global_config['media:thumb']['max_width'],
             mgg.global_config['media:thumb']['max_height']),
            (u'medium', mgg.global_config['media:medium']['max_width'],
             mgg.global_config['media:medium']['max_height']),
            ]:
        filename = name_builder.fill('{basename}.%s.png' % name)
        path = workbench.joinpath(filename)
        create_pdf_thumb(pdf_filename, path, width, height)
        # An explicit check instead of `assert`: assertions are stripped
        # when Python runs with -O, which would let a missing file through.
        if not os.path.exists(path):
            _log.debug('pdftocairo failed to create {0}'.format(filename))
            raise BadMediaFail()
        proc_state.store_public(keyname=name, local_file=path)

    proc_state.delete_queue_file()

    entry.media_data_init(**pdf_info_dict)
    entry.save()