Merge branch 'release-0.4.1'
[mediagoblin.git] / mediagoblin / media_types / pdf / processing.py
CommitLineData
a80ebf3b
AL
1# GNU MediaGoblin -- federated, autonomous media hosting
2# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
3#
4# This program is free software: you can redistribute it and/or modify
5# it under the terms of the GNU Affero General Public License as published by
6# the Free Software Foundation, either version 3 of the License, or
7# (at your option) any later version.
8#
9# This program is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12# GNU Affero General Public License for more details.
13#
14# You should have received a copy of the GNU Affero General Public License
15# along with this program. If not, see <http://www.gnu.org/licenses/>.
a80ebf3b 16import os
a80ebf3b
AL
17import logging
18import dateutil.parser
519bcfb0 19from subprocess import PIPE, Popen
a80ebf3b
AL
20
21from mediagoblin import mg_globals as mgg
22from mediagoblin.processing import (create_pub_filepath,
23 FilenameBuilder, BadMediaFail)
24from mediagoblin.tools.translate import fake_ugettext_passthrough as _
25
26_log = logging.getLogger(__name__)
27
28# TODO - cache (memoize) util
29
# Document formats that ``unoconv --show`` reports as convertible.
#
# The list was pruned by hand: every commented-out entry below is a format
# that unoconv can read but that we already support better via another media
# type, so it is deliberately left out.
unoconv_supported = [
    'bib',  # - BibTeX [.bib]
    #bmp - Windows Bitmap [.bmp]
    'csv',  # - Text CSV [.csv]
    'dbf',  # - dBASE [.dbf]
    'dif',  # - Data Interchange Format [.dif]
    'doc6',  # - Microsoft Word 6.0 [.doc]
    'doc95',  # - Microsoft Word 95 [.doc]
    'docbook',  # - DocBook [.xml]
    'doc',  # - Microsoft Word 97/2000/XP [.doc]
    'docx7',  # - Microsoft Office Open XML [.docx]
    'docx',  # - Microsoft Office Open XML [.docx]
    #emf - Enhanced Metafile [.emf]
    'eps',  # - Encapsulated PostScript [.eps]
    'fodp',  # - OpenDocument Presentation (Flat XML) [.fodp]
    'fods',  # - OpenDocument Spreadsheet (Flat XML) [.fods]
    'fodt',  # - OpenDocument Text (Flat XML) [.fodt]
    #gif - Graphics Interchange Format [.gif]
    'html',  # - HTML Document (OpenOffice.org Writer) [.html]
    #jpg - Joint Photographic Experts Group [.jpg]
    'latex',  # - LaTeX 2e [.ltx]
    'mediawiki',  # - MediaWiki [.txt]
    'met',  # - OS/2 Metafile [.met]
    'odd',  # - OpenDocument Drawing [.odd]
    'odg',  # - ODF Drawing (Impress) [.odg]
    'odp',  # - ODF Presentation [.odp]
    'ods',  # - ODF Spreadsheet [.ods]
    'odt',  # - ODF Text Document [.odt]
    'ooxml',  # - Microsoft Office Open XML [.xml]
    'otg',  # - OpenDocument Drawing Template [.otg]
    'otp',  # - ODF Presentation Template [.otp]
    'ots',  # - ODF Spreadsheet Template [.ots]
    'ott',  # - Open Document Text [.ott]
    #pbm - Portable Bitmap [.pbm]
    #pct - Mac Pict [.pct]
    'pdb',  # - AportisDoc (Palm) [.pdb]
    #pdf - Portable Document Format [.pdf]
    #pgm - Portable Graymap [.pgm]
    #png - Portable Network Graphic [.png]
    'pot',  # - Microsoft PowerPoint 97/2000/XP Template [.pot]
    'potm',  # - Microsoft PowerPoint 2007/2010 XML Template [.potm]
    #ppm - Portable Pixelmap [.ppm]
    'pps',  # - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
    'ppt',  # - Microsoft PowerPoint 97/2000/XP [.ppt]
    'pptx',  # - Microsoft PowerPoint 2007/2010 XML [.pptx]
    'psw',  # - Pocket Word [.psw]
    'pwp',  # - PlaceWare [.pwp]
    'pxl',  # - Pocket Excel [.pxl]
    #ras - Sun Raster Image [.ras]
    'rtf',  # - Rich Text Format [.rtf]
    'sda',  # - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
    'sdc3',  # - StarCalc 3.0 [.sdc]
    'sdc4',  # - StarCalc 4.0 [.sdc]
    'sdc',  # - StarCalc 5.0 [.sdc]
    'sdd3',  # - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
    'sdd4',  # - StarImpress 4.0 [.sdd]
    'sdd',  # - StarImpress 5.0 [.sdd]
    'sdw3',  # - StarWriter 3.0 [.sdw]
    'sdw4',  # - StarWriter 4.0 [.sdw]
    'sdw',  # - StarWriter 5.0 [.sdw]
    'slk',  # - SYLK [.slk]
    'stc',  # - OpenOffice.org 1.0 Spreadsheet Template [.stc]
    'std',  # - OpenOffice.org 1.0 Drawing Template [.std]
    'sti',  # - OpenOffice.org 1.0 Presentation Template [.sti]
    'stw',  # - Open Office.org 1.0 Text Document Template [.stw]
    #svg - Scalable Vector Graphics [.svg]
    'svm',  # - StarView Metafile [.svm]
    'swf',  # - Macromedia Flash (SWF) [.swf]
    'sxc',  # - OpenOffice.org 1.0 Spreadsheet [.sxc]
    'sxd3',  # - StarDraw 3.0 [.sxd]
    'sxd5',  # - StarDraw 5.0 [.sxd]
    'sxd',  # - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
    'sxi',  # - OpenOffice.org 1.0 Presentation [.sxi]
    'sxw',  # - Open Office.org 1.0 Text Document [.sxw]
    #text - Text Encoded [.txt]
    #tiff - Tagged Image File Format [.tiff]
    #txt - Text [.txt]
    'uop',  # - Unified Office Format presentation [.uop]
    'uos',  # - Unified Office Format spreadsheet [.uos]
    'uot',  # - Unified Office Format text [.uot]
    'vor3',  # - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
    'vor4',  # - StarWriter 4.0 Template [.vor]
    'vor5',  # - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
    'vor',  # - StarCalc 5.0 Template [.vor]
    #wmf - Windows Metafile [.wmf]
    'xhtml',  # - XHTML Document [.html]
    'xls5',  # - Microsoft Excel 5.0 [.xls]
    'xls95',  # - Microsoft Excel 95 [.xls]
    'xls',  # - Microsoft Excel 97/2000/XP [.xls]
    'xlt5',  # - Microsoft Excel 5.0 Template [.xlt]
    'xlt95',  # - Microsoft Excel 95 Template [.xlt]
    'xlt',  # - Microsoft Excel 97/2000/XP Template [.xlt]
    #xpm - X PixMap [.xpm]
]
126
def is_unoconv_working():
    """
    Report whether the ``unoconv`` converter can be found and started.

    Looks ``unoconv`` up on the ``PATH``, runs ``unoconv --show`` and
    inspects what the command writes to stderr.

    Returns:
        ``False`` when the executable is not found, cannot be spawned, or
        its stderr output contains ``ERROR``; ``True`` otherwise.
    """
    # TODO: must have libreoffice-headless installed too, need to check for it
    unoconv = where('unoconv')
    if not unoconv:
        return False
    try:
        proc = Popen([unoconv, '--show'], stderr=PIPE)
        # communicate() drains stderr *and* waits for the child, so the
        # process is reaped instead of being left behind as a zombie.
        output = proc.communicate()[1]
    except OSError:
        _log.warning(_('unoconv failing to run, check log file'))
        return False
    # A bytes literal matches what the pipe yields: ``str`` on Python 2,
    # ``bytes`` on Python 3.
    return b'ERROR' not in output
141
def supported_extensions(cache=[None]):
    """
    Return the list of file extensions this media type accepts.

    The result always starts with ``'pdf'``; the entries of
    ``unoconv_supported`` are appended when ``is_unoconv_working()`` says the
    converter is usable.

    Args:
        cache: memo list. The mutable default is deliberate: it is shared
            between calls so the unoconv probe runs only the first time. A
            list whose first element is ``None`` means "not computed yet".

    Returns:
        The ``cache`` list itself (not a copy), so mutating the returned list
        alters what later calls return.
    """
    # Identity test: only the real ``None`` placeholder means "unfilled".
    if cache[0] is None:
        cache[0] = 'pdf'
        if is_unoconv_working():
            cache.extend(unoconv_supported)
    return cache
148
def where(name):
    """
    Locate an executable called ``name`` on the ``PATH``.

    Each directory of the ``PATH`` environment variable is tried in order.

    Args:
        name: file name of the program to look for, e.g. ``'pdfinfo'``.

    Returns:
        The joined path of the first match that is a regular file the current
        user may execute, or ``None`` when there is no such file or ``PATH``
        is not set at all.
    """
    search_path = os.environ.get('PATH')
    if search_path is None:
        # No PATH in the environment: nothing to search, rather than KeyError.
        return None
    for directory in search_path.split(os.pathsep):
        fullpath = os.path.join(directory, name)
        # A directory or a non-executable file that merely shares the name
        # cannot be run, so it must not count as a hit.
        if os.path.isfile(fullpath) and os.access(fullpath, os.X_OK):
            return fullpath
    return None
155
def check_prerequisites():
    """
    Check that the external tools this media type shells out to exist.

    Looks for ``pdfinfo`` and ``pdftocairo`` via ``where()``, in that order,
    and logs a warning naming the first one that is missing.

    Returns:
        ``True`` when both tools were found, ``False`` otherwise.
    """
    for tool in ('pdfinfo', 'pdftocairo'):
        if not where(tool):
            # Name the tool exactly as it must be installed; the message used
            # to say 'pdfcairo', which is not the name of any program.
            _log.warning('missing %s', tool)
            return False
    return True
164
def sniff_handler(media_file, **kw):
    """
    Decide whether this media type should take an uploaded file.

    The decision is based purely on the file name extension of the object
    passed as the ``media`` keyword argument; ``media_file`` itself is not
    inspected.

    Args:
        media_file: unused here.
        **kw: may carry ``media``, an object with a ``filename`` attribute.

    Returns:
        ``True`` when the prerequisites are installed, ``media`` was given and
        its lower-cased extension is one of ``supported_extensions()``;
        ``False`` in every other case.
    """
    if not check_prerequisites():
        return False

    media = kw.get('media')
    if media is None:
        return False

    # splitext keeps the leading dot on the extension; drop it before lookup.
    dotted_ext = os.path.splitext(media.filename)[1]
    return dotted_ext[1:].lower() in supported_extensions()
176
def create_pdf_thumb(original, thumb_filename, width, height):
    """
    Render a png preview of a pdf by running ``pdftocairo``.

    Args:
        original: path of the pdf to render.
        thumb_filename: path where the png is expected to end up.
        width: maximum width of the preview.
        height: maximum height of the preview.

    The smaller of ``width`` and ``height`` is handed to pdftocairo as its
    ``-scale-to`` value. Nothing is returned; a non-zero exit status is only
    logged, so callers have to check that the output file exists.
    """
    # Note: pdftocairo adds '.png' to the output name it is given, so strip
    # it first -- but only when it is really there, instead of blindly
    # chopping the last four characters off an arbitrary name.
    if thumb_filename.endswith('.png'):
        thumb_filename = thumb_filename[:-4]
    executable = where('pdftocairo')
    args = [executable, '-scale-to', str(min(width, height)),
            '-singlefile', '-png', original, thumb_filename]
    _log.debug('calling {0}'.format(repr(' '.join(args))))
    returncode = Popen(executable=executable, args=args).wait()
    if returncode != 0:
        _log.warning(
            'pdftocairo exited with status {0}'.format(returncode))
a80ebf3b
AL
185
def pdf_info(original):
    """
    Extract dictionary of pdf information. This could use a library instead
    of a process.

    Runs ``pdfinfo`` on ``original`` and parses its ``Key: value`` output.

    Args:
        original: path of the pdf file to inspect.

    Returns:
        A dict with ``pdf_pages`` (when reported), ``pdf_page_size_width``,
        ``pdf_page_size_height``, ``pdf_version_major``,
        ``pdf_version_minor``, the string fields ``pdf_keywords``,
        ``pdf_creator``, ``pdf_producer``, ``pdf_author`` and ``pdf_title``
        (``None`` when absent), plus ``pdf_mod_date`` and
        ``pdf_creation_date`` when pdfinfo reports a parseable date.

    Raises:
        BadMediaFail: when the pdfinfo process cannot be started.

    Note: I'm assuming pdfinfo output is sanitized (integers where integers are
    expected, etc.) - if this is wrong then an exception will be raised and caught
    leading to the dreaded error page. It seems a safe assumption.
    """
    ret_dict = {}
    pdfinfo = where('pdfinfo')
    try:
        proc = Popen(executable=pdfinfo,
                     args=[pdfinfo, original], stdout=PIPE)
        # communicate() reads all of stdout and waits for the child, so the
        # process is reaped instead of lingering as a zombie.
        output = proc.communicate()[0]
    except OSError:
        _log.debug('pdfinfo could not read the pdf file.')
        raise BadMediaFail()

    lines = output.splitlines()
    info_dict = dict([[part.strip() for part in l.strip().split(':', 1)]
                      for l in lines if ':' in l])

    # The loop used to bind the whole (db_key, pdfinfo_key) tuple to a single
    # name and look that tuple up in info_dict, so no date was ever found.
    # NOTE(review): since dates were never emitted before, confirm that the
    # media data model accepts 'pdf_mod_date' and 'pdf_creation_date'.
    for db_key, date_key in [('pdf_mod_date', 'ModDate'),
                             ('pdf_creation_date', 'CreationDate')]:
        if date_key not in info_dict:
            continue
        try:
            ret_dict[db_key] = dateutil.parser.parse(info_dict[date_key])
        except (ValueError, OverflowError):
            # Dates are optional metadata: an odd date string should not
            # turn an otherwise fine document into an error page.
            _log.debug('could not parse pdfinfo {0} value {1!r}'.format(
                date_key, info_dict[date_key]))
    for db_key, int_key in [('pdf_pages', 'Pages')]:
        if int_key in info_dict:
            ret_dict[db_key] = int(info_dict[int_key])

    # parse 'PageSize' field: 595 x 842 pts (A4)
    page_size_parts = info_dict['Page size'].split()
    ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
    ret_dict['pdf_page_size_height'] = float(page_size_parts[2])

    for db_key, str_key in [('pdf_keywords', 'Keywords'),
                            ('pdf_creator', 'Creator'),
                            ('pdf_producer', 'Producer'),
                            ('pdf_author', 'Author'),
                            ('pdf_title', 'Title')]:
        ret_dict[db_key] = info_dict.get(str_key, None)
    ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
        map(int, info_dict['PDF version'].split('.'))

    return ret_dict
229
def process_pdf(proc_state):
    """Code to process a pdf file. Will be run by celery.

    A Workbench() represents a local temporary dir. It is automatically
    cleaned up when this function exits.

    Steps, in order: copy the queued original to its destination, convert
    non-pdf documents to pdf with unoconv, collect metadata via pdf_info(),
    render the 'thumb' and 'medium' png previews, delete the queue file and
    save the metadata on the entry.

    Args:
        proc_state: processing state object; this function uses its
            ``entry``, ``workbench``, ``get_queued_filename()``,
            ``copy_original()``, ``store_public()`` and
            ``delete_queue_file()`` members.

    Raises:
        BadMediaFail: when unoconv does not produce a pdf, or when a preview
            image was not created.
    """
    entry = proc_state.entry
    workbench = proc_state.workbench

    queued_filename = proc_state.get_queued_filename()
    name_builder = FilenameBuilder(queued_filename)

    # Copy our queued local workbench to its final destination
    original_dest = name_builder.fill('{basename}{ext}')
    proc_state.copy_original(original_dest)

    # Create a pdf if this is a different doc, store pdf for viewer.
    # splitext only looks at the last path component, so a dot inside a
    # directory name can no longer be mistaken for the extension separator.
    root, ext = os.path.splitext(queued_filename)
    if ext.lower() == '.pdf':
        pdf_filename = queued_filename
    else:
        pdf_filename = root + '.pdf'
        unoconv = where('unoconv')
        Popen(executable=unoconv,
              args=[unoconv, '-v', '-f', 'pdf', queued_filename]).wait()
        if not os.path.exists(pdf_filename):
            _log.debug('unoconv failed to convert file to pdf')
            raise BadMediaFail()
        # NOTE(review): only the converted pdf is stored under the 'pdf' key;
        # confirm this nesting matches the upstream file.
        proc_state.store_public(keyname=u'pdf', local_file=pdf_filename)

    pdf_info_dict = pdf_info(pdf_filename)

    for name, width, height in [
            (u'thumb', mgg.global_config['media:thumb']['max_width'],
             mgg.global_config['media:thumb']['max_height']),
            (u'medium', mgg.global_config['media:medium']['max_width'],
             mgg.global_config['media:medium']['max_height']),
            ]:
        filename = name_builder.fill('{basename}.%s.png' % name)
        path = workbench.joinpath(filename)
        create_pdf_thumb(pdf_filename, path, width, height)
        # An explicit check instead of ``assert``: asserts vanish under
        # ``python -O`` and a missing preview must never be stored.
        if not os.path.exists(path):
            _log.debug('pdftocairo failed to create {0}'.format(path))
            raise BadMediaFail()
        proc_state.store_public(keyname=name, local_file=path)

    proc_state.delete_queue_file()

    entry.media_data_init(**pdf_info_dict)
    entry.save()