Commit | Line | Data |
---|---|---|
a80ebf3b AL |
1 | # GNU MediaGoblin -- federated, autonomous media hosting |
2 | # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS. | |
3 | # | |
4 | # This program is free software: you can redistribute it and/or modify | |
5 | # it under the terms of the GNU Affero General Public License as published by | |
6 | # the Free Software Foundation, either version 3 of the License, or | |
7 | # (at your option) any later version. | |
8 | # | |
9 | # This program is distributed in the hope that it will be useful, | |
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | # GNU Affero General Public License for more details. | |
13 | # | |
14 | # You should have received a copy of the GNU Affero General Public License | |
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
a80ebf3b | 16 | import os |
a80ebf3b AL |
17 | import logging |
18 | import dateutil.parser | |
519bcfb0 | 19 | from subprocess import PIPE, Popen |
a80ebf3b AL |
20 | |
21 | from mediagoblin import mg_globals as mgg | |
22 | from mediagoblin.processing import (create_pub_filepath, | |
23 | FilenameBuilder, BadMediaFail) | |
24 | from mediagoblin.tools.translate import fake_ugettext_passthrough as _ | |
25 | ||
26 | _log = logging.getLogger(__name__) | |
27 | ||
51e4e435 RE |
28 | MEDIA_TYPE = 'mediagoblin.media_types.pdf' |
29 | ||
a80ebf3b AL |
30 | # TODO - cache (memoize) util |
31 | ||
32 | # This is a list created via `unoconv --show`, with the types that we | |
33 | # already support better via other media types removed by hand. | |
# Names accepted (in addition to 'pdf') when unoconv is usable; entries are
# compared against the lower-cased file extension of an upload.  Lines that
# are commented out are formats deliberately left to other media types.
# NOTE(review): some entries (e.g. 'doc6', 'docbook') are unoconv format
# names rather than real file extensions, so they presumably never match.
unoconv_supported = [
    'bib', # - BibTeX [.bib]
    #bmp - Windows Bitmap [.bmp]
    'csv', # - Text CSV [.csv]
    'dbf', # - dBASE [.dbf]
    'dif', # - Data Interchange Format [.dif]
    'doc6', # - Microsoft Word 6.0 [.doc]
    'doc95', # - Microsoft Word 95 [.doc]
    'docbook', # - DocBook [.xml]
    'doc', # - Microsoft Word 97/2000/XP [.doc]
    'docx7', # - Microsoft Office Open XML [.docx]
    'docx', # - Microsoft Office Open XML [.docx]
    #emf - Enhanced Metafile [.emf]
    'eps', # - Encapsulated PostScript [.eps]
    'fodp', # - OpenDocument Presentation (Flat XML) [.fodp]
    'fods', # - OpenDocument Spreadsheet (Flat XML) [.fods]
    'fodt', # - OpenDocument Text (Flat XML) [.fodt]
    #gif - Graphics Interchange Format [.gif]
    'html', # - HTML Document (OpenOffice.org Writer) [.html]
    #jpg - Joint Photographic Experts Group [.jpg]
    'latex', # - LaTeX 2e [.ltx]
    'mediawiki', # - MediaWiki [.txt]
    'met', # - OS/2 Metafile [.met]
    'odd', # - OpenDocument Drawing [.odd]
    'odg', # - ODF Drawing (Impress) [.odg]
    'odp', # - ODF Presentation [.odp]
    'ods', # - ODF Spreadsheet [.ods]
    'odt', # - ODF Text Document [.odt]
    'ooxml', # - Microsoft Office Open XML [.xml]
    'otg', # - OpenDocument Drawing Template [.otg]
    'otp', # - ODF Presentation Template [.otp]
    'ots', # - ODF Spreadsheet Template [.ots]
    'ott', # - Open Document Text [.ott]
    #pbm - Portable Bitmap [.pbm]
    #pct - Mac Pict [.pct]
    'pdb', # - AportisDoc (Palm) [.pdb]
    #pdf - Portable Document Format [.pdf]
    #pgm - Portable Graymap [.pgm]
    #png - Portable Network Graphic [.png]
    'pot', # - Microsoft PowerPoint 97/2000/XP Template [.pot]
    'potm', # - Microsoft PowerPoint 2007/2010 XML Template [.potm]
    #ppm - Portable Pixelmap [.ppm]
    'pps', # - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
    'ppt', # - Microsoft PowerPoint 97/2000/XP [.ppt]
    'pptx', # - Microsoft PowerPoint 2007/2010 XML [.pptx]
    'psw', # - Pocket Word [.psw]
    'pwp', # - PlaceWare [.pwp]
    'pxl', # - Pocket Excel [.pxl]
    #ras - Sun Raster Image [.ras]
    'rtf', # - Rich Text Format [.rtf]
    'sda', # - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
    'sdc3', # - StarCalc 3.0 [.sdc]
    'sdc4', # - StarCalc 4.0 [.sdc]
    'sdc', # - StarCalc 5.0 [.sdc]
    'sdd3', # - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
    'sdd4', # - StarImpress 4.0 [.sdd]
    'sdd', # - StarImpress 5.0 [.sdd]
    'sdw3', # - StarWriter 3.0 [.sdw]
    'sdw4', # - StarWriter 4.0 [.sdw]
    'sdw', # - StarWriter 5.0 [.sdw]
    'slk', # - SYLK [.slk]
    'stc', # - OpenOffice.org 1.0 Spreadsheet Template [.stc]
    'std', # - OpenOffice.org 1.0 Drawing Template [.std]
    'sti', # - OpenOffice.org 1.0 Presentation Template [.sti]
    'stw', # - Open Office.org 1.0 Text Document Template [.stw]
    #svg - Scalable Vector Graphics [.svg]
    'svm', # - StarView Metafile [.svm]
    'swf', # - Macromedia Flash (SWF) [.swf]
    'sxc', # - OpenOffice.org 1.0 Spreadsheet [.sxc]
    'sxd3', # - StarDraw 3.0 [.sxd]
    'sxd5', # - StarDraw 5.0 [.sxd]
    'sxd', # - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
    'sxi', # - OpenOffice.org 1.0 Presentation [.sxi]
    'sxw', # - Open Office.org 1.0 Text Document [.sxw]
    #text - Text Encoded [.txt]
    #tiff - Tagged Image File Format [.tiff]
    #txt - Text [.txt]
    'uop', # - Unified Office Format presentation [.uop]
    'uos', # - Unified Office Format spreadsheet [.uos]
    'uot', # - Unified Office Format text [.uot]
    'vor3', # - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
    'vor4', # - StarWriter 4.0 Template [.vor]
    'vor5', # - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
    'vor', # - StarCalc 5.0 Template [.vor]
    #wmf - Windows Metafile [.wmf]
    'xhtml', # - XHTML Document [.html]
    'xls5', # - Microsoft Excel 5.0 [.xls]
    'xls95', # - Microsoft Excel 95 [.xls]
    'xls', # - Microsoft Excel 97/2000/XP [.xls]
    'xlt5', # - Microsoft Excel 5.0 Template [.xlt]
    'xlt95', # - Microsoft Excel 95 Template [.xlt]
    'xlt', # - Microsoft Excel 97/2000/XP Template [.xlt]
    #xpm - X PixMap [.xpm]
]
128 | ||
def is_unoconv_working():
    """
    Report whether the ``unoconv`` converter can be located and run.

    Runs ``unoconv --show`` and inspects what it writes to stderr.

    Returns:
        bool: False when unoconv is not found on PATH, cannot be started,
        or prints output containing 'ERROR'; True otherwise.
    """
    # TODO: must have libreoffice-headless installed too, need to check for it
    unoconv = where('unoconv')
    if not unoconv:
        return False
    try:
        proc = Popen([unoconv, '--show'], stderr=PIPE)
        # communicate() drains the pipe and waits for the child, so neither
        # an open pipe nor an unreaped process is left behind.
        output = proc.communicate()[1]
    except OSError:
        _log.warning(_('unoconv failing to run, check log file'))
        return False
    # The pipe yields bytes on Python 3 (and str == bytes on Python 2), so
    # compare against a bytes literal to work on both.
    return b'ERROR' not in output
143 | ||
def supported_extensions(cache=[None]):
    """
    Return the list of file extensions this media type accepts.

    The mutable default argument is deliberate: it is the memo that keeps
    the result between calls, so the unoconv check runs at most once per
    process.  A first element of ``None`` marks the cache as not yet filled.

    Args:
        cache: list used as the memo; callers normally omit it.

    Returns:
        list: ``cache`` itself, starting with 'pdf' and followed by the
        unoconv formats when unoconv is usable.
    """
    if cache[0] is None:
        cache[0] = 'pdf'
        if is_unoconv_working():
            cache.extend(unoconv_supported)
    return cache
150 | ||
def where(name):
    """
    Locate an executable called ``name`` in the directories listed in PATH.

    Args:
        name: file name of the program to look for.

    Returns:
        The joined path of the first PATH entry that contains ``name`` as an
        executable regular file, or None when there is no such entry or PATH
        is unset/empty.
    """
    search_path = os.environ.get('PATH')
    if not search_path:
        return None
    for directory in search_path.split(os.pathsep):
        candidate = os.path.join(directory, name)
        # A directory or a non-executable file with the right name is not a
        # usable program, so require an executable regular file.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None
157 | ||
def check_prerequisites():
    """
    Check that the external programs this module shells out to are present.

    Returns:
        bool: True when both ``pdfinfo`` and ``pdftocairo`` are found via
        where(); otherwise False, after logging a warning that names the
        first missing program.
    """
    for tool in ('pdfinfo', 'pdftocairo'):
        if not where(tool):
            _log.warning('missing {0}'.format(tool))
            return False
    return True
166 | ||
def sniff_handler(media_file, **kw):
    """
    Decide from the uploaded file's name whether this media type applies.

    Args:
        media_file: unused here; the decision is made from the file name.
        **kw: may carry ``media``, an object with a ``filename`` attribute.

    Returns:
        MEDIA_TYPE when the prerequisites are present and the lower-cased
        extension of ``kw['media'].filename`` is a supported one, else None.
    """
    _log.info('Sniffing {0}'.format(MEDIA_TYPE))
    if not check_prerequisites():
        return None

    media = kw.get('media')
    if media is None:
        return None

    # splitext keeps the leading dot; drop it before comparing.
    extension = os.path.splitext(media.filename)[1]
    if extension[1:].lower() in supported_extensions():
        return MEDIA_TYPE
    return None
a80ebf3b AL |
179 | |
def create_pdf_thumb(original, thumb_filename, width, height):
    """
    Render a PNG thumbnail of a pdf by running ``pdftocairo``.

    Args:
        original: path of the pdf to render.
        thumb_filename: desired output path, expected to end in '.png'.
        width: maximum thumbnail width.
        height: maximum thumbnail height.

    The smaller of ``width`` and ``height`` is passed as ``-scale-to``.  A
    non-zero exit status is logged as a warning; no exception is raised
    for it.
    """
    # Note: pdftocairo adds '.png' itself, so hand it the name without the
    # suffix.  Only strip when the suffix is really there, rather than
    # blindly chopping the last four characters.
    suffix = '.png'
    if thumb_filename.endswith(suffix):
        thumb_filename = thumb_filename[:-len(suffix)]
    executable = where('pdftocairo')
    args = [executable, '-scale-to', str(min(width, height)),
            '-singlefile', '-png', original, thumb_filename]
    _log.debug('calling {0}'.format(repr(' '.join(args))))
    returncode = Popen(executable=executable, args=args).wait()
    if returncode != 0:
        _log.warning(
            'pdftocairo exited with status {0}'.format(returncode))
a80ebf3b AL |
188 | |
def pdf_info(original):
    """
    Extract a dictionary of pdf information by running ``pdfinfo``.

    This could use a library instead of a process.

    Args:
        original: path of the pdf file to inspect.

    Returns:
        dict with the keys 'pdf_page_size_width', 'pdf_page_size_height'
        (floats), 'pdf_version_major', 'pdf_version_minor' (ints),
        'pdf_keywords', 'pdf_creator', 'pdf_producer', 'pdf_author',
        'pdf_title' (text or None) and, when pdfinfo reports a page count,
        'pdf_pages' (int).

    Raises:
        BadMediaFail: pdfinfo is missing or cannot be started, or its output
            lacks the 'Page size' / 'PDF version' fields or contains
            malformed numbers.
    """
    pdfinfo = where('pdfinfo')
    if not pdfinfo:
        # Popen(executable=None, ...) would fail with an unrelated error.
        _log.debug('pdfinfo executable was not found on PATH.')
        raise BadMediaFail()
    try:
        proc = Popen(executable=pdfinfo,
                     args=[pdfinfo, original], stdout=PIPE)
        # communicate() reads all output and reaps the child process.
        output = proc.communicate()[0]
    except OSError:
        _log.debug('pdfinfo could not read the pdf file.')
        raise BadMediaFail()

    info_dict = _parse_pdfinfo_output(output)
    ret_dict = {}

    # NOTE(review): the previous code tried to copy 'ModDate' and
    # 'CreationDate' here, but it tested a (db_key, field) tuple for
    # membership in info_dict, so no date was ever stored.  That behaviour
    # is kept on purpose: enabling the dates adds new keys to the returned
    # dict, which callers splat into media_data_init(**...).  Confirm the
    # key names against the media data model before turning this on.

    try:
        for db_key, int_key in [('pdf_pages', 'Pages')]:
            if int_key in info_dict:
                ret_dict[db_key] = int(info_dict[int_key])

        # parse 'Page size' field: 595 x 842 pts (A4)
        page_size_parts = info_dict['Page size'].split()
        ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
        ret_dict['pdf_page_size_height'] = float(page_size_parts[2])

        version_major, version_minor = [
            int(part) for part in info_dict['PDF version'].split('.')]
    except (KeyError, IndexError, ValueError):
        # e.g. pdfinfo printed nothing because the file is not a pdf.
        _log.debug('pdfinfo output is missing or has malformed fields.')
        raise BadMediaFail()
    ret_dict['pdf_version_major'] = version_major
    ret_dict['pdf_version_minor'] = version_minor

    for db_key, str_key in [('pdf_keywords', 'Keywords'),
                            ('pdf_creator', 'Creator'),
                            ('pdf_producer', 'Producer'),
                            ('pdf_author', 'Author'),
                            ('pdf_title', 'Title')]:
        ret_dict[db_key] = info_dict.get(str_key, None)

    return ret_dict


def _parse_pdfinfo_output(output):
    """Turn raw ``Key: value`` pdfinfo output into a dict of stripped text."""
    if isinstance(output, bytes):
        # Pipes yield bytes; decode once so the parsing below is text-only.
        output = output.decode('utf-8', 'replace')
    parsed = {}
    for line in output.splitlines():
        # Split on the first colon only: values such as dates contain colons.
        key, separator, value = line.partition(':')
        if separator:
            parsed[key.strip()] = value.strip()
    return parsed
232 | ||
a80ebf3b | 233 | |
class CommonPdfProcessor(MediaProcessor):
    """
    Provides a base for various pdf processing steps
    """
    # NOTE(review): MediaProcessor, get_orig_filename, copy_original and
    # store_public are not among the names imported at the top of this
    # module as seen here -- confirm they are in scope.

    def common_setup(self):
        """
        Set up common pdf processing steps
        """
        # Pull down and set up the original file
        self.orig_filename = get_orig_filename(
            self.entry, self.workbench)
        self.name_builder = FilenameBuilder(self.orig_filename)

        self._set_pdf_filename()

    def _original_is_pdf(self):
        """Tell whether the original upload already is a pdf."""
        # copy_original() fills '{basename}{ext}', which suggests ``ext``
        # carries its leading dot; accept it with or without the dot.
        ext = self.name_builder.ext or ''
        return ext.lstrip('.').lower() == 'pdf'

    def _set_pdf_filename(self):
        """Set self.pdf_filename to the pdf the later steps work on."""
        if self._original_is_pdf():
            self.pdf_filename = self.orig_filename
        else:
            # NOTE(review): presumably this must name the file unoconv
            # writes in generate_pdf() -- verify the directory matches.
            self.pdf_filename = self.name_builder.fill('{basename}.pdf')

    def copy_original(self):
        """Store the original upload via the module-level copy_original."""
        copy_original(
            self.entry, self.orig_filename,
            self.name_builder.fill('{basename}{ext}'))

    def _render_png(self, size, name_template):
        """Run pdftocairo on the pdf and return the png filename it wrote."""
        # Note: pdftocairo adds '.png', so don't include an ext
        output_base = self.name_builder.fill(name_template)

        executable = where('pdftocairo')
        # -scale-to takes one number; use the smaller bound, as
        # create_pdf_thumb() does, rather than the str() of the pair.
        scale_to = min(int(value) for value in size)
        args = [executable, '-scale-to', str(scale_to),
                '-singlefile', '-png', self.pdf_filename, output_base]

        _log.debug('calling {0}'.format(repr(' '.join(args))))
        Popen(executable=executable, args=args).wait()

        # pdftocairo added '.png', so the file on disk carries it.
        return output_base + '.png'

    def generate_thumb(self, thumb_size=None):
        """
        Render and store the thumbnail png.

        ``thumb_size`` is a (max_width, max_height) pair; when omitted the
        'media:thumb' values of the global config are used.
        """
        if not thumb_size:
            thumb_size = (mgg.global_config['media:thumb']['max_width'],
                          mgg.global_config['media:thumb']['max_height'])

        thumb_filename = self._render_png(thumb_size, '{basename}.thumbnail')

        store_public(self.entry, 'thumb', thumb_filename,
                     self.name_builder.fill('{basename}.thumbnail.png'))

    def generate_pdf(self):
        """
        Store the pdf. If the file is not a pdf, make it a pdf

        Raises BadMediaFail when unoconv is unavailable or the converted
        pdf does not exist afterwards.
        """
        if not self._original_is_pdf():
            unoconv = where('unoconv')
            if not unoconv:
                _log.debug('unoconv executable was not found on PATH')
                raise BadMediaFail()
            Popen(executable=unoconv,
                  args=[unoconv, '-v', '-f', 'pdf',
                        self.orig_filename]).wait()

            if not os.path.exists(self.pdf_filename):
                _log.debug('unoconv failed to convert file to pdf')
                raise BadMediaFail()

        store_public(self.entry, 'pdf', self.pdf_filename,
                     self.name_builder.fill('{basename}.pdf'))

    def extract_pdf_info(self):
        """Read the pdf's metadata and initialise the entry's media data."""
        pdf_info_dict = pdf_info(self.pdf_filename)
        # Was the bare name ``entry``, which is undefined in this scope.
        self.entry.media_data_init(**pdf_info_dict)

    def generate_medium(self, size=None):
        """
        Render and store the medium-sized png.

        ``size`` is a (max_width, max_height) pair; when omitted the
        'media:medium' values of the global config are used.
        """
        if not size:
            size = (mgg.global_config['media:medium']['max_width'],
                    mgg.global_config['media:medium']['max_height'])

        filename = self._render_png(size, '{basename}.medium')

        # Stored under 'medium'; it used to be stored under 'thumb', where
        # the later generate_thumb() call stores under the same key.
        store_public(self.entry, 'medium', filename,
                     self.name_builder.fill('{basename}.medium.png'))
315 | ||
class InitialProcessor(CommonPdfProcessor):
    """
    Initial processing step for new pdfs
    """
    name = "initial"
    description = "Initial processing"

    @classmethod
    def media_is_eligible(cls, entry=None, state=None):
        """
        Determine if this media type is eligible for processing

        Uses ``state`` when given, otherwise ``entry.state``; eligible
        states are "unprocessed" and "failed".
        """
        if not state:
            state = entry.state
        return state in (
            "unprocessed", "failed")

    @classmethod
    def generate_parser(cls):
        """
        Build the argument parser for this processor.

        Accepts ``--size`` and ``--thumb-size``, each taking two integers
        (max_width, max_height).
        """
        # argparse is not imported at module level, so bring it into scope
        # here; without this the method fails with a NameError.
        import argparse

        parser = argparse.ArgumentParser(
            description=cls.description,
            prog=cls.name)

        parser.add_argument(
            '--size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        parser.add_argument(
            '--thumb-size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        return parser

    @classmethod
    def args_to_request(cls, args):
        """Build the request from the parsed 'size' and 'thumb_size' args."""
        # NOTE(review): request_from_args is not among the names imported
        # at the top of this module as seen here -- confirm it is in scope.
        return request_from_args(
            args, ['size', 'thumb_size'])

    def process(self, size=None, thumb_size=None):
        """
        Run the full initial pipeline for one entry.

        The pdf is generated first because the info extraction and both
        renderings read self.pdf_filename.
        """
        self.common_setup()
        self.generate_pdf()
        self.extract_pdf_info()
        self.copy_original()
        self.generate_medium(size=size)
        self.generate_thumb(thumb_size=thumb_size)
        self.delete_queue_file()
366 | ||
367 | ||
class PdfProcessingManager(ProcessingManager):
    """Processing manager for pdfs; registers InitialProcessor."""

    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(PdfProcessingManager, self).__init__()
        self.add_processor(InitialProcessor)