1 # GNU MediaGoblin -- federated, autonomous media hosting
2 # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 import dateutil
.parser
20 from subprocess
import PIPE
, Popen
22 from mediagoblin
import mg_globals
as mgg
23 from mediagoblin
.processing
import (
24 FilenameBuilder
, BadMediaFail
,
25 MediaProcessor
, ProcessingManager
,
26 request_from_args
, get_process_filename
,
27 store_public
, copy_original
)
28 from mediagoblin
.tools
.translate
import fake_ugettext_passthrough
as _
30 _log
= logging
.getLogger(__name__
)
32 MEDIA_TYPE
= 'mediagoblin.media_types.pdf'
34 # TODO - cache (memoize) util
# This is a list created via `unoconv --show`, with the types that we already
# support better via other media types removed by hand.
# Format names that unoconv can convert and that this media type accepts.
# Commented-out entries are formats left out on purpose (see the note on how
# this list was produced).
unoconv_supported = [
    'bib',  # - BibTeX [.bib]
    #bmp - Windows Bitmap [.bmp]
    'csv',  # - Text CSV [.csv]
    'dbf',  # - dBASE [.dbf]
    'dif',  # - Data Interchange Format [.dif]
    'doc6',  # - Microsoft Word 6.0 [.doc]
    'doc95',  # - Microsoft Word 95 [.doc]
    'docbook',  # - DocBook [.xml]
    'doc',  # - Microsoft Word 97/2000/XP [.doc]
    'docx7',  # - Microsoft Office Open XML [.docx]
    'docx',  # - Microsoft Office Open XML [.docx]
    #emf - Enhanced Metafile [.emf]
    'eps',  # - Encapsulated PostScript [.eps]
    'fodp',  # - OpenDocument Presentation (Flat XML) [.fodp]
    'fods',  # - OpenDocument Spreadsheet (Flat XML) [.fods]
    'fodt',  # - OpenDocument Text (Flat XML) [.fodt]
    #gif - Graphics Interchange Format [.gif]
    'html',  # - HTML Document (OpenOffice.org Writer) [.html]
    #jpg - Joint Photographic Experts Group [.jpg]
    'latex',  # - LaTeX 2e [.ltx]
    'mediawiki',  # - MediaWiki [.txt]
    'met',  # - OS/2 Metafile [.met]
    'odd',  # - OpenDocument Drawing [.odd]
    'odg',  # - ODF Drawing (Impress) [.odg]
    'odp',  # - ODF Presentation [.odp]
    'ods',  # - ODF Spreadsheet [.ods]
    'odt',  # - ODF Text Document [.odt]
    'ooxml',  # - Microsoft Office Open XML [.xml]
    'otg',  # - OpenDocument Drawing Template [.otg]
    'otp',  # - ODF Presentation Template [.otp]
    'ots',  # - ODF Spreadsheet Template [.ots]
    'ott',  # - Open Document Text [.ott]
    #pbm - Portable Bitmap [.pbm]
    #pct - Mac Pict [.pct]
    'pdb',  # - AportisDoc (Palm) [.pdb]
    #pdf - Portable Document Format [.pdf]
    #pgm - Portable Graymap [.pgm]
    #png - Portable Network Graphic [.png]
    'pot',  # - Microsoft PowerPoint 97/2000/XP Template [.pot]
    'potm',  # - Microsoft PowerPoint 2007/2010 XML Template [.potm]
    #ppm - Portable Pixelmap [.ppm]
    'pps',  # - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
    'ppt',  # - Microsoft PowerPoint 97/2000/XP [.ppt]
    'pptx',  # - Microsoft PowerPoint 2007/2010 XML [.pptx]
    'psw',  # - Pocket Word [.psw]
    'pwp',  # - PlaceWare [.pwp]
    'pxl',  # - Pocket Excel [.pxl]
    #ras - Sun Raster Image [.ras]
    'rtf',  # - Rich Text Format [.rtf]
    'sda',  # - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
    'sdc3',  # - StarCalc 3.0 [.sdc]
    'sdc4',  # - StarCalc 4.0 [.sdc]
    'sdc',  # - StarCalc 5.0 [.sdc]
    'sdd3',  # - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
    'sdd4',  # - StarImpress 4.0 [.sdd]
    'sdd',  # - StarImpress 5.0 [.sdd]
    'sdw3',  # - StarWriter 3.0 [.sdw]
    'sdw4',  # - StarWriter 4.0 [.sdw]
    'sdw',  # - StarWriter 5.0 [.sdw]
    'slk',  # - SYLK [.slk]
    'stc',  # - OpenOffice.org 1.0 Spreadsheet Template [.stc]
    'std',  # - OpenOffice.org 1.0 Drawing Template [.std]
    'sti',  # - OpenOffice.org 1.0 Presentation Template [.sti]
    'stw',  # - Open Office.org 1.0 Text Document Template [.stw]
    #svg - Scalable Vector Graphics [.svg]
    'svm',  # - StarView Metafile [.svm]
    'swf',  # - Macromedia Flash (SWF) [.swf]
    'sxc',  # - OpenOffice.org 1.0 Spreadsheet [.sxc]
    'sxd3',  # - StarDraw 3.0 [.sxd]
    'sxd5',  # - StarDraw 5.0 [.sxd]
    'sxd',  # - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
    'sxi',  # - OpenOffice.org 1.0 Presentation [.sxi]
    'sxw',  # - Open Office.org 1.0 Text Document [.sxw]
    #text - Text Encoded [.txt]
    #tiff - Tagged Image File Format [.tiff]
    'uop',  # - Unified Office Format presentation [.uop]
    'uos',  # - Unified Office Format spreadsheet [.uos]
    'uot',  # - Unified Office Format text [.uot]
    'vor3',  # - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
    'vor4',  # - StarWriter 4.0 Template [.vor]
    'vor5',  # - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
    'vor',  # - StarCalc 5.0 Template [.vor]
    #wmf - Windows Metafile [.wmf]
    'xhtml',  # - XHTML Document [.html]
    'xls5',  # - Microsoft Excel 5.0 [.xls]
    'xls95',  # - Microsoft Excel 95 [.xls]
    'xls',  # - Microsoft Excel 97/2000/XP [.xls]
    'xlt5',  # - Microsoft Excel 5.0 Template [.xlt]
    'xlt95',  # - Microsoft Excel 95 Template [.xlt]
    'xlt',  # - Microsoft Excel 97/2000/XP Template [.xlt]
    #xpm - X PixMap [.xpm]
]
def is_unoconv_working():
    """
    Report whether the ``unoconv`` command can be found and run.

    The command is looked up with ``where('unoconv')`` and invoked as
    ``unoconv --show``; whatever it writes to stderr is then inspected.

    Returns:
        False when the command is not found, cannot be started (OSError),
        or its stderr output contains ``ERROR``; True otherwise.
    """
    # TODO: must have libreoffice-headless installed too, need to check for it
    unoconv = where('unoconv')
    if not unoconv:
        return False
    try:
        proc = Popen([unoconv, '--show'], stderr=PIPE)
        # communicate() drains stderr, closes the pipe and reaps the child,
        # so no file descriptor or zombie process is left behind.
        output = proc.communicate()[1]
    except OSError:
        _log.warning(_('unoconv failing to run, check log file'))
        return False
    # The pipe yields bytes on Python 3, so compare against a bytes literal
    # (on Python 2, b'ERROR' is an ordinary str).
    return b'ERROR' not in output
def supported_extensions(cache=[None]):
    """
    Return the list of file extensions this media type accepts.

    The mutable default argument is intentional: it is the memo that keeps
    the result between calls, so ``is_unoconv_working()`` (which spawns a
    process) runs at most once per interpreter.

    Returns:
        The ``cache`` list itself: ``'pdf'`` first, followed by every entry
        of ``unoconv_supported`` when unoconv is usable.
    """
    # A leading None marks the memo as not yet filled in.
    if cache[0] is None:
        cache[0] = 'pdf'
        if is_unoconv_working():
            cache.extend(unoconv_supported)
    return cache
def where(name):
    """
    Find ``name`` in the directories listed in the PATH environment variable.

    Args:
        name: file name of the program to look for (e.g. ``'pdfinfo'``).

    Returns:
        The full path of the first regular file called ``name`` found on
        PATH, or None when there is no match or PATH is unset or empty.
    """
    search_path = os.environ.get('PATH')
    if not search_path:
        # No PATH at all: nothing to search, rather than raising KeyError.
        return None
    for directory in search_path.split(os.pathsep):
        fullpath = os.path.join(directory, name)
        # isfile() rather than exists(): a directory that happens to carry
        # the program's name cannot be executed.
        if os.path.isfile(fullpath):
            return fullpath
    return None
def check_prerequisites():
    """
    Check that the external programs this module shells out to are on PATH.

    Returns:
        True when both ``pdfinfo`` and ``pdftocairo`` are found by
        ``where``; False (after logging which one is missing) otherwise.
    """
    for tool in ('pdfinfo', 'pdftocairo'):
        if not where(tool):
            # Name the tool actually looked up (the old message said
            # 'pdfcairo' for pdftocairo).
            _log.warning('missing %s', tool)
            return False
    return True
def sniff_handler(media_file, **kw):
    """
    Decide whether an uploaded file should be handled by the pdf media type.

    The decision is based only on the extension of ``kw['media'].filename``;
    ``media_file`` itself is not read.

    Args:
        media_file: the uploaded file object (unused here).
        **kw: expected to carry ``media``, an object with a ``filename``
            attribute.

    Returns:
        ``MEDIA_TYPE`` when the prerequisites are installed and the
        extension is one of ``supported_extensions()``; None otherwise.
    """
    _log.info('Sniffing {0}'.format(MEDIA_TYPE))
    if not check_prerequisites():
        return None

    media = kw.get('media')
    if media is None:
        return None

    ext = os.path.splitext(media.filename)[1]
    # Drop the leading dot and normalise case before the lookup.
    clean_ext = ext[1:].lower()
    if clean_ext in supported_extensions():
        return MEDIA_TYPE
    return None
def create_pdf_thumb(original, thumb_filename, width, height):
    """
    Render a PNG thumbnail of a pdf with ``pdftocairo``.

    Args:
        original: path of the pdf to render.
        thumb_filename: path of the PNG to produce.
        width: maximum thumbnail width.
        height: maximum thumbnail height.

    The smaller of ``width`` and ``height`` is passed to ``-scale-to``.
    The exit status of pdftocairo is not checked.
    """
    # Note: pdftocairo adds '.png', remove it - but only when it is really
    # there, so a name with another suffix is not truncated by four
    # arbitrary characters.
    if thumb_filename.lower().endswith('.png'):
        thumb_filename = thumb_filename[:-4]
    executable = where('pdftocairo')
    args = [executable, '-scale-to', str(min(width, height)),
            '-singlefile', '-png', original, thumb_filename]
    _log.debug('calling {0}'.format(repr(' '.join(args))))
    Popen(executable=executable, args=args).wait()
def pdf_info(original):
    """
    Extract dictionary of pdf information. This could use a library instead
    of a process.

    Note: I'm assuming pdfinfo output is sanitized (integers where integers are
    expected, etc.) - if this is wrong then an exception will be raised and caught
    leading to the dreaded error page. It seems a safe assumption.

    Args:
        original: path of the pdf file handed to ``pdfinfo``.

    Returns:
        A dict with ``pdf_*`` keys built from the ``Key: value`` lines that
        pdfinfo prints (pages, page size, version, author, title, ...).

    Raises:
        BadMediaFail: when the pdfinfo process cannot be started.
        KeyError / ValueError: when the 'Page size' or 'PDF version' fields
            are missing or malformed (see the note above).
    """
    ret_dict = {}
    pdfinfo = where('pdfinfo')
    try:
        proc = Popen(executable=pdfinfo,
                     args=[pdfinfo, original], stdout=PIPE)
        # communicate() reads everything, closes the pipe and reaps the child
        output = proc.communicate()[0]
    except OSError:
        _log.debug('pdfinfo could not read the pdf file.')
        raise BadMediaFail()

    # The pipe yields bytes; decode once so the str operations below work
    # on Python 3 as well.
    text = output.decode('utf-8', 'replace')

    info_dict = {}
    for line in text.splitlines():
        if ':' in line:
            key, value = line.split(':', 1)
            info_dict[key.strip()] = value.strip()

    # The (db_key, pdfinfo_key) pairs must be unpacked: testing the whole
    # tuple against info_dict never matched, so no date was ever extracted.
    # NOTE(review): these two keys were therefore never produced before -
    # confirm the media data model accepts 'pdf_mod_date' and
    # 'pdf_creation_date'.
    for db_key, date_key in [('pdf_mod_date', 'ModDate'),
                             ('pdf_creation_date', 'CreationDate')]:
        if date_key in info_dict:
            try:
                ret_dict[db_key] = dateutil.parser.parse(info_dict[date_key])
            except (ValueError, OverflowError):
                # An unparseable date is not worth failing the upload for.
                _log.debug('could not parse {0}: {1!r}'.format(
                    date_key, info_dict[date_key]))

    for db_key, int_key in [('pdf_pages', 'Pages')]:
        if int_key in info_dict:
            ret_dict[db_key] = int(info_dict[int_key])

    # parse 'PageSize' field: 595 x 842 pts (A4)
    page_size_parts = info_dict['Page size'].split()
    ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
    ret_dict['pdf_page_size_height'] = float(page_size_parts[2])

    for db_key, str_key in [('pdf_keywords', 'Keywords'),
                            ('pdf_creator', 'Creator'),
                            ('pdf_producer', 'Producer'),
                            ('pdf_author', 'Author'),
                            ('pdf_title', 'Title')]:
        ret_dict[db_key] = info_dict.get(str_key, None)

    ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
        [int(part) for part in info_dict['PDF version'].split('.')]

    return ret_dict
class CommonPdfProcessor(MediaProcessor):
    """
    Provides a base for various pdf processing steps
    """
    # Media file keys that may be used as the processing input.
    acceptable_files = ['original', 'pdf']

    def common_setup(self):
        """
        Set up common pdf processing steps

        Sets ``self.process_filename``, ``self.name_builder`` and
        ``self.pdf_filename``.
        """
        # Pull down and set up the processing file
        self.process_filename = get_process_filename(
            self.entry, self.workbench, self.acceptable_files)
        self.name_builder = FilenameBuilder(self.process_filename)

        self._set_pdf_filename()

    def _set_pdf_filename(self):
        """
        Point ``self.pdf_filename`` at a local pdf version of the media:
        the processing file itself when it is a pdf, an already stored
        'pdf' media file, or a freshly converted one.
        """
        if self.name_builder.ext == '.pdf':
            self.pdf_filename = self.process_filename
        elif self.entry.media_files.get('pdf'):
            self.pdf_filename = self.workbench.localized_file(
                mgg.public_store, self.entry.media_files['pdf'])
        else:
            self.pdf_filename = self._generate_pdf()

    def copy_original(self):
        """
        Store the processing file under its original base name and extension
        by delegating to the module-level ``copy_original`` helper.
        """
        copy_original(
            self.entry, self.process_filename,
            self.name_builder.fill('{basename}{ext}'))

    def _render_png(self, keyname, label, size):
        """
        Render ``self.pdf_filename`` to a PNG with pdftocairo and store it
        publicly under the media file key ``keyname``.

        ``label`` is the name part put between the base name and '.png';
        ``size`` is a (max_width, max_height) pair whose smaller value is
        passed to ``-scale-to``.
        """
        # Note: pdftocairo adds '.png', so don't include an ext
        target = os.path.join(
            self.workbench.dir,
            self.name_builder.fill('{basename}.' + label))

        executable = where('pdftocairo')
        args = [executable, '-scale-to', str(min(size)),
                '-singlefile', '-png', self.pdf_filename, target]

        _log.debug('calling {0}'.format(repr(' '.join(args))))
        Popen(executable=executable, args=args).wait()

        # since pdftocairo added '.png', we need to include it with the
        # filename handed to store_public
        store_public(self.entry, keyname, target + '.png',
                     self.name_builder.fill('{basename}.' + label + '.png'))

    def generate_thumb(self, thumb_size=None):
        """
        Create and store the 'thumb' PNG.

        ``thumb_size`` is a (max_width, max_height) pair; when not given,
        the 'media:thumb' values of the global config are used.
        """
        if not thumb_size:
            thumb_size = (mgg.global_config['media:thumb']['max_width'],
                          mgg.global_config['media:thumb']['max_height'])
        self._render_png('thumb', 'thumbnail', thumb_size)

    def _generate_pdf(self):
        """
        Store the pdf. If the file is not a pdf, make it a pdf

        Returns the local path of the stored pdf. Raises BadMediaFail when
        the converted file does not exist after running unoconv.
        """
        # unoconv is run without an output option, so look for its result
        # next to the input with a '.pdf' extension. Checking
        # self.process_filename itself (as before) always succeeded and
        # stored the unconverted document as the pdf.
        tmp_pdf = os.path.splitext(self.process_filename)[0] + '.pdf'

        unoconv = where('unoconv')
        Popen(executable=unoconv,
              args=[unoconv, '-v', '-f', 'pdf', self.process_filename]).wait()

        if not os.path.exists(tmp_pdf):
            _log.debug('unoconv failed to convert file to pdf')
            raise BadMediaFail()

        store_public(self.entry, 'pdf', tmp_pdf,
                     self.name_builder.fill('{basename}.pdf'))

        return self.workbench.localized_file(
            mgg.public_store, self.entry.media_files['pdf'])

    def extract_pdf_info(self):
        """
        Run ``pdf_info`` on the pdf and hand the resulting fields to
        ``self.entry.media_data_init``.
        """
        pdf_info_dict = pdf_info(self.pdf_filename)
        self.entry.media_data_init(**pdf_info_dict)

    def generate_medium(self, size=None):
        """
        Create and store the 'medium' PNG.

        ``size`` is a (max_width, max_height) pair; when not given, the
        'media:medium' values of the global config are used.
        """
        if not size:
            size = (mgg.global_config['media:medium']['max_width'],
                    mgg.global_config['media:medium']['max_height'])
        self._render_png('medium', 'medium', size)
class InitialProcessor(CommonPdfProcessor):
    """
    Initial processing step for new pdfs
    """
    # NOTE(review): identifier of this processor; the assignment is not
    # legible in the reviewed text - confirm the value.
    name = "initial"
    description = "Initial processing"

    @classmethod
    def media_is_eligible(cls, entry=None, state=None):
        """
        Determine if this media type is eligible for processing

        Uses ``state`` when given, otherwise ``entry.state``; eligible
        states are "unprocessed" and "failed".
        """
        if not state:
            state = entry.state
        return state in (
            "unprocessed", "failed")

    @classmethod
    def generate_parser(cls):
        """
        Build the command line parser for this processor: optional
        ``--size`` and ``--thumb-size``, each taking a max_width and a
        max_height.
        """
        parser = argparse.ArgumentParser(
            description=cls.description,
            # NOTE(review): prog= is not legible in the reviewed text
            prog=cls.name)

        parser.add_argument(
            '--size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        parser.add_argument(
            '--thumb-size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        return parser

    @classmethod
    def args_to_request(cls, args):
        """
        Turn parsed arguments into a request via ``request_from_args``,
        keeping the 'size' and 'thumb_size' values.
        """
        return request_from_args(
            args, ['size', 'thumb_size'])

    def process(self, size=None, thumb_size=None):
        """
        Run the full initial pipeline: set up, extract the pdf metadata,
        store the original, render the medium and thumb images, then delete
        the queue file.
        """
        self.common_setup()
        self.extract_pdf_info()
        self.copy_original()
        self.generate_medium(size=size)
        self.generate_thumb(thumb_size=thumb_size)
        self.delete_queue_file()
class Resizer(CommonPdfProcessor):
    """
    Resizing process steps for processed pdfs
    """
    # NOTE(review): identifier of this processor; the assignment is not
    # legible in the reviewed text - confirm the value.
    name = 'resize'
    description = 'Resize thumbnail and medium'

    @classmethod
    def media_is_eligible(cls, entry=None, state=None):
        """
        Determine if this media type is eligible for processing

        Uses ``state`` when given, otherwise ``entry.state``; only the
        exact state 'processed' is eligible.
        """
        if not state:
            state = entry.state
        # Equality, not `in`: `state in 'processed'` is a substring test
        # and accepted values such as 'process' or 'ed'.
        return state == 'processed'

    @classmethod
    def generate_parser(cls):
        """
        Build the command line parser for this processor: an optional
        ``--size`` (max_width, max_height) and a positional ``file`` that
        selects 'medium' or 'thumb'.
        """
        parser = argparse.ArgumentParser(
            description=cls.description,
            # NOTE(review): prog= is not legible in the reviewed text
            prog=cls.name)

        parser.add_argument(
            '--size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        parser.add_argument(
            'file',
            choices=['medium', 'thumb'])

        return parser

    @classmethod
    def args_to_request(cls, args):
        """
        Turn parsed arguments into a request via ``request_from_args``,
        keeping the 'size' and 'file' values.
        """
        return request_from_args(
            args, ['size', 'file'])

    def process(self, file, size=None):
        """
        Regenerate one derived image.

        ``file`` chooses which one ('medium' or 'thumb'); any other value
        renders nothing. ``size`` is forwarded as the target size.
        """
        self.common_setup()
        if file == 'medium':
            self.generate_medium(size=size)
        elif file == 'thumb':
            self.generate_thumb(thumb_size=size)
class PdfProcessingManager(ProcessingManager):
    """
    Processing manager for the pdf media type; registers the
    InitialProcessor and Resizer processors on construction.
    """
    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) resolves
        # to the subclass when this class is inherited from, which makes
        # __init__ call itself forever.
        super(PdfProcessingManager, self).__init__()
        self.add_processor(InitialProcessor)
        self.add_processor(Resizer)