1 # GNU MediaGoblin -- federated, autonomous media hosting
2 # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
import logging
import os
from subprocess import PIPE, Popen

import dateutil.parser

from mediagoblin import mg_globals as mgg
from mediagoblin.processing import (create_pub_filepath,
                                    FilenameBuilder, BadMediaFail)
from mediagoblin.tools.translate import fake_ugettext_passthrough as _
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Identifier of this media type.
MEDIA_TYPE = 'mediagoblin.media_types.pdf'

# TODO - cache (memoize) util

# This is a list created via unoconv --show and hand removing some types that
# we already support via other media types better.  Entries that were removed
# are kept as comments so the list can be compared with the tool's output.
unoconv_supported = [
    'bib',  # - BibTeX [.bib]
    #bmp - Windows Bitmap [.bmp]
    'csv',  # - Text CSV [.csv]
    'dbf',  # - dBASE [.dbf]
    'dif',  # - Data Interchange Format [.dif]
    'doc6',  # - Microsoft Word 6.0 [.doc]
    'doc95',  # - Microsoft Word 95 [.doc]
    'docbook',  # - DocBook [.xml]
    'doc',  # - Microsoft Word 97/2000/XP [.doc]
    'docx7',  # - Microsoft Office Open XML [.docx]
    'docx',  # - Microsoft Office Open XML [.docx]
    #emf - Enhanced Metafile [.emf]
    'eps',  # - Encapsulated PostScript [.eps]
    'fodp',  # - OpenDocument Presentation (Flat XML) [.fodp]
    'fods',  # - OpenDocument Spreadsheet (Flat XML) [.fods]
    'fodt',  # - OpenDocument Text (Flat XML) [.fodt]
    #gif - Graphics Interchange Format [.gif]
    'html',  # - HTML Document (OpenOffice.org Writer) [.html]
    #jpg - Joint Photographic Experts Group [.jpg]
    'latex',  # - LaTeX 2e [.ltx]
    'mediawiki',  # - MediaWiki [.txt]
    'met',  # - OS/2 Metafile [.met]
    'odd',  # - OpenDocument Drawing [.odd]
    'odg',  # - ODF Drawing (Impress) [.odg]
    'odp',  # - ODF Presentation [.odp]
    'ods',  # - ODF Spreadsheet [.ods]
    'odt',  # - ODF Text Document [.odt]
    'ooxml',  # - Microsoft Office Open XML [.xml]
    'otg',  # - OpenDocument Drawing Template [.otg]
    'otp',  # - ODF Presentation Template [.otp]
    'ots',  # - ODF Spreadsheet Template [.ots]
    'ott',  # - Open Document Text [.ott]
    #pbm - Portable Bitmap [.pbm]
    #pct - Mac Pict [.pct]
    'pdb',  # - AportisDoc (Palm) [.pdb]
    #pdf - Portable Document Format [.pdf]
    #pgm - Portable Graymap [.pgm]
    #png - Portable Network Graphic [.png]
    'pot',  # - Microsoft PowerPoint 97/2000/XP Template [.pot]
    'potm',  # - Microsoft PowerPoint 2007/2010 XML Template [.potm]
    #ppm - Portable Pixelmap [.ppm]
    'pps',  # - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
    'ppt',  # - Microsoft PowerPoint 97/2000/XP [.ppt]
    'pptx',  # - Microsoft PowerPoint 2007/2010 XML [.pptx]
    'psw',  # - Pocket Word [.psw]
    'pwp',  # - PlaceWare [.pwp]
    'pxl',  # - Pocket Excel [.pxl]
    #ras - Sun Raster Image [.ras]
    'rtf',  # - Rich Text Format [.rtf]
    'sda',  # - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
    'sdc3',  # - StarCalc 3.0 [.sdc]
    'sdc4',  # - StarCalc 4.0 [.sdc]
    'sdc',  # - StarCalc 5.0 [.sdc]
    'sdd3',  # - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
    'sdd4',  # - StarImpress 4.0 [.sdd]
    'sdd',  # - StarImpress 5.0 [.sdd]
    'sdw3',  # - StarWriter 3.0 [.sdw]
    'sdw4',  # - StarWriter 4.0 [.sdw]
    'sdw',  # - StarWriter 5.0 [.sdw]
    'slk',  # - SYLK [.slk]
    'stc',  # - OpenOffice.org 1.0 Spreadsheet Template [.stc]
    'std',  # - OpenOffice.org 1.0 Drawing Template [.std]
    'sti',  # - OpenOffice.org 1.0 Presentation Template [.sti]
    'stw',  # - Open Office.org 1.0 Text Document Template [.stw]
    #svg - Scalable Vector Graphics [.svg]
    'svm',  # - StarView Metafile [.svm]
    'swf',  # - Macromedia Flash (SWF) [.swf]
    'sxc',  # - OpenOffice.org 1.0 Spreadsheet [.sxc]
    'sxd3',  # - StarDraw 3.0 [.sxd]
    'sxd5',  # - StarDraw 5.0 [.sxd]
    'sxd',  # - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
    'sxi',  # - OpenOffice.org 1.0 Presentation [.sxi]
    'sxw',  # - Open Office.org 1.0 Text Document [.sxw]
    #text - Text Encoded [.txt]
    #tiff - Tagged Image File Format [.tiff]
    'uop',  # - Unified Office Format presentation [.uop]
    'uos',  # - Unified Office Format spreadsheet [.uos]
    'uot',  # - Unified Office Format text [.uot]
    'vor3',  # - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
    'vor4',  # - StarWriter 4.0 Template [.vor]
    'vor5',  # - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
    'vor',  # - StarCalc 5.0 Template [.vor]
    #wmf - Windows Metafile [.wmf]
    'xhtml',  # - XHTML Document [.html]
    'xls5',  # - Microsoft Excel 5.0 [.xls]
    'xls95',  # - Microsoft Excel 95 [.xls]
    'xls',  # - Microsoft Excel 97/2000/XP [.xls]
    'xlt5',  # - Microsoft Excel 5.0 Template [.xlt]
    'xlt95',  # - Microsoft Excel 95 Template [.xlt]
    'xlt',  # - Microsoft Excel 97/2000/XP Template [.xlt]
    #xpm - X PixMap [.xpm]
]
def is_unoconv_working():
    """Report whether the ``unoconv`` converter can be run on this host.

    Looks ``unoconv`` up with ``where`` and runs ``unoconv --show`` with
    stderr captured.  The tool is treated as unusable when it is not found,
    cannot be started, or writes ``ERROR`` to stderr.

    Returns:
        bool: True when unoconv looks usable, False otherwise.
    """
    # TODO: must have libreoffice-headless installed too, need to check for it
    unoconv = where('unoconv')
    if not unoconv:
        return False

    try:
        proc = Popen([unoconv, '--show'], stderr=PIPE)
        # communicate() drains stderr and waits for the child, so neither an
        # open pipe nor an unreaped process is left behind.
        output = proc.communicate()[1]
    except OSError:
        _log.warning(_('unoconv failing to run, check log file'))
        return False

    # The pipe yields bytes; a bytes literal compares correctly on Python 2
    # (where it is a plain str) and on Python 3.
    return b'ERROR' not in output
def supported_extensions(cache=[None]):
    """Return the list of file extensions this media type accepts.

    The mutable default argument is intentional: it acts as a per-process
    memo so that ``is_unoconv_working`` is probed at most once.  A first
    element of ``None`` marks the cache as not yet populated.

    Args:
        cache: list used as the memo.  Callers normally omit it; a list
            whose first element is not ``None`` is returned untouched.

    Returns:
        list: ``cache`` itself, holding ``'pdf'`` followed by every entry of
        ``unoconv_supported`` when unoconv is usable.
    """
    if not cache:
        # Tolerate an empty caller-supplied list instead of failing on
        # cache[0] below.
        cache.append(None)
    if cache[0] is None:
        cache[0] = 'pdf'
        if is_unoconv_working():
            cache.extend(unoconv_supported)
    return cache
def where(name):
    """Locate the executable *name* in the directories listed in ``PATH``.

    Args:
        name: file name of the program to look for, e.g. ``'pdfinfo'``.

    Returns:
        The full path of the first match, or ``None`` when ``PATH`` is unset
        or no directory contains an executable file with that name.
    """
    for directory in os.environ.get('PATH', '').split(os.pathsep):
        if not directory:
            # An empty entry would silently resolve against the current
            # working directory; do not pick tools up from there.
            continue
        candidate = os.path.join(directory, name)
        # Require a regular, executable file: the result is handed straight
        # to Popen by the callers.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None
def check_prerequisites():
    """Check that the external tools this media type shells out to exist.

    Both ``pdfinfo`` and ``pdftocairo`` must be found by ``where``.  The
    first missing tool is logged as a warning.

    Returns:
        bool: True when both tools were found, False otherwise.
    """
    for tool in ('pdfinfo', 'pdftocairo'):
        if not where(tool):
            # Log the real tool name so the message can be acted upon.
            _log.warning('missing {0}'.format(tool))
            return False
    return True
def sniff_handler(media_file, **kw):
    """Decide from the upload's file name whether this media type applies.

    Args:
        media_file: the uploaded file object; not inspected here.
        **kw: may carry ``media``, an object whose ``filename`` attribute is
            the name of the uploaded file.

    Returns:
        ``MEDIA_TYPE`` when the prerequisites are installed and the lowercased
        extension of ``kw['media'].filename`` is in
        ``supported_extensions()``; ``None`` otherwise.
    """
    _log.info('Sniffing {0}'.format(MEDIA_TYPE))
    if not check_prerequisites():
        return None

    media = kw.get('media')
    if media is None:
        return None

    # splitext keeps the leading dot on the extension; drop it and normalise
    # the case before the lookup.
    clean_ext = os.path.splitext(media.filename)[1][1:].lower()
    if clean_ext in supported_extensions():
        return MEDIA_TYPE
    return None
def create_pdf_thumb(original, thumb_filename, width, height):
    """Render *original* to a PNG image by running ``pdftocairo``.

    Args:
        original: path of the PDF file to render.
        thumb_filename: path of the PNG to produce; its last four characters
            (expected to be ``.png``) are dropped before it is handed to the
            tool.
        width: maximum width; only ``min(width, height)`` is used.
        height: maximum height; only ``min(width, height)`` is used.

    The tool's exit status is waited for but not inspected.
    """
    # pdftocairo appends '.png' on its own, so pass the name without it.
    output_root = thumb_filename[:-4]
    pdftocairo = where('pdftocairo')
    scale = str(min(width, height))
    command = [
        pdftocairo,
        '-scale-to', scale,
        '-singlefile',
        '-png',
        original,
        output_root,
    ]
    command_text = ' '.join(command)
    _log.debug('calling {0}'.format(repr(command_text)))
    renderer = Popen(executable=pdftocairo, args=command)
    renderer.wait()
def pdf_info(original):
    """
    Extract dictionary of pdf information. This could use a library instead
    of a process.

    Note: I'm assuming pdfinfo output is sanitized (integers where integers are
    expected, etc.) - if this is wrong then an exception will be raised and caught
    leading to the dreaded error page. It seems a safe assumption.

    Args:
        original: path of the PDF file handed to ``pdfinfo``.

    Returns:
        dict with the keys ``pdf_page_size_width``, ``pdf_page_size_height``,
        ``pdf_version_major``, ``pdf_version_minor``, ``pdf_keywords``,
        ``pdf_creator``, ``pdf_producer``, ``pdf_author``, ``pdf_title``
        (``None`` when pdfinfo did not report the field) and, when reported,
        ``pdf_pages``, ``pdf_mod_date`` and ``pdf_creation_date``.

    Raises:
        BadMediaFail: ``pdfinfo`` could not be started.
        KeyError: the output lacks ``Page size`` or ``PDF version``.
        ValueError: a numeric field is not numeric.
    """
    ret_dict = {}
    pdfinfo = where('pdfinfo')

    try:
        proc = Popen(executable=pdfinfo,
                     args=[pdfinfo, original], stdout=PIPE)
        # communicate() reads all output and waits for the child to exit.
        lines = proc.communicate()[0].splitlines()
    except OSError:
        _log.debug('pdfinfo could not read the pdf file.')
        raise BadMediaFail()

    # pdfinfo prints one "Field:   value" pair per line.
    info_dict = {}
    for line in lines:
        if not isinstance(line, str):
            # Python 3 pipes yield bytes; Python 2 str passes through as is.
            line = line.decode('utf-8', 'replace')
        if ':' not in line:
            continue
        key, value = line.split(':', 1)
        info_dict[key.strip()] = value.strip()

    # Each pair is (result key, pdfinfo field name).
    for db_key, date_key in [('pdf_mod_date', 'ModDate'),
                             ('pdf_creation_date', 'CreationDate')]:
        if date_key not in info_dict:
            continue
        try:
            ret_dict[db_key] = dateutil.parser.parse(info_dict[date_key])
        except (ValueError, OverflowError):
            # A malformed date is optional metadata; do not fail the whole
            # upload because of it.
            _log.debug('could not parse {0} reported by pdfinfo'.format(
                date_key))

    for db_key, int_key in [('pdf_pages', 'Pages')]:
        if int_key in info_dict:
            ret_dict[db_key] = int(info_dict[int_key])

    # parse 'PageSize' field: 595 x 842 pts (A4)
    page_size_parts = info_dict['Page size'].split()
    ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
    ret_dict['pdf_page_size_height'] = float(page_size_parts[2])

    for db_key, str_key in [('pdf_keywords', 'Keywords'),
                            ('pdf_creator', 'Creator'),
                            ('pdf_producer', 'Producer'),
                            ('pdf_author', 'Author'),
                            ('pdf_title', 'Title')]:
        ret_dict[db_key] = info_dict.get(str_key, None)

    version_major, version_minor = [
        int(part) for part in info_dict['PDF version'].split('.')]
    ret_dict['pdf_version_major'] = version_major
    ret_dict['pdf_version_minor'] = version_minor

    return ret_dict
def process_pdf(proc_state):
    """Code to process a pdf file. Will be run by celery.

    A Workbench() represents a local tempory dir. It is automatically
    cleaned up when this function exits.

    Steps: copy the queued original to its final destination, convert it to
    PDF with unoconv when it is not already one, read the PDF metadata,
    render the 'thumb' and 'medium' PNGs, drop the queue file and hand the
    metadata to ``entry.media_data_init``.

    Args:
        proc_state: processing state object providing ``entry``,
            ``workbench``, ``get_queued_filename()``, ``copy_original()``,
            ``store_public()`` and ``delete_queue_file()``.

    Raises:
        BadMediaFail: the conversion to PDF or the rendering of a preview
            image produced no output file.
    """
    entry = proc_state.entry
    workbench = proc_state.workbench

    queued_filename = proc_state.get_queued_filename()
    name_builder = FilenameBuilder(queued_filename)

    # Copy our queued local workbench to its final destination
    original_dest = name_builder.fill('{basename}{ext}')
    proc_state.copy_original(original_dest)

    # Create a pdf if this is a different doc, store pdf for viewer.
    # splitext only looks at the last path component, so a dot in a directory
    # name is not mistaken for the extension separator.
    root, ext = os.path.splitext(queued_filename)
    if ext[1:].lower() == 'pdf':
        pdf_filename = queued_filename
    else:
        pdf_filename = root + '.pdf'
        unoconv = where('unoconv')
        Popen(executable=unoconv,
              args=[unoconv, '-v', '-f', 'pdf', queued_filename]).wait()
        if not os.path.exists(pdf_filename):
            _log.debug('unoconv failed to convert file to pdf')
            raise BadMediaFail()
        # NOTE(review): the converted PDF is stored only in this branch, on
        # the assumption that a native PDF is served from the original copy -
        # confirm against the viewer template.
        proc_state.store_public(keyname=u'pdf', local_file=pdf_filename)

    pdf_info_dict = pdf_info(pdf_filename)

    for name, width, height in [
            (u'thumb', mgg.global_config['media:thumb']['max_width'],
             mgg.global_config['media:thumb']['max_height']),
            (u'medium', mgg.global_config['media:medium']['max_width'],
             mgg.global_config['media:medium']['max_height']),
    ]:
        filename = name_builder.fill('{basename}.%s.png' % name)
        path = workbench.joinpath(filename)
        create_pdf_thumb(pdf_filename, path, width, height)
        # Explicit check rather than assert: asserts vanish under -O and the
        # failure should be reported like the conversion failure above.
        if not os.path.exists(path):
            _log.debug('pdftocairo failed to create {0}'.format(filename))
            raise BadMediaFail()
        proc_state.store_public(keyname=name, local_file=path)

    proc_state.delete_queue_file()

    entry.media_data_init(**pdf_info_dict)