Commit | Line | Data |
---|---|---|
a80ebf3b AL |
1 | # GNU MediaGoblin -- federated, autonomous media hosting |
2 | # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS. | |
3 | # | |
4 | # This program is free software: you can redistribute it and/or modify | |
5 | # it under the terms of the GNU Affero General Public License as published by | |
6 | # the Free Software Foundation, either version 3 of the License, or | |
7 | # (at your option) any later version. | |
8 | # | |
9 | # This program is distributed in the hope that it will be useful, | |
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | # GNU Affero General Public License for more details. | |
13 | # | |
14 | # You should have received a copy of the GNU Affero General Public License | |
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
a80ebf3b | 16 | import os |
a80ebf3b AL |
17 | import logging |
18 | import dateutil.parser | |
519bcfb0 | 19 | from subprocess import PIPE, Popen |
a80ebf3b AL |
20 | |
21 | from mediagoblin import mg_globals as mgg | |
22 | from mediagoblin.processing import (create_pub_filepath, | |
23 | FilenameBuilder, BadMediaFail) | |
24 | from mediagoblin.tools.translate import fake_ugettext_passthrough as _ | |
25 | ||
26 | _log = logging.getLogger(__name__) | |
27 | ||
28 | # TODO - cache (memoize) util | |
29 | ||
# Format names taken from the output of ``unoconv --show``. Entries that
# another media type already handles better were removed by hand and are kept
# below as commented-out lines for reference.
unoconv_supported = [
    'bib',        # BibTeX [.bib]
    # bmp       - Windows Bitmap [.bmp]
    'csv',        # Text CSV [.csv]
    'dbf',        # dBASE [.dbf]
    'dif',        # Data Interchange Format [.dif]
    'doc6',       # Microsoft Word 6.0 [.doc]
    'doc95',      # Microsoft Word 95 [.doc]
    'docbook',    # DocBook [.xml]
    'doc',        # Microsoft Word 97/2000/XP [.doc]
    'docx7',      # Microsoft Office Open XML [.docx]
    'docx',       # Microsoft Office Open XML [.docx]
    # emf       - Enhanced Metafile [.emf]
    'eps',        # Encapsulated PostScript [.eps]
    'fodp',       # OpenDocument Presentation (Flat XML) [.fodp]
    'fods',       # OpenDocument Spreadsheet (Flat XML) [.fods]
    'fodt',       # OpenDocument Text (Flat XML) [.fodt]
    # gif       - Graphics Interchange Format [.gif]
    'html',       # HTML Document (OpenOffice.org Writer) [.html]
    # jpg       - Joint Photographic Experts Group [.jpg]
    'latex',      # LaTeX 2e [.ltx]
    'mediawiki',  # MediaWiki [.txt]
    'met',        # OS/2 Metafile [.met]
    'odd',        # OpenDocument Drawing [.odd]
    'odg',        # ODF Drawing (Impress) [.odg]
    'odp',        # ODF Presentation [.odp]
    'ods',        # ODF Spreadsheet [.ods]
    'odt',        # ODF Text Document [.odt]
    'ooxml',      # Microsoft Office Open XML [.xml]
    'otg',        # OpenDocument Drawing Template [.otg]
    'otp',        # ODF Presentation Template [.otp]
    'ots',        # ODF Spreadsheet Template [.ots]
    'ott',        # Open Document Text [.ott]
    # pbm       - Portable Bitmap [.pbm]
    # pct       - Mac Pict [.pct]
    'pdb',        # AportisDoc (Palm) [.pdb]
    # pdf       - Portable Document Format [.pdf]
    # pgm       - Portable Graymap [.pgm]
    # png       - Portable Network Graphic [.png]
    'pot',        # Microsoft PowerPoint 97/2000/XP Template [.pot]
    'potm',       # Microsoft PowerPoint 2007/2010 XML Template [.potm]
    # ppm       - Portable Pixelmap [.ppm]
    'pps',        # Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
    'ppt',        # Microsoft PowerPoint 97/2000/XP [.ppt]
    'pptx',       # Microsoft PowerPoint 2007/2010 XML [.pptx]
    'psw',        # Pocket Word [.psw]
    'pwp',        # PlaceWare [.pwp]
    'pxl',        # Pocket Excel [.pxl]
    # ras       - Sun Raster Image [.ras]
    'rtf',        # Rich Text Format [.rtf]
    'sda',        # StarDraw 5.0 (OpenOffice.org Impress) [.sda]
    'sdc3',       # StarCalc 3.0 [.sdc]
    'sdc4',       # StarCalc 4.0 [.sdc]
    'sdc',        # StarCalc 5.0 [.sdc]
    'sdd3',       # StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
    'sdd4',       # StarImpress 4.0 [.sdd]
    'sdd',        # StarImpress 5.0 [.sdd]
    'sdw3',       # StarWriter 3.0 [.sdw]
    'sdw4',       # StarWriter 4.0 [.sdw]
    'sdw',        # StarWriter 5.0 [.sdw]
    'slk',        # SYLK [.slk]
    'stc',        # OpenOffice.org 1.0 Spreadsheet Template [.stc]
    'std',        # OpenOffice.org 1.0 Drawing Template [.std]
    'sti',        # OpenOffice.org 1.0 Presentation Template [.sti]
    'stw',        # OpenOffice.org 1.0 Text Document Template [.stw]
    # svg       - Scalable Vector Graphics [.svg]
    'svm',        # StarView Metafile [.svm]
    'swf',        # Macromedia Flash (SWF) [.swf]
    'sxc',        # OpenOffice.org 1.0 Spreadsheet [.sxc]
    'sxd3',       # StarDraw 3.0 [.sxd]
    'sxd5',       # StarDraw 5.0 [.sxd]
    'sxd',        # OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
    'sxi',        # OpenOffice.org 1.0 Presentation [.sxi]
    'sxw',        # OpenOffice.org 1.0 Text Document [.sxw]
    # text      - Text Encoded [.txt]
    # tiff      - Tagged Image File Format [.tiff]
    # txt       - Text [.txt]
    'uop',        # Unified Office Format presentation [.uop]
    'uos',        # Unified Office Format spreadsheet [.uos]
    'uot',        # Unified Office Format text [.uot]
    'vor3',       # StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
    'vor4',       # StarWriter 4.0 Template [.vor]
    'vor5',       # StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
    'vor',        # StarCalc 5.0 Template [.vor]
    # wmf       - Windows Metafile [.wmf]
    'xhtml',      # XHTML Document [.html]
    'xls5',       # Microsoft Excel 5.0 [.xls]
    'xls95',      # Microsoft Excel 95 [.xls]
    'xls',        # Microsoft Excel 97/2000/XP [.xls]
    'xlt5',       # Microsoft Excel 5.0 Template [.xlt]
    'xlt95',      # Microsoft Excel 95 Template [.xlt]
    'xlt',        # Microsoft Excel 97/2000/XP Template [.xlt]
    # xpm       - X PixMap [.xpm]
]
126 | ||
def is_unoconv_working():
    """Report whether a runnable ``unoconv`` executable is available.

    Looks ``unoconv`` up via ``where()`` and runs ``unoconv --show``.

    Returns False when the executable is not found, cannot be started, or
    prints ``ERROR`` on stderr; True otherwise.
    """
    # TODO: must have libreoffice-headless installed too, need to check for it
    unoconv = where('unoconv')
    if not unoconv:
        return False
    try:
        proc = Popen([unoconv, '--show'], stderr=PIPE)
        # communicate() drains stderr and waits for the child, so the
        # process is reaped instead of lingering as a zombie.
        output = proc.communicate()[1]
    except OSError:
        _log.warning(_('unoconv failing to run, check log file'))
        return False
    # The pipe yields bytes; b'...' is a plain str on Python 2, so this
    # comparison works on both major versions.
    if b'ERROR' in output:
        return False
    return True
141 | ||
def supported_extensions(cache=[None]):
    """Return the list of file extensions this media type accepts.

    The result always starts with ``'pdf'``; the unoconv format names are
    appended when ``is_unoconv_working()`` reports success.

    The mutable default for ``cache`` is deliberate: it is the memo that
    survives between calls, so unoconv is probed only on the first call. A
    caller may pass its own list; it is filled in place when its first
    element is None and returned as-is otherwise.
    """
    if cache[0] is None:
        cache[0] = 'pdf'
        if is_unoconv_working():
            cache.extend(unoconv_supported)
    return cache
148 | ||
def where(name):
    """Return the full path of executable ``name`` found on PATH, or None.

    Each PATH entry is tried in order; the first one that holds a regular,
    executable file called ``name`` wins. An unset PATH is treated as empty
    rather than raising KeyError.
    """
    for directory in os.environ.get('PATH', '').split(os.pathsep):
        fullpath = os.path.join(directory, name)
        # Directories and non-executable files cannot be launched, so a
        # bare existence check is not enough.
        if os.path.isfile(fullpath) and os.access(fullpath, os.X_OK):
            return fullpath
    return None
155 | ||
def check_prerequisites():
    """Check that the external tools this media type relies on are on PATH.

    Returns True when both ``pdfinfo`` and ``pdftocairo`` are found by
    ``where()``. Otherwise logs a warning naming the first missing tool and
    returns False.
    """
    for tool in ('pdfinfo', 'pdftocairo'):
        if not where(tool):
            _log.warning('missing %s', tool)
            return False
    return True
164 | ||
def sniff_handler(media_file, **kw):
    """Decide whether an upload should be handled by this media type.

    The decision is based only on the extension of ``kw['media'].filename``;
    ``media_file`` itself is not inspected. Returns True when the external
    tools are present and the lower-cased extension is one of
    ``supported_extensions()``; False otherwise, including when no ``media``
    keyword is supplied.
    """
    if not check_prerequisites():
        return False

    media = kw.get('media')
    if media is None:
        return False

    extension = os.path.splitext(media.filename)[1]
    # Drop the leading dot before comparing.
    return extension[1:].lower() in supported_extensions()
176 | ||
def create_pdf_thumb(original, thumb_filename, width, height):
    """Render a PNG image of pdf ``original`` with ``pdftocairo``.

    ``thumb_filename`` is the desired output path; its last four characters
    are removed before it is handed to pdftocairo (see the note below). The
    image is scaled to the smaller of ``width`` and ``height``. The call
    blocks until pdftocairo exits; its exit status is not inspected.
    """
    # Note: pdftocairo adds '.png' itself, so pass the name without it.
    output_root = thumb_filename[:-4]
    pdftocairo = where('pdftocairo')
    scale = str(min(width, height))
    command = [pdftocairo, '-scale-to', scale,
               '-singlefile', '-png', original, output_root]
    _log.debug('calling {0}'.format(repr(' '.join(command))))
    Popen(executable=pdftocairo, args=command).wait()
a80ebf3b AL |
185 | |
def pdf_info(original):
    """
    Extract a dictionary of pdf information by running ``pdfinfo`` on
    ``original``. This could use a library instead of a process.

    The returned dict holds:

    - pdf_mod_date, pdf_creation_date: parsed with dateutil, present only
      when pdfinfo reports the field and the value can be parsed
    - pdf_pages: int, present only when pdfinfo reports 'Pages'
    - pdf_page_size_width, pdf_page_size_height: floats from 'Page size'
    - pdf_keywords, pdf_creator, pdf_producer, pdf_author, pdf_title:
      strings, or None when pdfinfo does not report them
    - pdf_version_major, pdf_version_minor: ints from 'PDF version'

    Raises BadMediaFail when pdfinfo is missing or cannot be run, or when
    its output lacks a usable 'Page size' or 'PDF version' field.

    Note: 'Pages' is assumed to be an integer when present; if it is not,
    the resulting ValueError propagates to the caller.
    """
    ret_dict = {}
    pdfinfo = where('pdfinfo')
    if not pdfinfo:
        _log.debug('pdfinfo executable not found.')
        raise BadMediaFail()
    try:
        proc = Popen(executable=pdfinfo,
                     args=[pdfinfo, original], stdout=PIPE)
        # communicate() reads all of stdout and reaps the child process.
        output = proc.communicate()[0]
    except OSError:
        _log.debug('pdfinfo could not read the pdf file.')
        raise BadMediaFail()

    # On Python 3 the pipe yields bytes; on Python 2 it is already a str.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')

    # Every "Key: value" line becomes one entry; split on the first colon
    # only, because values (dates, for instance) contain colons themselves.
    info_dict = {}
    for line in output.splitlines():
        if ':' in line:
            key, value = line.split(':', 1)
            info_dict[key.strip()] = value.strip()

    for db_key, date_key in [('pdf_mod_date', 'ModDate'),
                             ('pdf_creation_date', 'CreationDate')]:
        if date_key in info_dict:
            try:
                ret_dict[db_key] = dateutil.parser.parse(info_dict[date_key])
            except (ValueError, OverflowError):
                # An unparseable date is not worth failing the upload for.
                _log.debug('could not parse pdfinfo field %s', date_key)
    for db_key, int_key in [('pdf_pages', 'Pages')]:
        if int_key in info_dict:
            ret_dict[db_key] = int(info_dict[int_key])

    try:
        # parse 'Page size' field: 595 x 842 pts (A4)
        page_size_parts = info_dict['Page size'].split()
        page_width = float(page_size_parts[0])
        page_height = float(page_size_parts[2])
        version_major, version_minor = [
            int(part) for part in info_dict['PDF version'].split('.')]
    except (KeyError, IndexError, ValueError):
        # pdfinfo printed nothing usable, e.g. because the file is not a
        # valid pdf.
        _log.debug('pdfinfo output lacks page size or pdf version.')
        raise BadMediaFail()
    ret_dict['pdf_page_size_width'] = page_width
    ret_dict['pdf_page_size_height'] = page_height

    for db_key, str_key in [('pdf_keywords', 'Keywords'),
                            ('pdf_creator', 'Creator'),
                            ('pdf_producer', 'Producer'),
                            ('pdf_author', 'Author'),
                            ('pdf_title', 'Title')]:
        ret_dict[db_key] = info_dict.get(str_key, None)
    ret_dict['pdf_version_major'] = version_major
    ret_dict['pdf_version_minor'] = version_minor

    return ret_dict
229 | ||
def process_pdf(proc_state):
    """Code to process a pdf file. Will be run by celery.

    A Workbench() represents a local temporary dir. It is automatically
    cleaned up when this function exits.

    Steps: copy the queued original to its final destination, convert it to
    pdf with unoconv when it is not a pdf already, read the pdf metadata,
    render the 'thumb' and 'medium' PNG images, delete the queued file and
    save the metadata on the entry.

    Raises BadMediaFail when unoconv is unavailable or produces no pdf, when
    pdf_info() rejects the file, or when an image could not be rendered.
    """
    entry = proc_state.entry
    workbench = proc_state.workbench

    queued_filename = proc_state.get_queued_filename()
    name_builder = FilenameBuilder(queued_filename)

    # Copy our queued local workbench to its final destination
    original_dest = name_builder.fill('{basename}{ext}')
    proc_state.copy_original(original_dest)

    # Create a pdf if this is a different doc, store pdf for viewer.
    # splitext only looks at the final path component, so a dot in a
    # directory name or a file without an extension is handled correctly.
    root, ext = os.path.splitext(queued_filename)
    if ext.lower() == '.pdf':
        pdf_filename = queued_filename
    else:
        pdf_filename = root + '.pdf'
        unoconv = where('unoconv')
        if not unoconv:
            _log.debug('unoconv executable not found')
            raise BadMediaFail()
        Popen(executable=unoconv,
              args=[unoconv, '-v', '-f', 'pdf', queued_filename]).wait()
        if not os.path.exists(pdf_filename):
            _log.debug('unoconv failed to convert file to pdf')
            raise BadMediaFail()
        # NOTE(review): the converted pdf is stored only on this branch;
        # confirm that a native pdf is meant to be served from the stored
        # original rather than from a separate 'pdf' entry.
        proc_state.store_public(keyname=u'pdf', local_file=pdf_filename)

    pdf_info_dict = pdf_info(pdf_filename)

    for name, width, height in [
            (u'thumb', mgg.global_config['media:thumb']['max_width'],
             mgg.global_config['media:thumb']['max_height']),
            (u'medium', mgg.global_config['media:medium']['max_width'],
             mgg.global_config['media:medium']['max_height']),
    ]:
        filename = name_builder.fill('{basename}.%s.png' % name)
        path = workbench.joinpath(filename)
        create_pdf_thumb(pdf_filename, path, width, height)
        # An assert would vanish under "python -O"; fail explicitly.
        if not os.path.exists(path):
            _log.debug('pdftocairo failed to create %s', path)
            raise BadMediaFail()
        proc_state.store_public(keyname=name, local_file=path)

    proc_state.delete_queue_file()

    entry.media_data_init(**pdf_info_dict)
    entry.save()