Fix issue 983 PDF UnicodeDecodeError
[mediagoblin.git] / mediagoblin / media_types / pdf / processing.py
index 549def695f991bbea532c524ec2a5127b7ab548b..ac4bab6d55a51a39594d9fabd9d6a472c9ccc8ff 100644 (file)
@@ -138,10 +138,10 @@ def is_unoconv_working():
     try:
         proc = Popen([unoconv, '--show'], stderr=PIPE)
         output = proc.stderr.read()
-    except OSError, e:
+    except OSError:
         _log.warn(_('unoconv failing to run, check log file'))
         return False
-    if 'ERROR' in output:
+    if b'ERROR' in output:
         return False
     return True
 
@@ -168,18 +168,16 @@ def check_prerequisites():
         return False
     return True
 
-def sniff_handler(media_file, **kw):
+def sniff_handler(media_file, filename):
     _log.info('Sniffing {0}'.format(MEDIA_TYPE))
     if not check_prerequisites():
         return None
-    if kw.get('media') is not None:
-        name, ext = os.path.splitext(kw['media'].filename)
-        clean_ext = ext[1:].lower()
 
-        if clean_ext in supported_extensions():
-            return MEDIA_TYPE
+    name, ext = os.path.splitext(filename)
+    clean_ext = ext[1:].lower()
 
-    return None
+    if clean_ext in supported_extensions():
+        return MEDIA_TYPE
 
 def create_pdf_thumb(original, thumb_filename, width, height):
     # Note: pdftocairo adds '.png', remove it
@@ -209,9 +207,15 @@ def pdf_info(original):
         _log.debug('pdfinfo could not read the pdf file.')
         raise BadMediaFail()
 
+    lines = [l.decode('utf-8', 'replace') for l in lines]
     info_dict = dict([[part.strip() for part in l.strip().split(':', 1)]
                       for l in lines if ':' in l])
 
+    if 'Page size' not in info_dict.keys():
+        # TODO - message is for the user, not debug, but BadMediaFail not taking an argument, fix that.
+        _log.debug('Missing "Page size" key in returned pdf - conversion failed?')
+        raise BadMediaFail()
+
     for date_key in [('pdf_mod_date', 'ModDate'),
                      ('pdf_creation_date', 'CreationDate')]:
         if date_key in info_dict:
@@ -261,6 +265,22 @@ class CommonPdfProcessor(MediaProcessor):
         else:
             self.pdf_filename = self._generate_pdf()
 
+    def _skip_processing(self, keyname, **kwargs):
+        file_metadata = self.entry.get_file_metadata(keyname)
+        skip = True
+
+        if not file_metadata:
+            return False
+
+        if keyname == 'thumb':
+            if kwargs.get('thumb_size') != file_metadata.get('thumb_size'):
+                skip = False
+        elif keyname == 'medium':
+            if kwargs.get('size') != file_metadata.get('size'):
+                skip = False
+
+        return skip
+
     def copy_original(self):
         copy_original(
             self.entry, self.process_filename,
@@ -271,6 +291,9 @@ class CommonPdfProcessor(MediaProcessor):
             thumb_size = (mgg.global_config['media:thumb']['max_width'],
                           mgg.global_config['media:thumb']['max_height'])
 
+        if self._skip_processing('thumb', thumb_size=thumb_size):
+            return
+
         # Note: pdftocairo adds '.png', so don't include an ext
         thumb_filename = os.path.join(self.workbench.dir,
                                       self.name_builder.fill(
@@ -288,15 +311,19 @@ class CommonPdfProcessor(MediaProcessor):
         store_public(self.entry, 'thumb', thumb_filename + '.png',
                      self.name_builder.fill('{basename}.thumbnail.png'))
 
+        self.entry.set_file_metadata('thumb', thumb_size=thumb_size)
+
     def _generate_pdf(self):
         """
         Store the pdf. If the file is not a pdf, make it a pdf
         """
-        tmp_pdf = self.process_filename
+        tmp_pdf = os.path.splitext(self.process_filename)[0] + '.pdf'
 
         unoconv = where('unoconv')
+        args = [unoconv, '-v', '-f', 'pdf', self.process_filename]
+        _log.debug('calling %s' % repr(args))
         Popen(executable=unoconv,
-              args=[unoconv, '-v', '-f', 'pdf', self.process_filename]).wait()
+              args=args).wait()
 
         if not os.path.exists(tmp_pdf):
             _log.debug('unoconv failed to convert file to pdf')
@@ -317,6 +344,9 @@ class CommonPdfProcessor(MediaProcessor):
             size = (mgg.global_config['media:medium']['max_width'],
                     mgg.global_config['media:medium']['max_height'])
 
+        if self._skip_processing('medium', size=size):
+            return
+
         # Note: pdftocairo adds '.png', so don't include an ext
         filename = os.path.join(self.workbench.dir,
                                 self.name_builder.fill('{basename}.medium'))
@@ -333,6 +363,8 @@ class CommonPdfProcessor(MediaProcessor):
         store_public(self.entry, 'medium', filename + '.png',
                      self.name_builder.fill('{basename}.medium.png'))
 
+        self.entry.set_file_metadata('medium', size=size)
+
 
 class InitialProcessor(CommonPdfProcessor):
     """
@@ -435,6 +467,6 @@ class Resizer(CommonPdfProcessor):
 
 class PdfProcessingManager(ProcessingManager):
     def __init__(self):
-        super(self.__class__, self).__init__()
+        super(PdfProcessingManager, self).__init__()
         self.add_processor(InitialProcessor)
         self.add_processor(Resizer)