X-Git-Url: https://vcs.fsf.org/?a=blobdiff_plain;f=mediagoblin%2Fmedia_types%2Fpdf%2Fprocessing.py;h=ac4bab6d55a51a39594d9fabd9d6a472c9ccc8ff;hb=e2b44bd7a7478792be47c98086f5c91d0da283e2;hp=49742fd72256d7f4baf01717252d6434602e4ac8;hpb=94fadafe0908e497b9562f6e2689d83d07c5147d;p=mediagoblin.git diff --git a/mediagoblin/media_types/pdf/processing.py b/mediagoblin/media_types/pdf/processing.py index 49742fd7..ac4bab6d 100644 --- a/mediagoblin/media_types/pdf/processing.py +++ b/mediagoblin/media_types/pdf/processing.py @@ -13,18 +13,24 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import argparse import os import logging import dateutil.parser from subprocess import PIPE, Popen from mediagoblin import mg_globals as mgg -from mediagoblin.processing import (create_pub_filepath, - FilenameBuilder, BadMediaFail) +from mediagoblin.processing import ( + FilenameBuilder, BadMediaFail, + MediaProcessor, ProcessingManager, + request_from_args, get_process_filename, + store_public, copy_original) from mediagoblin.tools.translate import fake_ugettext_passthrough as _ _log = logging.getLogger(__name__) +MEDIA_TYPE = 'mediagoblin.media_types.pdf' + # TODO - cache (memoize) util # This is a list created via uniconv --show and hand removing some types that @@ -132,10 +138,10 @@ def is_unoconv_working(): try: proc = Popen([unoconv, '--show'], stderr=PIPE) output = proc.stderr.read() - except OSError, e: + except OSError: _log.warn(_('unoconv failing to run, check log file')) return False - if 'ERROR' in output: + if b'ERROR' in output: return False return True @@ -162,17 +168,16 @@ def check_prerequisites(): return False return True -def sniff_handler(media_file, **kw): +def sniff_handler(media_file, filename): + _log.info('Sniffing {0}'.format(MEDIA_TYPE)) if not check_prerequisites(): - return False - if kw.get('media') is not None: - name, ext = os.path.splitext(kw['media'].filename) - clean_ext = 
ext[1:].lower() + return None - if clean_ext in supported_extensions(): - return True + name, ext = os.path.splitext(filename) + clean_ext = ext[1:].lower() - return False + if clean_ext in supported_extensions(): + return MEDIA_TYPE def create_pdf_thumb(original, thumb_filename, width, height): # Note: pdftocairo adds '.png', remove it @@ -202,9 +207,15 @@ def pdf_info(original): _log.debug('pdfinfo could not read the pdf file.') raise BadMediaFail() + lines = [l.decode('utf-8', 'replace') for l in lines] info_dict = dict([[part.strip() for part in l.strip().split(':', 1)] for l in lines if ':' in l]) + if 'Page size' not in info_dict.keys(): + # TODO - message is for the user, not debug, but BadMediaFail not taking an argument, fix that. + _log.debug('Missing "Page size" key in returned pdf - conversion failed?') + raise BadMediaFail() + for date_key in [('pdf_mod_date', 'ModDate'), ('pdf_creation_date', 'CreationDate')]: if date_key in info_dict: @@ -227,51 +238,235 @@ def pdf_info(original): return ret_dict -def process_pdf(proc_state): - """Code to process a pdf file. Will be run by celery. - A Workbench() represents a local tempory dir. It is automatically - cleaned up when this function exits. 
class CommonPdfProcessor(MediaProcessor):
    """
    Provides a base for various pdf processing steps

    Call common_setup() before any other method: it sets
    ``process_filename``, ``name_builder`` and ``pdf_filename``, which
    the remaining methods read.
    """
    # File keys passed to get_process_filename() in common_setup().
    acceptable_files = ['original', 'pdf']

    def common_setup(self):
        """
        Set up common pdf processing steps
        """
        # Pull down and set up the processing file
        self.process_filename = get_process_filename(
            self.entry, self.workbench, self.acceptable_files)
        self.name_builder = FilenameBuilder(self.process_filename)

        self._set_pdf_filename()

    def _set_pdf_filename(self):
        """
        Set ``self.pdf_filename`` to a pdf version of the processing file

        In order of preference: the processing file itself when its
        extension is '.pdf', a localized copy of an already stored 'pdf'
        media file, or a freshly converted pdf from _generate_pdf().
        """
        if self.name_builder.ext == '.pdf':
            self.pdf_filename = self.process_filename
        elif self.entry.media_files.get('pdf'):
            self.pdf_filename = self.workbench.localized_file(
                mgg.public_store, self.entry.media_files['pdf'])
        else:
            self.pdf_filename = self._generate_pdf()

    def _skip_processing(self, keyname, **kwargs):
        """
        Tell whether the file stored under ``keyname`` can be left as is

        Returns False when the entry has no file metadata for ``keyname``.
        For 'thumb' and 'medium' returns True only when the requested
        size ('thumb_size' / 'size' keyword) equals the recorded one.
        Any other keyname with metadata is skipped.
        """
        file_metadata = self.entry.get_file_metadata(keyname)
        if not file_metadata:
            return False

        if keyname == 'thumb':
            return (kwargs.get('thumb_size') ==
                    file_metadata.get('thumb_size'))
        if keyname == 'medium':
            return kwargs.get('size') == file_metadata.get('size')

        return True

    def copy_original(self):
        """
        Store the processing file under its '{basename}{ext}' name
        """
        copy_original(
            self.entry, self.process_filename,
            self.name_builder.fill('{basename}{ext}'))

    def _render_png(self, keyname, label, size):
        """
        Render the pdf to a png with pdftocairo and store it publicly

        ``keyname`` is the media file key ('thumb' or 'medium'),
        ``label`` the filename component ('thumbnail' or 'medium') and
        ``size`` a (max_width, max_height) pair; the smaller of the two
        is handed to pdftocairo's -scale-to option.

        Raises BadMediaFail when pdftocairo leaves no output file.
        """
        # Note: pdftocairo adds '.png', so don't include an ext
        name_template = '{basename}.' + label
        png_base = os.path.join(self.workbench.dir,
                                self.name_builder.fill(name_template))

        executable = where('pdftocairo')
        args = [executable, '-scale-to', str(min(size)),
                '-singlefile', '-png', self.pdf_filename, png_base]

        _log.debug('calling {0}'.format(repr(' '.join(args))))
        Popen(executable=executable, args=args).wait()

        # since pdftocairo added '.png', we need to include it with the
        # filename
        png_path = png_base + '.png'

        # Mirror the check in _generate_pdf(): fail as bad media rather
        # than letting store_public() trip over a missing file.
        if not os.path.exists(png_path):
            _log.debug('pdftocairo failed to render the pdf to png')
            raise BadMediaFail()

        store_public(self.entry, keyname, png_path,
                     self.name_builder.fill(name_template + '.png'))

    def generate_thumb(self, thumb_size=None):
        """
        Create and store the 'thumb' png unless an up to date one exists

        ``thumb_size`` is a (max_width, max_height) pair and defaults to
        the 'media:thumb' values of the global config.
        """
        if not thumb_size:
            thumb_size = (mgg.global_config['media:thumb']['max_width'],
                          mgg.global_config['media:thumb']['max_height'])

        if self._skip_processing('thumb', thumb_size=thumb_size):
            return

        self._render_png('thumb', 'thumbnail', thumb_size)

        self.entry.set_file_metadata('thumb', thumb_size=thumb_size)

    def _generate_pdf(self):
        """
        Store the pdf. If the file is not a pdf, make it a pdf

        Returns a localized filename of the stored 'pdf' media file.
        Raises BadMediaFail when unoconv does not produce the pdf.
        """
        # The conversion result is looked for next to the processing
        # file, with the extension swapped for '.pdf'.
        tmp_pdf = os.path.splitext(self.process_filename)[0] + '.pdf'

        unoconv = where('unoconv')
        args = [unoconv, '-v', '-f', 'pdf', self.process_filename]
        _log.debug('calling %s' % repr(args))
        Popen(executable=unoconv,
              args=args).wait()

        if not os.path.exists(tmp_pdf):
            _log.debug('unoconv failed to convert file to pdf')
            raise BadMediaFail()

        store_public(self.entry, 'pdf', tmp_pdf,
                     self.name_builder.fill('{basename}.pdf'))

        return self.workbench.localized_file(
            mgg.public_store, self.entry.media_files['pdf'])

    def extract_pdf_info(self):
        """
        Initialise the entry's media data from pdf_info() of the pdf
        """
        pdf_info_dict = pdf_info(self.pdf_filename)
        self.entry.media_data_init(**pdf_info_dict)

    def generate_medium(self, size=None):
        """
        Create and store the 'medium' png unless an up to date one exists

        ``size`` is a (max_width, max_height) pair and defaults to the
        'media:medium' values of the global config.
        """
        if not size:
            size = (mgg.global_config['media:medium']['max_width'],
                    mgg.global_config['media:medium']['max_height'])

        if self._skip_processing('medium', size=size):
            return

        self._render_png('medium', 'medium', size)

        self.entry.set_file_metadata('medium', size=size)


class InitialProcessor(CommonPdfProcessor):
    """
    Initial processing step for new pdfs
    """
    name = "initial"
    description = "Initial processing"

    @classmethod
    def media_is_eligible(cls, entry=None, state=None):
        """
        Determine if this media type is eligible for processing

        Uses ``entry.state`` when no ``state`` is given; eligible states
        are "unprocessed" and "failed".
        """
        if not state:
            state = entry.state
        return state in (
            "unprocessed", "failed")

    @classmethod
    def generate_parser(cls):
        """
        Build the argument parser: optional --size and --thumb-size,
        each taking two integers (max_width, max_height)
        """
        parser = argparse.ArgumentParser(
            description=cls.description,
            prog=cls.name)

        parser.add_argument(
            '--size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        parser.add_argument(
            '--thumb-size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        return parser

    @classmethod
    def args_to_request(cls, args):
        """
        Turn parsed args into a request carrying 'size' and 'thumb_size'
        """
        return request_from_args(
            args, ['size', 'thumb_size'])

    def process(self, size=None, thumb_size=None):
        """
        Run all initial steps: setup, pdf info, original, medium, thumb,
        then delete the queue file
        """
        self.common_setup()
        self.extract_pdf_info()
        self.copy_original()
        self.generate_medium(size=size)
        self.generate_thumb(thumb_size=thumb_size)
        self.delete_queue_file()


class Resizer(CommonPdfProcessor):
    """
    Resizing process steps for processed pdfs
    """
    name = 'resize'
    description = 'Resize thumbnail and medium'
    # NOTE(review): not referenced by the visible code; kept so anything
    # reading Resizer.thumb_size elsewhere keeps working.
    thumb_size = 'size'

    @classmethod
    def media_is_eligible(cls, entry=None, state=None):
        """
        Determine if this media type is eligible for processing

        Uses ``entry.state`` when no ``state`` is given; only the
        'processed' state is eligible.
        """
        if not state:
            state = entry.state
        # Equality, not ``in 'processed'``: the latter is a substring
        # test and would also accept e.g. 'process' or 'ed'.
        return state == 'processed'

    @classmethod
    def generate_parser(cls):
        """
        Build the argument parser: optional --size (two integers) and a
        positional 'file' that is either 'medium' or 'thumb'
        """
        parser = argparse.ArgumentParser(
            description=cls.description,
            prog=cls.name)

        parser.add_argument(
            '--size',
            nargs=2,
            metavar=('max_width', 'max_height'),
            type=int)

        parser.add_argument(
            'file',
            choices=['medium', 'thumb'])

        return parser

    @classmethod
    def args_to_request(cls, args):
        """
        Turn parsed args into a request carrying 'size' and 'file'
        """
        return request_from_args(
            args, ['size', 'file'])

    def process(self, file, size=None):
        """
        Regenerate the 'medium' or 'thumb' file at the given size

        Any other ``file`` value only runs the common setup.
        """
        self.common_setup()
        if file == 'medium':
            self.generate_medium(size=size)
        elif file == 'thumb':
            self.generate_thumb(thumb_size=size)


class PdfProcessingManager(ProcessingManager):
    """
    Processing manager registering the pdf processors
    """
    def __init__(self):
        super(PdfProcessingManager, self).__init__()
        self.add_processor(InitialProcessor)
        self.add_processor(Resizer)