From 6fab7734d6b6817b310865409f260ab87907eaa0 Mon Sep 17 00:00:00 2001 From: Christopher Allan Webber Date: Wed, 7 May 2014 18:50:48 -0500 Subject: [PATCH] Updating batchaddmedia to use new metadata tools --- mediagoblin/gmg_commands/batchaddmedia.py | 99 +++-------------------- mediagoblin/tools/metadata.py | 4 +- 2 files changed, 14 insertions(+), 89 deletions(-) diff --git a/mediagoblin/gmg_commands/batchaddmedia.py b/mediagoblin/gmg_commands/batchaddmedia.py index 07c0b3fc..e540e88c 100644 --- a/mediagoblin/gmg_commands/batchaddmedia.py +++ b/mediagoblin/gmg_commands/batchaddmedia.py @@ -15,7 +15,7 @@ # along with this program. If not, see . import os -import copy, tempfile, tarfile, zipfile, subprocess, re, requests +import tempfile, tarfile, zipfile, subprocess, requests from csv import reader as csv_reader from urlparse import urlparse from pyld import jsonld @@ -24,11 +24,9 @@ from mediagoblin.gmg_commands import util as commands_util from mediagoblin.submit.lib import ( submit_media, get_upload_file_limits, FileUploadLimit, UserUploadLimit, UserPastUploadLimit) -from mediagoblin.tools.translate import lazy_pass_to_ugettext as _ +from mediagoblin.tools.metadata import compact_and_validate -from jsonschema import validate, FormatChecker, draft4_format_checker from jsonschema.exceptions import ValidationError -from jsonschema.compat import str_types def parser_setup(subparser): @@ -126,25 +124,24 @@ zip files and directories" contents = all_metadata.read() media_metadata = parse_csv_file(contents) - metadata_context = { 'dcterms':'http://purl.org/dc/terms/', - 'xsd': 'http://www.w3.org/2001/XMLSchema#'} - for media_id in media_locations.keys(): files_attempted += 1 - file_metadata = media_metadata[media_id] - sanitized_metadata = check_metadata_format(file_metadata) - if sanitized_metadata == {}: continue + file_metadata = media_metadata[media_id] + try: + json_ld_metadata = compact_and_validate(file_metadata) + except ValidationError, exc: + print "Error 
with '%s' value '%s': %s" % ( + media_id, exc.path[0], exc.message) + continue - json_ld_metadata = jsonld.compact(file_metadata, metadata_context) original_location = media_locations[media_id]['media:original'] url = urlparse(original_location) - title = sanitized_metadata.get('dcterms:title') - description = sanitized_metadata.get('dcterms:description') + title = json_ld_metadata.get('dcterms:title') + description = json_ld_metadata.get('dcterms:description') - # TODO: this isn't the same thing - license = sanitized_metadata.get('dcterms:rights') + license = json_ld_metadata.get('license') filename = url.path.split()[-1] if url.scheme == 'http': @@ -214,75 +211,3 @@ def parse_csv_file(file_contents): def teardown(temp_files): for temp_file in temp_files: subprocess.call(['rm','-r',temp_file]) - - -## Set up the MediaGoblin checker -# - -URL_REGEX = re.compile( - r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$', - re.IGNORECASE) - -def is_uri(instance): - if not isinstance(instance, str_types): - return True - - return URL_REGEX.match(instance) - - -class DefaultChecker(FormatChecker): - checkers = copy.deepcopy(draft4_format_checker.checkers) - -DefaultChecker.checkers[u"uri"] = (is_uri, ()) - -DEFAULT_CHECKER = DefaultChecker() - -def check_metadata_format(metadata_dict): - schema = { - "$schema": "http://json-schema.org/schema#", - - "type": "object", - "properties": { - "dcterms:rights": { - "format": "uri", - "type": "string", - }, - "dcterms:created": { - - } - }, - # "required": ["dcterms:title", "media:id"], - } - - try: - validate(metadata_dict, schema, - format_checker=DEFAULT_CHECKER) - output_dict = metadata_dict - # "media:id" is only for internal use, so we delete it for the output - del output_dict['media:id'] - - except ValidationError, exc: - title = (metadata_dict.get('dcterms:title') or - metadata_dict.get('media:id') or _(u'UNKNOWN FILE')) - - if exc.validator == "additionalProperties": - message = _(u'Invalid metadata 
provided for file "{title}". This \ -script only accepts the Dublin Core metadata terms.'.format(title=title)) - - elif exc.validator == "required": - message = _( -u'All necessary metadata was not provided for file "{title}", you must include \ -a "dcterms:title" column for each media file'.format(title=title)) - - else: - message = _(u'Could not find appropriate metadata for file \ -"{title}".'.format(title=title)) - - print _(u"""WARN: {message} \nSkipping File...\n""".format( - message=message)) - - output_dict = {} - except: - raise - - return output_dict diff --git a/mediagoblin/tools/metadata.py b/mediagoblin/tools/metadata.py index 428e425c..c49bcaaf 100644 --- a/mediagoblin/tools/metadata.py +++ b/mediagoblin/tools/metadata.py @@ -78,7 +78,7 @@ DEFAULT_SCHEMA = { "type": "object", "properties": { - "dcterms:rights": { + "license": { "format": "uri", "type": "string", }, @@ -96,7 +96,7 @@ def compact_and_validate(metadata, context=MEDIAGOBLIN_CONTEXT, compact json with supplied context, check against schema for errors raises an exception (jsonschema.exceptions.ValidationError) if - there's an error. + there's an error. You may wish to do this validation yourself... this is just for convenience. """ -- 2.25.1