mediagoblin/tools/metadata.py

   1 # GNU MediaGoblin -- federated, autonomous media hosting
   2 # Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU Affero General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU Affero General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 from io import open
  19 import os
  20 import copy
  21 import json
  22 import re
  23 from pkg_resources import resource_filename
  24
  25 import dateutil.parser
  26 from pyld import jsonld
  27 from jsonschema import validate, FormatChecker, draft4_format_checker
  28 from jsonschema.compat import str_types
  29
  30 from mediagoblin.tools.pluginapi import hook_handle
  31
  32
  33
  34 ########################################################
  35 ## Set up the MediaGoblin format checker for json-schema
  36 ########################################################
  37
  38 URL_REGEX = re.compile(
  39     r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$',
  40     re.IGNORECASE)
  41
  42 def is_uri(instance):
  43     """
  44     jsonschema uri validator
  45     """
  46     if not isinstance(instance, str_types):
  47         return True
  48
  49     return URL_REGEX.match(instance)
  50
  51 def is_datetime(instance):
  52     """
  53     Is a date or datetime readable string.
  54     """
  55     if not isinstance(instance, str_types):
  56         return True
  57
  58     return dateutil.parser.parse(instance)
  59
  60
  61 class DefaultChecker(FormatChecker):
  62     """
  63     Default MediaGoblin format checker... extended to include a few extra things
  64     """
  65     checkers = copy.deepcopy(draft4_format_checker.checkers)
  66
  67
  68 DefaultChecker.checkers[u"uri"] = (is_uri, ())
  69 DefaultChecker.checkers[u"date-time"] = (is_datetime, (ValueError, TypeError))
  70 DEFAULT_CHECKER = DefaultChecker()
  71
  72 # Crappy default schema, checks for things we deem important
  73
  74 DEFAULT_SCHEMA = {
  75     "$schema": "http://json-schema.org/schema#",
  76
  77     "type": "object",
  78     "properties": {
  79         "license": {
  80             "format": "uri",
  81             "type": "string",
  82         },
  83         "dcterms:created": {
  84             "format": "date-time",
  85             "type": "string",
  86         },
  87         "dc:created": {
  88             "format": "date-time",
  89             "type": "string",
  90         }
  91     },
  92 }
  93
  94
  95 def load_resource(package, resource_path):
  96     """
  97     Load a resource, return it as a string.
  98
  99     Args:
 100     - package: package or module name.  Eg "mediagoblin.media_types.audio"
 101     - resource_path: path to get to this resource, a list of
 102       directories and finally a filename.  Will be joined with
 103       os.path.sep.
 104     """
 105     filename = resource_filename(package, os.path.sep.join(resource_path))
 106     return open(filename, encoding="utf-8").read()
 107
 108 def load_resource_json(package, resource_path):
 109     """
 110     Load a resource json file, return a dictionary.
 111
 112     Args:
 113     - package: package or module name.  Eg "mediagoblin.media_types.audio"
 114     - resource_path: path to get to this resource, a list of
 115       directories and finally a filename.  Will be joined with
 116       os.path.sep.
 117     """
 118     return json.loads(load_resource(package, resource_path))
 119
 120
 121 ##################################
 122 ## Load the MediaGoblin core files
 123 ##################################
 124
 125
 126 BUILTIN_CONTEXTS = {
 127     "http://www.w3.org/2013/json-ld-context/rdfa11": load_resource(
 128         "mediagoblin", ["static", "metadata", "rdfa11.jsonld"])}
 129
 130
 131 _CONTEXT_CACHE = {}
 132
 133 def load_context(url):
 134     """
 135     A self-aware document loader.  For those contexts MediaGoblin
 136     stores internally, load them from disk.
 137     """
 138     if url in _CONTEXT_CACHE:
 139         return _CONTEXT_CACHE[url]
 140
 141     # See if it's one of our basic ones
 142     document = BUILTIN_CONTEXTS.get(url, None)
 143
 144     # No?  See if we have an internal schema for this
 145     if document is None:
 146         document = hook_handle(("context_url_data", url))
 147
 148     # Okay, if we've gotten a document by now... let's package it up
 149     if document is not None:
 150         document = {'contextUrl': None,
 151                     'documentUrl': url,
 152                     'document': document}
 153
 154     # Otherwise, use jsonld.load_document
 155     else:
 156         document = jsonld.load_document(url)
 157
 158     # cache
 159     _CONTEXT_CACHE[url] = document
 160     return document
 161
 162
 163 DEFAULT_CONTEXT = "http://www.w3.org/2013/json-ld-context/rdfa11"
 164
 165 def compact_json(metadata, context=DEFAULT_CONTEXT):
 166     """
 167     Compact json with supplied context.
 168
 169     Note: Free floating" nodes are removed (eg a key just named
 170     "bazzzzzz" which isn't specified in the context... something like
 171     bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
 172     """
 173     compacted = jsonld.compact(
 174         metadata, context,
 175         options={
 176             "documentLoader": load_context,
 177             # This allows for things like "license" and etc to be preserved
 178             "expandContext": context,
 179             "keepFreeFloatingNodes": False})
 180
 181     return compacted
 182
 183
 184 def compact_and_validate(metadata, context=DEFAULT_CONTEXT,
 185                          schema=DEFAULT_SCHEMA):
 186     """
 187     compact json with supplied context, check against schema for errors
 188
 189     raises an exception (jsonschema.exceptions.ValidationError) if
 190     there's an error.
 191
 192     Note: Free floating" nodes are removed (eg a key just named
 193     "bazzzzzz" which isn't specified in the context... something like
 194     bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
 195
 196     You may wish to do this validation yourself... this is just for convenience.
 197     """
 198     compacted = compact_json(metadata, context)
 199     validate(metadata, schema, format_checker=DEFAULT_CHECKER)
 200
 201     return compacted
 202
 203
 204 def expand_json(metadata, context=DEFAULT_CONTEXT):
 205     """
 206     Expand json, but be sure to use our documentLoader.
 207
 208     By default this expands with DEFAULT_CONTEXT, but if you do not need this,
 209     you can safely set this to None.
 210
 211     # @@: Is the above a good idea?  Maybe it should be set to None by
 212     #   default.
 213     """
 214     options = {
 215         "documentLoader": load_context}
 216     if context is not None:
 217         options["expandContext"] = context
 218     return jsonld.expand(metadata, options=options)
 219
 220
 221 def rdfa_to_readable(rdfa_predicate):
 222     readable = rdfa_predicate.split(u":")[1].capitalize()
 223     return readable