mediagoblin/tools/metadata.py

   1 # GNU MediaGoblin -- federated, autonomous media hosting
   2 # Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU Affero General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU Affero General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 import os
  19 import copy
  20 import json
  21 import re
  22 from pkg_resources import resource_filename
  23
  24 import dateutil.parser
  25 from pyld import jsonld
  26 from jsonschema import validate, FormatChecker, draft4_format_checker
  27 from jsonschema.compat import str_types
  28
  29 from mediagoblin.tools.pluginapi import hook_handle
  30
  31
  32
  33 ########################################################
  34 ## Set up the MediaGoblin format checker for json-schema
  35 ########################################################
  36
  37 URL_REGEX = re.compile(
  38     r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$',
  39     re.IGNORECASE)
  40
  41 def is_uri(instance):
  42     """
  43     jsonschema uri validator
  44     """
  45     if not isinstance(instance, str_types):
  46         return True
  47
  48     return URL_REGEX.match(instance)
  49
  50 def is_datetime(instance):
  51     """
  52     Is a date or datetime readable string.
  53     """
  54     if not isinstance(instance, str_types):
  55         return True
  56
  57     return dateutil.parser.parse(instance)
  58
  59
  60 class DefaultChecker(FormatChecker):
  61     """
  62     Default MediaGoblin format checker... extended to include a few extra things
  63     """
  64     checkers = copy.deepcopy(draft4_format_checker.checkers)
  65
  66
  67 DefaultChecker.checkers[u"uri"] = (is_uri, ())
  68 DefaultChecker.checkers[u"date-time"] = (is_datetime, (ValueError, TypeError))
  69 DEFAULT_CHECKER = DefaultChecker()
  70
  71 # Crappy default schema, checks for things we deem important
  72
  73 DEFAULT_SCHEMA = {
  74     "$schema": "http://json-schema.org/schema#",
  75
  76     "type": "object",
  77     "properties": {
  78         "license": {
  79             "format": "uri",
  80             "type": "string",
  81         },
  82         "dcterms:created": {
  83             "format": "date-time",
  84             "type": "string",
  85         },
  86         "dc:created": {
  87             "format": "date-time",
  88             "type": "string",
  89         }
  90     },
  91 }
  92
  93
  94 def load_resource(package, resource_path):
  95     """
  96     Load a resource, return it as a string.
  97
  98     Args:
  99     - package: package or module name.  Eg "mediagoblin.media_types.audio"
 100     - resource_path: path to get to this resource, a list of
 101       directories and finally a filename.  Will be joined with
 102       os.path.sep.
 103     """
 104     filename = resource_filename(package, os.path.sep.join(resource_path))
 105     return file(filename).read()
 106
 107 def load_resource_json(package, resource_path):
 108     """
 109     Load a resource json file, return a dictionary.
 110
 111     Args:
 112     - package: package or module name.  Eg "mediagoblin.media_types.audio"
 113     - resource_path: path to get to this resource, a list of
 114       directories and finally a filename.  Will be joined with
 115       os.path.sep.
 116     """
 117     return json.loads(load_resource(package, resource_path))
 118
 119
 120 ##################################
 121 ## Load the MediaGoblin core files
 122 ##################################
 123
 124
 125 BUILTIN_CONTEXTS = {
 126     "http://www.w3.org/2013/json-ld-context/rdfa11": load_resource(
 127         "mediagoblin", ["static", "metadata", "rdfa11.jsonld"])}
 128
 129
 130 _CONTEXT_CACHE = {}
 131
 132 def load_context(url):
 133     """
 134     A self-aware document loader.  For those contexts MediaGoblin
 135     stores internally, load them from disk.
 136     """
 137     if url in _CONTEXT_CACHE:
 138         return _CONTEXT_CACHE[url]
 139
 140     # See if it's one of our basic ones
 141     document = BUILTIN_CONTEXTS.get(url, None)
 142
 143     # No?  See if we have an internal schema for this
 144     if document is None:
 145         document = hook_handle(("context_url_data", url))
 146
 147     # Okay, if we've gotten a document by now... let's package it up
 148     if document is not None:
 149         document = {'contextUrl': None,
 150                     'documentUrl': url,
 151                     'document': document}
 152
 153     # Otherwise, use jsonld.load_document
 154     else:
 155         document = jsonld.load_document(url)
 156
 157     # cache
 158     _CONTEXT_CACHE[url] = document
 159     return document
 160
 161
 162 DEFAULT_CONTEXT = "http://www.w3.org/2013/json-ld-context/rdfa11"
 163
 164 def compact_json(metadata, context=DEFAULT_CONTEXT):
 165     """
 166     Compact json with supplied context.
 167
 168     Note: Free floating" nodes are removed (eg a key just named
 169     "bazzzzzz" which isn't specified in the context... something like
 170     bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
 171     """
 172     compacted = jsonld.compact(
 173         metadata, context,
 174         options={
 175             "documentLoader": load_context,
 176             # This allows for things like "license" and etc to be preserved
 177             "expandContext": context,
 178             "keepFreeFloatingNodes": False})
 179
 180     return compacted
 181
 182
 183 def compact_and_validate(metadata, context=DEFAULT_CONTEXT,
 184                          schema=DEFAULT_SCHEMA):
 185     """
 186     compact json with supplied context, check against schema for errors
 187
 188     raises an exception (jsonschema.exceptions.ValidationError) if
 189     there's an error.
 190
 191     Note: Free floating" nodes are removed (eg a key just named
 192     "bazzzzzz" which isn't specified in the context... something like
 193     bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
 194
 195     You may wish to do this validation yourself... this is just for convenience.
 196     """
 197     compacted = compact_json(metadata, context)
 198     validate(metadata, schema, format_checker=DEFAULT_CHECKER)
 199
 200     return compacted
 201
 202
 203 def expand_json(metadata, context=DEFAULT_CONTEXT):
 204     """
 205     Expand json, but be sure to use our documentLoader.
 206
 207     By default this expands with DEFAULT_CONTEXT, but if you do not need this,
 208     you can safely set this to None.
 209
 210     # @@: Is the above a good idea?  Maybe it should be set to None by
 211     #   default.
 212     """
 213     options = {
 214         "documentLoader": load_context}
 215     if context is not None:
 216         options["expandContext"] = context
 217     return jsonld.expand(metadata, options=options)
 218
 219
 220 def rdfa_to_readable(rdfa_predicate):
 221     readable = rdfa_predicate.split(u":")[1].capitalize()
 222     return readable