this negation needs parens.
[mediagoblin.git] / mediagoblin / tools / metadata.py
1 # GNU MediaGoblin -- federated, autonomous media hosting
2 # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18 import os
19 import copy
20 import json
21 import re
22 from pkg_resources import resource_filename
23
24 import dateutil.parser
25 from pyld import jsonld
26 from jsonschema import validate, FormatChecker, draft4_format_checker
27 from jsonschema.compat import str_types
28
29 from mediagoblin.tools.pluginapi import hook_handle
30
31
32
33 ########################################################
34 ## Set up the MediaGoblin format checker for json-schema
35 ########################################################
36
# scheme://host-or-dotted-quad[:port][/path], matched case-insensitively.
URL_REGEX = re.compile(
    r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$',
    flags=re.IGNORECASE)


def is_uri(instance):
    """
    jsonschema "uri" format check.

    String instances are matched against URL_REGEX; the result is the
    match object (truthy) on success or None on failure.  Anything that
    is not a string is reported as valid (True) without being inspected.
    """
    if isinstance(instance, str_types):
        return URL_REGEX.match(instance)

    # Not a string: this check has nothing to say about it.
    return True
49
def is_datetime(instance):
    """
    jsonschema "date-time" format check.

    String instances are handed to dateutil.parser.parse and the parsed
    result is returned; any exception raised by the parser propagates to
    the caller.  Anything that is not a string is reported as valid
    (True) without being inspected.
    """
    if isinstance(instance, str_types):
        return dateutil.parser.parse(instance)

    # Not a string: this check has nothing to say about it.
    return True
58
59
class DefaultChecker(FormatChecker):
    """
    MediaGoblin's jsonschema format checker: the draft 4 checkers plus
    our own "uri" and "date-time" checks (registered just below).
    """
    # Work on a deep copy so that the registrations below do not modify
    # draft4_format_checker's own mapping.
    checkers = copy.deepcopy(draft4_format_checker.checkers)


# Each entry pairs the check function with the exception types
# registered alongside it.
DefaultChecker.checkers.update({
    u"uri": (is_uri, ()),
    u"date-time": (is_datetime, (ValueError, TypeError)),
})
DEFAULT_CHECKER = DefaultChecker()
70
# Minimal default schema: only constrains the handful of fields we deem
# important.  Each listed property must be a string in the given format.
DEFAULT_SCHEMA = {
    "$schema": "http://json-schema.org/schema#",
    "type": "object",
    "properties": {
        "license": {"format": "uri", "type": "string"},
        "dcterms:created": {"format": "date-time", "type": "string"},
        "dc:created": {"format": "date-time", "type": "string"},
    },
}
92
93
def load_resource(package, resource_path):
    """
    Load a resource, return it as a string.

    Args:
     - package: package or module name.  Eg "mediagoblin.media_types.audio"
     - resource_path: path to get to this resource, a list of
       directories and finally a filename.  Will be joined with
       os.path.sep.

    Returns:
      The full contents of the resource file.
    """
    filename = resource_filename(package, os.path.sep.join(resource_path))
    # open() instead of the Python-2-only file() builtin, and a context
    # manager so the handle is closed as soon as the contents are read.
    with open(filename) as resource_file:
        return resource_file.read()
106
def load_resource_json(package, resource_path):
    """
    Read a packaged JSON resource and return the decoded value.

    Args:
     - package: package or module name.  Eg "mediagoblin.media_types.audio"
     - resource_path: list of path components (directories, then the
       filename) locating the resource inside the package; passed
       straight through to load_resource.
    """
    raw_text = load_resource(package, resource_path)
    return json.loads(raw_text)
118
119
120 ##################################
121 ## Load the MediaGoblin core files
122 ##################################
123
124
# Context URL -> raw text of the context document shipped with
# MediaGoblin.  Read from disk once, at import time.
BUILTIN_CONTEXTS = {}
BUILTIN_CONTEXTS["http://www.w3.org/2013/json-ld-context/rdfa11"] = (
    load_resource("mediagoblin", ["static", "metadata", "rdfa11.jsonld"]))
128
129
# url -> document record already returned by load_context
_CONTEXT_CACHE = {}


def load_context(url):
    """
    Document loader that knows about the contexts MediaGoblin ships.

    Lookup order: the in-memory cache, then BUILTIN_CONTEXTS, then the
    "context_url_data" hook, and finally jsonld.load_document.  Whatever
    is found is cached under `url` before being returned.

    Returns the cached/loaded document record; for locally resolved
    contexts that is a dict with 'contextUrl', 'documentUrl' and
    'document' keys.
    """
    try:
        return _CONTEXT_CACHE[url]
    except KeyError:
        pass

    # One of our basic contexts?  Otherwise try the hook.
    payload = BUILTIN_CONTEXTS.get(url)
    if payload is None:
        payload = hook_handle(("context_url_data", url))

    if payload is None:
        # Nothing local knows this url; defer to pyld's own loader.
        remote_doc = jsonld.load_document(url)
    else:
        # Found locally: wrap it in the same record shape a loader returns.
        remote_doc = {
            'contextUrl': None,
            'documentUrl': url,
            'document': payload,
        }

    _CONTEXT_CACHE[url] = remote_doc
    return remote_doc
160
161
DEFAULT_CONTEXT = "http://www.w3.org/2013/json-ld-context/rdfa11"


def compact_json(metadata, context=DEFAULT_CONTEXT):
    """
    Compact `metadata` against `context` and return the result.

    Note: "free floating" nodes are removed (eg a key just named
      "bazzzzzz" which isn't specified in the context)... something like
      bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
    """
    compaction_options = {
        # Resolve contexts through our own cache-aware loader
        "documentLoader": load_context,
        # This allows for things like "license" and etc to be preserved
        "expandContext": context,
        "keepFreeFloatingNodes": False,
    }
    return jsonld.compact(metadata, context, options=compaction_options)
181
182
def compact_and_validate(metadata, context=DEFAULT_CONTEXT,
                         schema=DEFAULT_SCHEMA):
    """
    Compact `metadata` with `context`, validate against `schema`, and
    return the compacted form.

    Raises jsonschema.exceptions.ValidationError if validation fails.

    Note: "free floating" nodes are removed (eg a key just named
      "bazzzzzz" which isn't specified in the context)... something like
      bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.

    This is a convenience wrapper; callers are free to compact and
    validate separately instead.
    """
    result = compact_json(metadata, context)

    # NOTE(review): the schema is checked against the *original* metadata,
    # not the compacted `result` that is returned -- confirm this is the
    # intended behaviour (input keyed by full IRIs would not match the
    # compact property names used in DEFAULT_SCHEMA).
    validate(metadata, schema, format_checker=DEFAULT_CHECKER)

    return result
201
202
def expand_json(metadata, context=DEFAULT_CONTEXT):
    """
    Expand `metadata` with jsonld.expand, always going through our own
    documentLoader (load_context).

    `context` is supplied as the "expandContext" option; it defaults to
    DEFAULT_CONTEXT.  Pass None to expand without any extra context.

    # @@: Should None be the default instead?
    """
    expand_options = dict(documentLoader=load_context)
    if context is not None:
        expand_options.update(expandContext=context)

    return jsonld.expand(metadata, options=expand_options)
218
219
def rdfa_to_readable(rdfa_predicate):
    """
    Turn an RDFa predicate into a label that is easier on human eyes.

    Args:
     - rdfa_predicate: a string such as u"dc:title" or u"license".

    Returns:
      The part between the first and second ":" with its first letter
      capitalized (u"dc:title" -> u"Title").  A predicate without any
      ":" has no prefix to drop, so the whole term is capitalized
      instead (u"license" -> u"License").
    """
    components = rdfa_predicate.split(u":")
    # Unprefixed terms used to raise IndexError here; fall back to the
    # term itself when there is nothing after a colon to pick.
    local_name = components[1] if len(components) > 1 else components[0]
    return local_name.capitalize()