finally fix url validator
[mediagoblin.git] / mediagoblin / tools / metadata.py
1 # GNU MediaGoblin -- federated, autonomous media hosting
2 # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18 from io import open
19 import os
20 import copy
21 import json
22 import re
23 from pkg_resources import resource_filename
24
25 import dateutil.parser
26 from pyld import jsonld
27 from jsonschema import validate, FormatChecker, draft4_format_checker
28 from jsonschema.compat import str_types
29
30 from mediagoblin.tools.pluginapi import hook_handle
31
32
33
34 ########################################################
35 ## Set up the MediaGoblin format checker for json-schema
36 ########################################################
37
# Pattern used by the "uri" format check.  It is written as adjacent raw
# string pieces so each part can be commented; the pieces concatenate to a
# single pattern.
URL_REGEX = re.compile(
    r'^[a-z]+://'                                # scheme, letters only
    r'([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})'      # host name or dotted quad
    r'(:[0-9]+)?'                                # optional numeric port
    r'(\/.*)?$',                                 # optional path and the rest
    flags=re.IGNORECASE)
41
def is_uri(instance):
    """
    jsonschema "uri" format check.

    Values that are not strings are accepted unconditionally (True is
    returned).  For strings, the result of matching against URL_REGEX is
    returned: a match object when the string looks like a URL, None
    otherwise.
    """
    if isinstance(instance, str_types):
        return URL_REGEX.match(instance)

    # Not a string: nothing for this format check to reject.
    return True
50
def is_datetime(instance):
    """
    Is a date or datetime readable string.

    Values that are not strings are accepted unconditionally (True is
    returned).  Strings are handed to dateutil.parser.parse and the parsed
    result is returned.

    Raises:
     - ValueError: the parser reported an OverflowError, which is
       re-raised here as ValueError.  Any ValueError or TypeError raised
       by the parser itself propagates unchanged.
    """
    if not isinstance(instance, str_types):
        return True

    try:
        return dateutil.parser.parse(instance)
    except OverflowError as err:
        # This checker is registered with (ValueError, TypeError) as the
        # exceptions that mean "invalid format".  An OverflowError from the
        # parser would otherwise escape validation as an unexpected crash,
        # so report it as a ValueError instead.
        raise ValueError(str(err))
59
60
class DefaultChecker(FormatChecker):
    """
    MediaGoblin's format checker for json-schema validation.

    Starts out with the same checkers as jsonschema's draft4_format_checker;
    MediaGoblin-specific checkers are registered on top of those.
    """
    # Deep copy, so that entries added to or replaced in this table do not
    # modify draft4_format_checker.checkers itself.
    checkers = copy.deepcopy(draft4_format_checker.checkers)
66
67
# Register MediaGoblin's own checks.  Each entry is a pair of
# (check function, exceptions that count as a failed check).
DefaultChecker.checkers.update({
    u"uri": (is_uri, ()),
    u"date-time": (is_datetime, (ValueError, TypeError)),
})

# Shared checker instance passed to jsonschema's validate().
DEFAULT_CHECKER = DefaultChecker()
71
# Minimal default schema: it only constrains the few fields we deem
# important and leaves every other property unchecked.
DEFAULT_SCHEMA = {
    "$schema": "http://json-schema.org/schema#",
    "type": "object",
    "properties": {
        # The license has to be a string in "uri" format.
        "license": {"format": "uri", "type": "string"},
        # Creation date, accepted under either prefix spelling.
        "dcterms:created": {"format": "date-time", "type": "string"},
        "dc:created": {"format": "date-time", "type": "string"},
    },
}
93
94
def load_resource(package, resource_path):
    """
    Load a resource, return it as a string.

    The file is read as UTF-8 text and is closed before returning.

    Args:
     - package: package or module name.  Eg "mediagoblin.media_types.audio"
     - resource_path: path to get to this resource, a list of
       directories and finally a filename.  Will be joined with
       os.path.sep.
    """
    filename = resource_filename(package, os.path.sep.join(resource_path))
    # Use a context manager so the file handle is released right away
    # instead of whenever the garbage collector gets around to it.
    with open(filename, encoding="utf-8") as resource_file:
        return resource_file.read()
107
def load_resource_json(package, resource_path):
    """
    Read a JSON resource file and return the decoded data.

    Args:
     - package: package or module name.  Eg "mediagoblin.media_types.audio"
     - resource_path: list of directory names ending with the filename;
       passed straight through to load_resource.
    """
    raw_text = load_resource(package, resource_path)
    return json.loads(raw_text)
119
120
121 ##################################
122 ## Load the MediaGoblin core files
123 ##################################
124
125
# Context documents bundled with MediaGoblin, keyed by the URL they are
# served for.  Values are the raw file contents as returned by load_resource.
BUILTIN_CONTEXTS = {}
BUILTIN_CONTEXTS["http://www.w3.org/2013/json-ld-context/rdfa11"] = (
    load_resource("mediagoblin", ["static", "metadata", "rdfa11.jsonld"]))
129
130
# Documents already resolved by load_context, keyed by URL.
_CONTEXT_CACHE = {}


def load_context(url):
    """
    A self-aware document loader.  Contexts MediaGoblin already knows
    about are served locally; anything else goes through
    jsonld.load_document.

    Lookup order:
     1. _CONTEXT_CACHE (whatever an earlier call stored for this url)
     2. BUILTIN_CONTEXTS
     3. the "context_url_data" hook, via hook_handle
     4. jsonld.load_document(url)

    Documents found in step 2 or 3 are wrapped in a dictionary with the
    keys 'contextUrl', 'documentUrl' and 'document'.  The result is stored
    in _CONTEXT_CACHE before it is returned.
    """
    try:
        return _CONTEXT_CACHE[url]
    except KeyError:
        pass

    # One of the contexts we ship ourselves?
    payload = BUILTIN_CONTEXTS.get(url)

    # If not, give the hook a chance to supply it.
    if payload is None:
        payload = hook_handle(("context_url_data", url))

    if payload is None:
        # Nothing local knows this url; let jsonld load it.
        remote_doc = jsonld.load_document(url)
    else:
        remote_doc = {
            'contextUrl': None,
            'documentUrl': url,
            'document': payload,
        }

    _CONTEXT_CACHE[url] = remote_doc
    return remote_doc
161
162
# Context used whenever a caller does not supply one.
DEFAULT_CONTEXT = "http://www.w3.org/2013/json-ld-context/rdfa11"


def compact_json(metadata, context=DEFAULT_CONTEXT):
    """
    Run jsonld.compact on metadata using the given context and return
    the compacted result.

    Contexts are resolved through load_context, and the same context is
    also supplied as the expand context.

    Note: "free floating" nodes are removed (eg a key just named
    "bazzzzzz" which isn't specified in the context)... something like
    bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
    """
    compaction_options = {
        "documentLoader": load_context,
        # Expanding with the same context is what lets plain terms such
        # as "license" be preserved.
        "expandContext": context,
        "keepFreeFloatingNodes": False,
    }
    return jsonld.compact(metadata, context, options=compaction_options)
182
183
def compact_and_validate(metadata, context=DEFAULT_CONTEXT,
                         schema=DEFAULT_SCHEMA):
    """
    Compact metadata with the supplied context, run a schema check, and
    return the compacted result.

    Raises jsonschema.exceptions.ValidationError if the schema check
    fails.

    Compaction happens first (see compact_json, including its handling of
    "free floating" nodes); validation uses DEFAULT_CHECKER as the format
    checker.

    You may wish to do this validation yourself... this is just for
    convenience.
    """
    result = compact_json(metadata, context)

    # NOTE(review): the schema check runs against the `metadata` argument
    # as passed in, not against the compacted `result` -- confirm that
    # this is the intended target.
    validate(metadata, schema, format_checker=DEFAULT_CHECKER)

    return result
202
203
def expand_json(metadata, context=DEFAULT_CONTEXT):
    """
    Run jsonld.expand on metadata, always using our own document loader
    (load_context).

    Unless context is None it is passed along as the "expandContext"
    option; pass None to expand without one.

    # @@: Should the default context be None instead of DEFAULT_CONTEXT?
    """
    loader_options = {"documentLoader": load_context}
    if context is not None:
        loader_options.update(expandContext=context)

    return jsonld.expand(metadata, options=loader_options)
219
220
def rdfa_to_readable(rdfa_predicate):
    """
    Turn an RDFa predicate into a label for display.

    The segment after the first colon is taken and capitalized, so
    "dc:title" becomes "Title".  A predicate without any colon (for
    example "license") is capitalized as a whole instead of raising
    IndexError.

    Args:
     - rdfa_predicate: the predicate string, Eg "dc:title"
    """
    pieces = rdfa_predicate.split(u":")
    # pieces[1] is the part following the first colon; with no colon
    # present there is only pieces[0], the whole predicate.
    local_name = pieces[1] if len(pieces) > 1 else pieces[0]
    return local_name.capitalize()