pull: Add JSON-LD markup
author W. Trevor King <wking@tremily.us>
Thu, 4 Jan 2018 23:44:17 +0000 (15:44 -0800)
committer W. Trevor King <wking@tremily.us>
Mon, 26 Mar 2018 16:39:37 +0000 (09:39 -0700)
This should help with forward/backward compatibility, because
consumers can use JSON-LD to extract the semantic meaning regardless
of how I lay out the data.  From [1]:

  This information allows developers to re-use each other's data
  without having to agree to how their data will interoperate on a
  site-by-site basis.

There's a framing spec in the works [2] to support "request your own
layout" in an API.

I had to kludge a bit to support FSF IDs in licenses-full.json.  The
released JSON-LD 1.0 has index maps, but [3]:

  Note how the index keys do not appear in the Linked Data below, but
  would continue to exist if the document were compacted or
  expanded...

which isn't very useful for folks who are parsing the file as JSON-LD
and who need access to the FSF IDs.  To work around that, I've used
the FSF IDs as both the 'licenses' keys and as 'id' entries in the
'licenses' values.

We might be able to specify the semantics of the object keys with the
unreleased 1.1 [4] and its ID maps [5], but I've left that off for
now.

There is some background on classifying non-URI identifiers in [6].
Currently I'm not classifying my identifiers.

I'm using HTTPS identifiers for schema.org, because that's the
long-term target [7].

I've tested the output JSON-LD in [8], and the compacted
licenses-full.json looks like:

  {
    "http://tremily.us/fsf/schema/license.jsonld": [
      {
        "@index": "ACDL",
        "https://schema.org/identifier": "ACDL",
        "https://schema.org/keywords": "libre",
        "https://schema.org/name": "Apple's Common Documentation License, Version 1.0",
        "https://schema.org/url": {
          "@list": [
            "https://www.gnu.org/licenses/license-list.html#ACDL",
            "http://fedoraproject.org/wiki/Licensing/Common_Documentation_License"
          ]
        }
      },
      {
        "@index": "AGPLv1.0",
        "https://schema.org/identifier": [
          "AGPLv1.0",
          {
            "@index": "spdx",
            "@value": "AGPL-1.0"
          }
        ],
        "https://schema.org/keywords": "libre",
        "https://schema.org/name": "Affero General Public License version 1",
        "https://schema.org/url": {
          "@list": [
            "https://www.gnu.org/licenses/license-list.html#AGPLv1.0",
            "http://directory.fsf.org/wiki/License:AGPLv1"
          ]
        }
      },
      ...
    ]
  }

[1]: https://www.w3.org/TR/2014/REC-json-ld-20140116/#h3_the-context
[2]: https://json-ld.org/spec/latest/json-ld-framing/
[3]: https://www.w3.org/TR/2014/REC-json-ld-20140116/#data-indexing
[4]: https://json-ld.org/spec/latest/json-ld/#changes-since-1-0-recommendation-of-16-january-2014
[5]: https://json-ld.org/spec/latest/json-ld/#node-identifier-indexing
[6]: http://meta.schema.org/docs/datamodel.html#identifierBg
[7]: http://schema.org/docs/faq.html#19
[8]: https://json-ld.org/playground/

pull.py

diff --git a/pull.py b/pull.py
index 995e2a707725808848b25a82bd10cdf0b0e5455c..0baab432562c94d786048b4702e81dcdbe525b7e 100755 (executable)
--- a/pull.py
+++ b/pull.py
@@ -266,8 +266,9 @@ def extract(root, base_uri=None):
     return licenses
 
 
-def save(licenses, dir=os.curdir):
-    os.makedirs(dir, exist_ok=True)
+def save(licenses, base_uri, dir=os.curdir):
+    schema_dir = os.path.join(dir, 'schema')
+    os.makedirs(schema_dir, exist_ok=True)
     if sys.version_info >= (3, 5):
         paths = glob.glob(os.path.join(dir, '**', '*.json'), recursive=True)
     else:
@@ -277,17 +278,60 @@ def save(licenses, dir=os.curdir):
         )
     for path in paths:
         os.remove(path)
+    license_schema = {
+        '@context': {
+            'schema': 'https://schema.org/',
+            'id': {
+                '@id': 'schema:identifier'
+            },
+            'name': {
+                '@id': 'schema:name',
+            },
+            'uris': {
+                '@container': '@list',
+                '@id': 'schema:url',
+            },
+            'tags': {
+                '@id': 'schema:keywords',
+            },
+            'identifiers': {
+                '@container': '@index',
+                '@id': 'schema:identifier',
+            },
+        },
+    }
+    with open(os.path.join(schema_dir, 'license.jsonld'), 'w') as f:
+        json.dump(obj=license_schema, fp=f, indent=2)
+        f.write('\n')
+    license_schema_uri = urllib.parse.urljoin(
+        base=base_uri, url='schema/license.jsonld')
+    licenses_schema = license_schema.copy()
+    licenses_schema['@context']['licenses'] = {
+        '@container': '@index',
+        '@id': license_schema_uri,
+    }
+    licenses_schema.update(license_schema)
+    with open(os.path.join(schema_dir, 'licenses.jsonld'), 'w') as f:
+        json.dump(obj=licenses_schema, fp=f, indent=2, sort_keys=True)
+        f.write('\n')
+    licenses_schema_uri = urllib.parse.urljoin(
+        base=base_uri, url='schema/licenses.jsonld')
     index = sorted(licenses.keys())
     with open(os.path.join(dir, 'licenses.json'), 'w') as f:
-        json.dump(obj=index, fp=f, indent=2)
+        json.dump(obj=index, fp=f, indent=2, sort_keys=True)
         f.write('\n')
-    full_index = {}
+    full_index = {
+        '@context': licenses_schema_uri,
+        'licenses': {},
+    }
     for id, license in licenses.items():
         license = license.copy()
         if 'tags' in license:
             license['tags'] = sorted(license['tags'])
-        full_index[id] = license.copy()
         license['id'] = id
+        full_index['licenses'][id] = license.copy()
+        license['@context'] = urllib.parse.urljoin(
+            base=base_uri, url='schema/license.jsonld')
         license_path = os.path.join(dir, '{}.json'.format(id))
         with open(license_path, 'w') as f:
             json.dump(obj=license, fp=f, indent=2, sort_keys=True)
@@ -317,4 +361,4 @@ if __name__ == '__main__':
     if unused_identifiers:
         raise ValueError('unused IDENTIFIERS keys: {}'.format(
             ', '.join(sorted(unused_identifiers))))
-    save(licenses=licenses, dir=dir)
+    save(licenses=licenses, base_uri='https://wking.github.io/fsf-api/', dir=dir)