From e21a0d6df95d4b7cbb8f8cf23fd757a8e63c5515 Mon Sep 17 00:00:00 2001
From: Stephan Heunis
Date: Sat, 18 Nov 2023 22:54:02 +0100
Subject: [PATCH] adds a datalad-catalog-contained extractor for datalad
 datasets

This ports most functionality from datalad_metalad.extractors.core into a
script that also adds translation into the catalog schema. The script
receives a path to a datalad dataset as a parameter and outputs a metadata
record that can immediately be added to a catalog. In this way, the
dependence on metalad is removed and the explicit Translator functionality
of the catalog (which also depends on jq bindings) does not have to be used.

The reason for doing this is to have a self-contained script that could in
future just be ripped out and replaced with whatever new functionality
supersedes this.
---
 datalad_catalog/extractors/catalog_core.py | 305 +++++++++++++++++++++
 1 file changed, 305 insertions(+)
 create mode 100644 datalad_catalog/extractors/catalog_core.py

diff --git a/datalad_catalog/extractors/catalog_core.py b/datalad_catalog/extractors/catalog_core.py
new file mode 100644
index 00000000..c84d7f1f
--- /dev/null
+++ b/datalad_catalog/extractors/catalog_core.py
@@ -0,0 +1,305 @@
+# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
+# ex: set sts=4 ts=4 sw=4 noet:
+# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
+#
+# See COPYING file distributed along with the datalad package for the
+# copyright and license terms.
+#
+# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
+"""Extractor for DataLad dataset metadata - ported from datalad-metalad"""
+from argparse import ArgumentParser
+import hashlib
+import json
+from pathlib import Path
+import os.path as op
+import time
+
+import datalad.support.network as dsn
+from datalad_next.constraints import EnsureBool
+from datalad_next.constraints.dataset import EnsureDataset
+
+
+def get_dataset_metadata(ds, refcommit, status):
+    meta = []
+    # Get dataset commit info
+    commitinfo = get_commit_info(ds, refcommit)
+    # Get contributors from commits
+    contributor_ids = []
+    for contributor in commitinfo.pop('contributors', []):
+        contributor_id = get_agent_id(*contributor[:2])
+        meta.append({
+            '@id': contributor_id,
+            # we cannot distinguish real people from machine-committers
+            '@type': 'agent',
+            'name': contributor[0],
+            'email': contributor[1],
+        })
+        contributor_ids.append(contributor_id)
+    # Set up dataset metadata dict
+    dsmeta = {
+        # the most unique ID for this metadata record is the refcommit SHA
+        '@id': refcommit,
+        # the dataset UUID is the main identifier
+        'identifier': ds.id,
+        '@type': 'Dataset',
+    }
+    dsmeta.update(commitinfo)
+    # Add contributors
+    if contributor_ids:
+        c = [{'@id': i} for i in contributor_ids]
+        dsmeta['hasContributor'] = c[0] if len(c) == 1 else c
+    # Add subdatasets
+    parts = []
+    for subds in [s for s in status if s['type'] == 'dataset']:
+        subdsinfo = {
+            # reference by subdataset commit
+            '@id': 'datalad:{}'.format(subds['gitshasum']),
+            '@type': 'Dataset',
+            'name': Path(subds['path']).relative_to(ds.pathobj).as_posix(),
+        }
+        subdsid = ds.subdatasets(
+            contains=subds['path'],
+            return_type='item-or-list',
+            result_renderer="disabled").get('gitmodule_datalad-id', None)
+        if subdsid:
+            subdsinfo['identifier'] = 'datalad:{}'.format(subdsid)
+        parts.append(subdsinfo)
+    if parts:
+        dsmeta['hasPart'] = parts
+    # Add distributions
+    if ds.config.obtain(
+            'datalad.metadata.datalad-core.report-remotes',
+            True, valtype=EnsureBool()):
+        remote_names = ds.repo.get_remotes()
+        distributions = []
+        known_uuids = {}
+        # start with configured Git remotes
+        for r in remote_names:
+            info = {
+                'name': r,
+                # not very informative
+                #'description': 'DataLad dataset sibling',
+            }
+            url = ds.config.get('remote.{}.url'.format(r), None)
+            # best effort to recode whatever is configured into a URL
+            if url is not None:
+                url = ri2url(dsn.RI(url))
+            if url:
+                info['url'] = url
+            # do we have information on the annex ID?
+            annex_uuid = ds.config.get(
+                'remote.{}.annex-uuid'.format(r), None)
+            if annex_uuid is not None:
+                info['@id'] = 'datalad:{}'.format(annex_uuid)
+                known_uuids[annex_uuid] = info
+            if 'url' in info or '@id' in info:
+                # only record if we have any identifying information,
+                # otherwise it is pointless cruft
+                distributions.append(info)
+        # now look for annex info
+        if hasattr(ds.repo, 'repo_info'):
+            info = ds.repo.repo_info(fast=True)
+            for cat in ('trusted repositories',
+                        'semitrusted repositories',
+                        'untrusted repositories'):
+                for r in info[cat]:
+                    if r['here'] or r['uuid'] in (
+                            '00000000-0000-0000-0000-000000000001',
+                            '00000000-0000-0000-0000-000000000002'):
+                        # ignore local and universally available
+                        # remotes
+                        continue
+                    # avoid duplicates, but record all sources, even
+                    # if no URLs are around
+                    if r['uuid'] not in known_uuids:
+                        distributions.append({'@id': r['uuid']})
+        if distributions:
+            dsmeta['distribution'] = sorted(
+                distributions,
+                key=lambda x: x.get('@id', x.get('url', None))
+            )
+    meta.append(dsmeta)
+    return meta
+
+
+def get_commit_info(ds, refcommit):
+    """Get info about all commits, up to (and incl. the refcommit)"""
+    # - get all the commit info with git log --pretty='%aN%x00%aI%x00%H'
+    #   - use all first-level paths other than .datalad and .git for the query
+    # - from this we can determine all modification timestamps, described refcommit
+    # - do a subsequent git log query for the determined refcommit to determine
+    #   a version by counting all commits since inception up to the refcommit
+    #   - we cannot use the first query, because it will be constrained by the
+    #     present paths that may not have existed previously at all
+
+    # grab the history until the refcommit
+    commits = [
+        line.split('\0')
+        for line in ds.repo.call_git_items_(
+            # name, email, timestamp, shasum
+            ['log', '--pretty=format:%aN%x00%aE%x00%aI%x00%H', refcommit]
+        )
+    ]
+    # version, always anchored on the first commit (tags could move and
+    # make the integer commit count ambiguous, and substantially complicate
+    # version comparisons)
+    version = '0-{}-g{}'.format(
+        len(commits),
+        # abbreviated shasum (like git-describe)
+        ds.repo.get_hexsha(commits[0][3], short=True),
+    )
+    meta = {
+        'version': version,
+    }
+    if ds.config.obtain(
+            'datalad.metadata.datalad-core.report-contributors',
+            True, valtype=EnsureBool()):
+        meta.update(
+            contributors=sorted(set(tuple(c[:2]) for c in commits)))
+    if ds.config.obtain(
+            'datalad.metadata.datalad-core.report-modification-dates',
+            True, valtype=EnsureBool()):
+        meta.update(
+            dateCreated=commits[-1][2],
+            dateModified=commits[0][2],
+        )
+    return meta
+
+
+def get_agent_id(name, email):
+    """Return a suitable '@id' for committers/authors
+
+    In most cases we will not have a URL for people/software agents.
+    Let's create a string ID that is based on the combination of both
+    name and email. Return an MD5 hash instead of a plain-text string
+    to discourage direct interpretation by humans.
+    """
+    return hashlib.md5(u'{}<{}>'.format(
+        name.replace(' ', '_'),
+        email
+    ).encode('utf-8')).hexdigest()
+
+
+# TODO RF to be merged with datalad.support.network
+def ri2url(ri):
+    f = ri.fields
+    if isinstance(ri, dsn.URL):
+        return ri.as_str()
+    elif isinstance(ri, dsn.SSHRI):
+        # construct a URL that Git would understand
+        return 'ssh://{}{}{}{}{}{}'.format(
+            f['username'],
+            '@' if f['username'] else '',
+            f['hostname'],
+            ':' if f['port'] else '',
+            f['port'],
+            f['path'] if op.isabs(f['path'])
+            else '/{}'.format(f['path']) if f['path'].startswith('~')
+            else '/~/{}'.format(f['path'])
+        )
+    elif isinstance(ri, dsn.PathRI):
+        # this has no chance of being resolved outside this machine
+        # not worth reporting
+        return None
+
+
+def get_dataset_url(graph):
+    dataset = [x for x in graph if x["@type"] == "Dataset"]
+    dataset = dataset[0]
+    dist = dataset.get("distribution", [])
+    return [d["url"] for d in dist if "url" in d]
+
+
+def get_authors(graph):
+    return [{"name": x["name"], "email": x["email"]}
+            for x in graph if x["@type"] == "agent"]
+
+
+def get_subdatasets(graph):
+    dataset = [x for x in graph if x["@type"] == "Dataset"]
+    dataset = dataset[0]
+    haspart = dataset.get("hasPart", [])
+    subs = [
+        {
+            "dataset_id": h["identifier"].replace("datalad:", ""),
+            "dataset_version": h["@id"].replace("datalad:", ""),
+            "dataset_path": h["name"],
+            "dirs_from_path": [],
+        } for h in haspart
+    ]
+    return subs if len(subs) > 0 else None
+
+
+def get_metadata_source(metadata_record):
+    s = {
+        "key_source_map": {},
+        "sources": [
+            {
+                "source_name": metadata_record["extractor_name"],
+                "source_version": metadata_record["extractor_version"],
+                "source_parameter": metadata_record["extraction_parameter"],
+                "source_time": metadata_record["extraction_time"],
+                "agent_email": metadata_record["agent_email"],
+                "agent_name": metadata_record["agent_name"]
+            }
+        ]
+    }
+    return s if len(s) > 0 else None
+
+
+def translate(metadata_record, graph):
+    translated_record = {
+        "type": metadata_record["type"],
+        "dataset_id": metadata_record["dataset_id"],
+        "dataset_version": metadata_record["dataset_version"],
+        "metadata_sources": get_metadata_source(metadata_record),
+        "name": "",
+        "url": get_dataset_url(graph),
+        "authors": get_authors(graph),
+        "subdatasets": get_subdatasets(graph),
+    }
+    return {k: v for k, v in translated_record.items() if v is not None}
+
+
+# SCRIPT EXECUTION STARTS HERE
+
+parser = ArgumentParser()
+parser.add_argument("dataset_path", type=Path, help="Path to the datalad dataset")
+args = parser.parse_args()
+source_dataset = EnsureDataset(
+    installed=True,
+    purpose='extract core metadata',
+    require_id=True)(args.dataset_path).ds
+source_dataset_id = source_dataset.id
+source_dataset_version = source_dataset.repo.get_hexsha()
+agent_name = source_dataset.config.get("user.name")
+agent_email = source_dataset.config.get("user.email")
+status = source_dataset.subdatasets(result_renderer="disabled")
+metadata = get_dataset_metadata(source_dataset, source_dataset_version, status)
+default_context = {
+    # schema.org definitions by default
+    "@vocab": "http://schema.org/",
+    # DataLad ID prefix, pointing to our own resolver
+    "datalad": "http://dx.datalad.org/",
+}
+# meta_out structures the metadata in exactly the same
+# way as datalad meta-extract outputs it
+meta_out = dict(
+    dataset_id=source_dataset.id,
+    dataset_version=source_dataset_version,
+    extractor_name="catalog_core",
+    extractor_version="1",
+    extraction_parameter={},
+    extraction_time=time.time(),
+    agent_name=agent_name,
+    agent_email=agent_email,
+    extracted_metadata={
+        "@context": default_context,
+        "@graph": metadata,
+    },
+    type="dataset",
+)
+# translate() does more or less the same as
+# datalad_catalog.translators.metalad_core_translator
+meta_translated = translate(meta_out, metadata)
+print(json.dumps(meta_translated))
\ No newline at end of file
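
Usage sketch for the new extractor script, illustrating the workflow the commit
message describes; the dataset path is a placeholder, and how the resulting
record is then added to a catalog depends on the installed datalad-catalog
version:

    # Sketch only: run the extractor script on an installed DataLad dataset
    # and load the catalog-ready record it prints to stdout.
    import json
    import subprocess
    import sys

    result = subprocess.run(
        [sys.executable, "datalad_catalog/extractors/catalog_core.py",
         "/path/to/dataset"],  # placeholder path to an installed dataset
        capture_output=True, text=True, check=True,
    )
    record = json.loads(result.stdout)
    # 'record' follows the catalog schema and can now be added to a catalog
    print(record["dataset_id"], record["dataset_version"])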