Skip to content

SHACL Validation #767

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 36 commits into from
Nov 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
d5be7fe
Adds shacl validation to renku doctor
Oct 23, 2019
f8625b3
Updates SHACL schema to validate current KG
Oct 25, 2019
6a83b7b
Adds Unit tests for dataset and project jsonld structure
Oct 31, 2019
7a08904
Improves SHACL tests for project and dataset jsonld
Nov 1, 2019
9a34f7f
Merge branch 'master' into 755-kg-validation
Nov 1, 2019
d1243f3
Fixes SHACl validation and adds validation to renku log
Nov 1, 2019
e3bc017
Adds renku log tests
Nov 5, 2019
52c5bcc
Adds some provenance types to the SHACL graph
Nov 5, 2019
db33fc0
Merge branch 'master' into 755-kg-validation
Nov 7, 2019
39b2e9a
fixes rdf log output and pyshacl dependency
Nov 7, 2019
b9535a0
Fixes Project and Dataset SHACL tests
Nov 7, 2019
5339b6d
Adds workflow SHACl validation
Nov 8, 2019
5624fdd
Fix SHACL schema and renku log validation
Nov 8, 2019
f0f20d4
Fix memory leak in integration tests
Nov 8, 2019
b8ddbd5
Merge branch 'master' into 755-kg-validation
rokroskar Nov 10, 2019
8370636
Merge branch 'master' into 755-kg-validation
Panaetius Nov 12, 2019
c827f3a
Fixes Python 3.5 path
Nov 12, 2019
bcb441b
Merge branch '755-kg-validation' of github.com:SwissDataScienceCenter…
Nov 12, 2019
a760d2f
Fixes sort order
Nov 12, 2019
901c67c
Fixes python 3.5 tests
Nov 12, 2019
5e72d13
Merge branch 'master' into 755-kg-validation
Panaetius Nov 12, 2019
835cff1
Patches pyld to prevent contexts from overwhelming its cache
Nov 14, 2019
fda27ad
Merge branch '755-kg-validation' of github.com:SwissDataScienceCenter…
Nov 14, 2019
b776273
Merge branch 'master' into 755-kg-validation
Panaetius Nov 14, 2019
04b07eb
Cleanup of pyld patch
Nov 14, 2019
ed81d9f
Updates documentation
Nov 14, 2019
b299b49
Cleanup: remove hardcoded versions from shacl shape
Nov 14, 2019
86e0456
Merge branch 'master' into 755-kg-validation
jsam Nov 15, 2019
328f5b5
Cleanup and addresses PR comments
Nov 15, 2019
e868ba0
Merge branch 'master' into 755-kg-validation
Panaetius Nov 15, 2019
c0468b4
fix: sanitize author name for nt output
rokroskar Nov 15, 2019
cd13981
refactor: Creator --> Person
rokroskar Nov 15, 2019
35296da
Merge branch 'master' into 755-kg-validation
rokroskar Nov 15, 2019
7537693
Merge branch 'master' into 755-kg-validation
Panaetius Nov 27, 2019
3a4a74d
Adapts SHACL and tests to changes on master
Nov 28, 2019
877113e
Changed to proper objects on loading and fix DatasetFile ids for old…
Nov 29, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ recursive-include renku *.html
recursive-include renku *.sh
recursive-include renku *.txt
recursive-include renku *.yml
recursive-include renku *.json
recursive-include renku Dockerfile
recursive-include tests *.py *.gz *.yml
recursive-include tests *.py *.gz *.yml *.json
prune .github
28 changes: 26 additions & 2 deletions renku/cli/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,15 @@

* `ascii`
* `dot`
* `dot-full`
* `dot-landscape`
* `dot-full-landscape`
* `dot-debug`
* `json-ld`
* `json-ld-graph`
* `Makefile`
* `nt`
* `rdf`

You can generate a PNG of the full history of all files in the repository
using the :program:`dot` program.
Expand All @@ -62,6 +71,15 @@
$ renku log --format dot $FILES | dot -Tpng > /tmp/graph.png
$ open /tmp/graph.png

Output validation
~~~~~~~~~~~~~~~~~

The ``--strict`` option forces the output to be validated against the Renku
SHACL schema, causing the command to fail if the generated output is not
valid, as well as printing detailed information on all the issues found.
The ``--strict`` option is only supported for the ``json-ld``, ``rdf`` and
``nt`` output formats.

"""

import click
Expand All @@ -86,9 +104,15 @@
default=False,
help='Display commands without output files.'
)
@click.option(
'--strict',
is_flag=True,
default=False,
help='Validate triples before output.'
)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@pass_local_client
def log(client, revision, format, no_output, paths):
def log(client, revision, format, no_output, strict, paths):
"""Show logs for a file."""
graph = Graph(client)
if not paths:
Expand All @@ -108,4 +132,4 @@ def log(client, revision, format, no_output, paths):
# NOTE shall we warn when "not no_output and not paths"?
graph.build(paths=paths, revision=revision, can_be_cwl=no_output)

FORMATS[format](graph)
FORMATS[format](graph, strict=strict)
3 changes: 3 additions & 0 deletions renku/core/commands/checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@

from .migration import check_dataset_metadata, check_missing_files
from .references import check_missing_references
from .validate_shacl import check_project_structure, check_datasets_structure

# Checks will be executed in the order as they are listed in __all__.
# They are mostly used in ``doctor`` command to inspect broken things.
__all__ = (
'check_dataset_metadata',
'check_missing_files',
'check_missing_references',
'check_project_structure',
'check_datasets_structure',
)
114 changes: 114 additions & 0 deletions renku/core/commands/checks/validate_shacl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
#
# Copyright 2019 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Check KG structure using SHACL."""
import yaml
from rdflib.namespace import Namespace
from rdflib.term import BNode

from renku.core.commands.echo import WARNING
from renku.core.compat import pyld
from renku.core.models.jsonld import NoDatesSafeLoader
from renku.core.utils.shacl import validate_graph


def _shacl_graph_to_string(graph):
    """Render the results of a SHACL validation graph as readable text.

    Every object of a ``sh:result`` triple in ``graph`` becomes one entry.
    An entry shows the result path followed by the result message when one
    is present; otherwise it shows the source constraint component and the
    focus node (blank focus nodes are printed as ``<Anonymous>``).

    :param graph: validation graph exposing ``subject_objects`` and
        ``value``.
    :return: all entries joined with a newline followed by a tab.
    """
    shacl = Namespace('http://www.w3.org/ns/shacl#')

    def describe(violation):
        """Build the text for a single validation result node."""
        result_path = graph.value(violation, shacl.resultPath)
        text = graph.value(violation, shacl.resultMessage)

        if text:
            return '{0}: {1}'.format(result_path, text)

        # No message available: fall back to the constraint and the node.
        component = graph.value(violation, shacl.sourceConstraintComponent)
        node = graph.value(violation, shacl.focusNode)
        node_label = '<Anonymous>' if isinstance(node, BNode) else node

        return '{0}: Type: {1}, Node ID: {2}'.format(
            result_path, component, node_label
        )

    return '\n\t'.join(
        describe(violation)
        for _, violation in graph.subject_objects(shacl.result)
    )


def check_project_structure(client):
    """Check the project metadata file against the SHACL schema.

    :param client: object providing ``renku_metadata_path``.
    :return: ``(True, None)`` when the metadata conforms, otherwise
        ``(False, message)`` where ``message`` lists the problems found.
    """
    conforms, report, _ = check_shacl_structure(client.renku_metadata_path)

    if not conforms:
        details = _shacl_graph_to_string(report)
        return False, '{0}Invalid structure of project metadata\n\t{1}'.format(
            WARNING, details
        )

    return True, None


def check_datasets_structure(client):
    """Validate dataset metadata against SHACL.

    Every file named ``client.METADATA`` found recursively below
    ``client.renku_datasets_path`` is validated on its own.

    :param client: object providing ``renku_datasets_path`` and
        ``METADATA``.
    :return: ``(True, None)`` when every file was validated and conforms,
        otherwise ``(False, message)`` where ``message`` lists, per file,
        either the validation problems or the error that prevented
        validation.
    """
    ok = True

    problems = ['{0}Invalid structure of dataset metadata'.format(WARNING)]

    for path in client.renku_datasets_path.rglob(client.METADATA):
        try:
            conform, graph, _ = check_shacl_structure(path)
        except Exception as e:
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit still propagate. A file that could not be validated
            # has to fail the check, otherwise the message collected here
            # would be discarded whenever no other file is invalid.
            ok = False
            problems.append('Couldn\'t validate {0}: {1}\n\n'.format(path, e))
            continue

        if conform:
            continue

        ok = False

        problems.append(
            '{0}\n\t{1}\n'.format(path, _shacl_graph_to_string(graph))
        )

    if ok:
        return True, None

    return False, '\n'.join(problems)


def check_shacl_structure(path):
    """Validate the metadata file at ``path`` against the SHACL schema.

    The file is loaded as YAML (an empty file is treated as an empty
    mapping), converted to N-Quads and passed to ``validate_graph``.

    :param path: path-like object providing ``open``.
    :return: whatever ``validate_graph`` returns for the converted data.
    """
    with path.open(mode='r') as stream:
        document = yaml.load(stream, Loader=NoDatesSafeLoader) or {}

    conversion_options = {
        'format': 'application/n-quads',
        'produceGeneralizedRdf': True,
    }
    nquads = pyld.jsonld.to_rdf(document, options=conversion_options)

    return validate_graph(nquads)
2 changes: 1 addition & 1 deletion renku/core/commands/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ def update_datasets(

file_.dataset = dataset
possible_updates.append(file_)
unique_remotes.add(file_.based_on['url'])
unique_remotes.add(file_.based_on.url)

if ref and len(unique_remotes) > 1:
raise ParameterError(
Expand Down
95 changes: 61 additions & 34 deletions renku/core/commands/format/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,43 +21,58 @@

import click

from renku.core.errors import SHACLValidationError
from renku.core.utils.shacl import validate_graph

def ascii(graph):

def ascii(graph, strict=False):
    """Format graph as an ASCII art.

    :param graph: graph to render; it is wrapped in ``DAG`` and shown
        through the pager.
    :param strict: validation is not available for this format, so a true
        value is rejected.
    :raises SHACLValidationError: if ``strict`` is set.
    """
    from ..ascii import DAG
    from ..echo import echo_via_pager

    if strict:
        # Name the format actually requested instead of ``json-ld-graph``.
        raise SHACLValidationError('--strict not supported for ascii')

    echo_via_pager(str(DAG(graph)))


def _jsonld(graph, format, *args, **kwargs):
"""Return formatted graph in JSON-LD ``format`` function."""
import json

from pyld import jsonld
from renku.core.compat import pyld
from renku.core.models.jsonld import asjsonld

output = getattr(jsonld, format)([
output = getattr(pyld.jsonld, format)([
asjsonld(action) for action in graph.activities.values()
])
return json.dumps(output, indent=2)


def dot(graph, simple=True, debug=False, landscape=False):
"""Format graph as a dot file."""
import sys

def _conjunctive_graph(graph):
"""Convert a renku ``Graph`` to an rdflib ``ConjunctiveGraph``."""
from rdflib import ConjunctiveGraph
from rdflib.plugin import register, Parser
from rdflib.tools.rdf2dot import rdf2dot

register('json-ld', Parser, 'rdflib_jsonld.parser', 'JsonLDParser')

g = ConjunctiveGraph().parse(
return ConjunctiveGraph().parse(
data=_jsonld(graph, 'expand'),
format='json-ld',
)


def dot(graph, simple=True, debug=False, landscape=False, strict=False):
"""Format graph as a dot file."""
import sys

from rdflib.tools.rdf2dot import rdf2dot

if strict:
raise SHACLValidationError('--strict not supported for json-ld-graph')

g = _conjunctive_graph(graph)

g.bind('prov', 'http://www.w3.org/ns/prov#')
g.bind('foaf', 'http://xmlns.com/foaf/0.1/')
g.bind('wfdesc', 'http://purl.org/wf4ever/wfdesc#')
Expand Down Expand Up @@ -92,7 +107,7 @@ def _rdf2dot_simple(g, stream):
import re

path_re = re.compile(
r'file:///(?P<type>[a-zA-Z]+)/'
r'(?P<prefix>file://|https://\w+/\w+/){0,1}(?P<type>[a-zA-Z]+)/'
r'(?P<commit>\w+)'
r'(?P<path>.+)?'
)
Expand Down Expand Up @@ -293,10 +308,13 @@ def color(p):
stream.write('}\n')


def makefile(graph):
def makefile(graph, strict=False):
"""Format graph as Makefile."""
from renku.core.models.provenance.activities import ProcessRun, WorkflowRun

if strict:
raise SHACLValidationError('--strict not supported for json-ld-graph')

for activity in graph.activities.values():
if not isinstance(activity, ProcessRun):
continue
Expand All @@ -316,44 +334,53 @@ def makefile(graph):
)


def jsonld(graph):
def jsonld(graph, strict=False):
"""Format graph as JSON-LD file."""
click.echo(_jsonld(graph, 'expand'))
ld = _jsonld(graph, 'expand')

if strict:
r, _, t = validate_graph(ld, format='json-ld')

if not r:
raise SHACLValidationError(
"{}\nCouldn't get log: Invalid Knowledge Graph data".format(t)
)
click.echo(ld)


def jsonld_graph(graph):
def jsonld_graph(graph, strict=False):
    """Print the graph as a flattened JSON-LD document.

    Validation is not available for this format, so a true ``strict``
    raises ``SHACLValidationError`` before anything is printed.
    """
    if strict:
        message = '--strict not supported for json-ld-graph'
        raise SHACLValidationError(message)

    flattened = _jsonld(graph, 'flatten')
    click.echo(flattened)


def nt(graph):
def nt(graph, strict=False):
"""Format graph as n-tuples."""
from rdflib import ConjunctiveGraph
from rdflib.plugin import register, Parser
nt = _conjunctive_graph(graph).serialize(format='nt')
if strict:
r, _, t = validate_graph(nt, format='nt')

register('json-ld', Parser, 'rdflib_jsonld.parser', 'JsonLDParser')
if not r:
raise SHACLValidationError(
"{}\nCouldn't get log: Invalid Knowledge Graph data".format(t)
)

click.echo(
ConjunctiveGraph().parse(
data=_jsonld(graph, 'expand'),
format='json-ld',
).serialize(format='nt')
)
click.echo(nt)


def rdf(graph):
def rdf(graph, strict=False):
"""Output the graph as RDF."""
from rdflib import ConjunctiveGraph
from rdflib.plugin import register, Parser
xml = _conjunctive_graph(graph).serialize(format='application/rdf+xml')
if strict:
r, _, t = validate_graph(xml, format='xml')

register('json-ld', Parser, 'rdflib_jsonld.parser', 'JsonLDParser')
if not r:
raise SHACLValidationError(
"{}\nCouldn't get log: Invalid Knowledge Graph data".format(t)
)

click.echo(
ConjunctiveGraph().parse(
data=_jsonld(graph, 'expand'),
format='json-ld',
).serialize(format='application/rdf+xml')
)
click.echo(xml)


FORMATS = {
Expand Down
Loading