diff --git a/.travis.yml b/.travis.yml
index 78c0d47b..64ce8763 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -75,7 +75,7 @@ install:
       export PIPCMD=pip;
     fi;
 
-  - $PIPCMD install lxml enum34 pyyaml rdflib
+  - $PIPCMD install lxml enum34 pyyaml rdflib owlrl requests
 
 script:
   - which $PYCMD
diff --git a/appveyor.yml b/appveyor.yml
index 8cc7906b..b4717893 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -37,7 +37,7 @@ init:
 build: false
 
 install:
-  - python -m pip install lxml enum34 pyyaml rdflib
+  - python -m pip install lxml enum34 pyyaml rdflib owlrl requests
 
 test_script:
   - python --version
diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py
index 205ec7c6..30d370a6 100644
--- a/odml/tools/rdf_converter.py
+++ b/odml/tools/rdf_converter.py
@@ -4,12 +4,13 @@
 """
 
 import os
 import uuid
+import warnings
 
 from io import StringIO
 from rdflib import Graph, Literal, URIRef
 from rdflib.graph import Seq
-from rdflib.namespace import XSD, RDF
+from rdflib.namespace import XSD, RDF, RDFS
 
 import yaml
 
@@ -57,14 +58,32 @@ class RDFWriter(object):
     """
     A writer to parse odML files into RDF documents.
 
+    Use the 'rdf_subclassing' flag to disable default usage of Section type conversion to
+    RDF Subclasses.
+    Provide a custom Section type to RDF Subclass Name mapping dictionary via the
+    'custom_subclasses' attribute to add custom or overwrite default RDF Subclass mappings.
+
     Usage:
         RDFWriter(odml_docs).get_rdf_str('turtle')
         RDFWriter(odml_docs).write_file("/output_path", "rdf_format")
+
+        RDFWriter(odml_docs, rdf_subclassing=False).write_file("path", "rdf_format")
+        RDFWriter(odml_docs, custom_subclasses=custom_dict).write_file("path", "rdf_format")
     """
 
-    def __init__(self, odml_documents):
+    def __init__(self, odml_documents, rdf_subclassing=True, custom_subclasses=None):
         """
         :param odml_documents: list of odML documents
+        :param rdf_subclassing: Flag whether Section types should be converted to RDF Subclasses
+                                for enhanced SPARQL queries. Default is 'True'.
+        :param custom_subclasses: A dict where the keys reference a Section type and the
+                                  corresponding values reference an RDF Class Name. When exporting
+                                  a Section of a type contained in this dict, the resulting RDF
+                                  Instance will be of the corresponding Class and this Class will
+                                  be added as a Subclass of RDF Class "odml:Section" to the
+                                  RDF document.
+                                  Key:value pairs of the "custom_subclasses" dict will overwrite
+                                  existing key:value pairs of the default subclassing dict.
         """
         if not isinstance(odml_documents, list):
             odml_documents = [odml_documents]
@@ -74,7 +93,13 @@ def __init__(self, odml_documents):
         self.graph = Graph()
         self.graph.bind("odml", ODML_NS)
 
+        self.rdf_subclassing = rdf_subclassing
+
         self.section_subclasses = load_rdf_subclasses()
+        # If a custom Section type to RDF Subclass dict has been provided,
+        # parse it and update the default section_subclasses dict with the content.
+        if custom_subclasses and isinstance(custom_subclasses, dict):
+            self._parse_custom_subclasses(custom_subclasses)
 
     def convert_to_rdf(self):
         """
@@ -221,10 +246,16 @@ def save_section(self, sec, curr_node):
 
         # Add type of current node to the RDF graph
         curr_type = fmt.rdf_type
+
         # Handle section subclass types
-        sub_sec = self._get_section_subclass(sec)
-        if sub_sec:
-            curr_type = sub_sec
+        if self.rdf_subclassing:
+            sub_sec = self._get_section_subclass(sec)
+            if sub_sec:
+                curr_type = sub_sec
+                self.graph.add((URIRef(fmt.rdf_type), RDF.type, RDFS.Class))
+                self.graph.add((URIRef(curr_type), RDF.type, RDFS.Class))
+                self.graph.add((URIRef(curr_type), RDFS.subClassOf, URIRef(fmt.rdf_type)))
+
         self.graph.add((curr_node, RDF.type, URIRef(curr_type)))
 
         for k in fmt.rdf_map_keys:
@@ -294,6 +325,36 @@ class Section.
 
         return None
 
+    def _parse_custom_subclasses(self, custom_subclasses):
+        """
+        Parses a provided dictionary of "Section type": "RDF Subclass name"
+        key value pairs and adds the pairs to the writer's 'section_subclasses'
+        default dictionary. Existing key:value pairs will be overwritten
+        with provided custom key:value pairs and a Warning will be issued.
+        Dictionary values containing whitespaces will raise a ValueError.
+
+        :param custom_subclasses: dictionary of "Section type": "RDF Subclass name" key value pairs.
+                                  Values must not contain whitespaces, a ValueError will be raised
+                                  otherwise.
+        """
+
+        # Do not allow any whitespace characters in values. All values are checked
+        # before the first one is applied, so an invalid dict leaves the defaults untouched.
+        for val in custom_subclasses.values():
+            if any(char.isspace() for char in val):
+                msg = "Custom RDF Subclass names must not contain any whitespace characters."
+                raise ValueError(msg)
+
+        for k in custom_subclasses:
+            val = custom_subclasses[k]
+            if k in self.section_subclasses:
+                msg = "RDFWriter custom subclasses: Key '%s' already exists. " % k
+                msg += "Value '%s' replaces default value '%s'." % (val, self.section_subclasses[k])
+                # stacklevel 3 attributes the warning to the code that created the
+                # RDFWriter rather than to RDFWriter.__init__.
+                warnings.warn(msg, stacklevel=3)
+            self.section_subclasses[k] = val
+
     def __str__(self):
         return self.convert_to_rdf().serialize(format='turtle').decode("utf-8")
 
diff --git a/setup.py b/setup.py
index 9418eda0..0250e7fa 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,8 @@
 
 install_req = ["lxml", "pyyaml>=5.1", "rdflib", "docopt", "pathlib"]
 
+tests_req = ["owlrl", "requests"]
+
 if sys.version_info < (3, 4):
     install_req += ["enum34"]
 
@@ -45,6 +47,7 @@
     packages=packages,
     test_suite='test',
     install_requires=install_req,
+    tests_require=tests_req,
     include_package_data=True,
     long_description=description_text,
     long_description_content_type="text/markdown",
diff --git a/test/test_rdf_writer.py b/test/test_rdf_writer.py
index e3a4a23a..87448e4b 100644
--- a/test/test_rdf_writer.py
+++ b/test/test_rdf_writer.py
@@ -2,15 +2,21 @@
 import os
 import unittest
 
+from sys import version_info
+
 import yaml
 
+if version_info > (3, 4):
+    from owlrl import DeductiveClosure, RDFS_Semantics
+
 from rdflib import URIRef, Literal
-from rdflib.namespace import XSD, RDF
+from rdflib.namespace import Namespace, RDF, RDFS, XSD
+from rdflib.plugins.sparql import prepareQuery
 
 import odml
 
 from odml.format import Format
-from odml.tools.rdf_converter import RDFWriter
+from odml.tools.rdf_converter import ODML_NS, RDFWriter
 
 from .test_samplefile import SampleFileCreator
 from .test_samplefile import parse
@@ -288,3 +294,234 @@ def test_get_rdf_string(self):
 
         with self.assertRaises(ValueError):
             rdf_writer.get_rdf_str("abc")
+
+    def test_rdf_subclassing_switch(self):
+        """
+        Test the RDF section subclassing switch.
+        """
+        # Section type term defined in odml/resources/section_subclasses.yaml that will
+        # be converted to an RDF Section Subclass of Class "Cell".
+        sub_class_term = "cell"
+
+        # Create minimal document
+        doc = odml.Document()
+        _ = odml.Section(name="test_subclassing", type=sub_class_term, parent=doc)
+
+        # Test default subclassing
+        rdf_writer = RDFWriter([doc])
+        result = rdf_writer.get_rdf_str()
+        self.assertIn("odml:Cell", result)
+
+        # Test inactivation of subclassing feature
+        rdf_writer = RDFWriter([doc], rdf_subclassing=False)
+        result = rdf_writer.get_rdf_str()
+        self.assertNotIn("odml:Cell", result)
+
+    def test_rdf_custom_subclasses(self):
+        """
+        Test collection of the odml RDF subclassing feature.
+        Tests that the resulting output RDF document contains any required
+        additional RDF subclasses.
+        """
+        sub_class_term = "cell"
+
+        # Create minimal document
+        doc = odml.Document()
+        _ = odml.Section(name="test_subclassing", type=sub_class_term, parent=doc)
+
+        # Test None dict
+        rdf_writer = RDFWriter([doc], custom_subclasses=None)
+        self.assertIn("odml:Cell", rdf_writer.get_rdf_str())
+
+        # Test invalid dict
+        rdf_writer = RDFWriter([doc], custom_subclasses=["invalid"])
+        self.assertIn("odml:Cell", rdf_writer.get_rdf_str())
+
+        # Test value whitespace
+        inval_a = "This should"
+        inval_b = "fail\nin"
+        inval_c = "the\tmost"
+        inval_d = "complete\rway"
+        invalid_dict = {"type_1": inval_a, "type_2": inval_b, "type_3": inval_c, "type_4": inval_d}
+        with self.assertRaises(ValueError):
+            _ = RDFWriter([doc], custom_subclasses=invalid_dict)
+
+        # Test custom subclassing
+        type_custom_class = "species"
+        custom_class_dict = {type_custom_class: "Species"}
+
+        doc = odml.Document()
+        _ = odml.Section(name="test_subclassing", type="cell", parent=doc)
+        _ = odml.Section(name="test_custom_subclassing", type=type_custom_class, parent=doc)
+
+        rdf_writer = RDFWriter([doc], custom_subclasses=custom_class_dict)
+        self.assertIn("odml:Cell", rdf_writer.get_rdf_str())
+        self.assertIn("odml:Species", rdf_writer.get_rdf_str())
+
+        # Test custom subclassing overwrite
+        sub_class_type = "cell"
+        custom_class_dict = {sub_class_type: "Neuron"}
+
+        doc = odml.Document()
+        _ = odml.Section(name="test_subclassing", type=sub_class_type, parent=doc)
+
+        if version_info > (3, 4):
+            with self.assertWarns(UserWarning):
+                rdf_writer = RDFWriter([doc], custom_subclasses=custom_class_dict)
+            self.assertNotIn("odml:Cell", rdf_writer.get_rdf_str())
+            self.assertIn("odml:Neuron", rdf_writer.get_rdf_str())
+
+    def test_rdf_subclassing_definitions(self):
+        """
+        Test that RDF Subclass definitions are written to the resulting graph.
+        """
+        # -- Test default subclassing
+        doc = odml.Document()
+        _ = odml.Section(name="test_subclassing", type="cell", parent=doc)
+
+        rdf_writer = RDFWriter([doc])
+        curr_str = " ".join(rdf_writer.get_rdf_str().split())
+        self.assertIn("odml:Cell a rdfs:Class ; rdfs:subClassOf odml:Section", curr_str)
+        self.assertIn("odml:Section a rdfs:Class", curr_str)
+
+        # -- Test multiple entries; a definition should only occur once in an RDF document
+        doc = odml.Document()
+        sec = odml.Section(name="test_subclassing", type="cell", parent=doc)
+        sub_sec = odml.Section(name="test_subclassing", type="cell", parent=sec)
+        _ = odml.Section(name="test_subclassing", type="cell", parent=sub_sec)
+
+        rdf_writer = RDFWriter([doc])
+        curr_str = " ".join(rdf_writer.get_rdf_str().split())
+        self.assertIn("odml:Cell a rdfs:Class ; rdfs:subClassOf odml:Section", curr_str)
+        self.assertEqual(curr_str.count("odml:Cell a rdfs:Class ; rdfs:subClassOf odml:Section"), 1)
+        self.assertIn("odml:Section a rdfs:Class", curr_str)
+        self.assertEqual(curr_str.count("odml:Section a rdfs:Class"), 1)
+
+        # -- Test custom subclassing
+        type_custom_class = "species"
+        custom_class_dict = {type_custom_class: "Species"}
+
+        doc = odml.Document()
+        _ = odml.Section(name="test_subclassing", type="cell", parent=doc)
+        _ = odml.Section(name="test_custom_subclassing", type=type_custom_class, parent=doc)
+
+        rdf_writer = RDFWriter([doc], custom_subclasses=custom_class_dict)
+        curr_str = " ".join(rdf_writer.get_rdf_str().split())
+        self.assertIn("odml:Cell a rdfs:Class ; rdfs:subClassOf odml:Section", curr_str)
+        self.assertIn("odml:Species a rdfs:Class ; rdfs:subClassOf odml:Section", curr_str)
+        self.assertIn("odml:Section a rdfs:Class", curr_str)
+
+        # -- Test inactive subclassing
+        doc = odml.Document()
+        _ = odml.Section(name="test_subclassing", type="cell", parent=doc)
+
+        rdf_writer = RDFWriter([doc], rdf_subclassing=False)
+        curr_str = " ".join(rdf_writer.get_rdf_str().split())
+        self.assertNotIn("odml:Section a rdfs:Class", curr_str)
+        self.assertNotIn("odml:Cell a rdfs:Class ; rdfs:subClassOf odml:Section", curr_str)
+
+    def test_rdf_subclassing_queries(self):
+        """
+        Test the proper implementation of the RDF subclassing feature. Tests ensure, that queries
+        relying on RDF Subclasses return appropriate results.
+        """
+        if version_info > (3, 4):
+            namespace_map = {"odml": Namespace(ODML_NS), "rdf": RDF, "rdfs": RDFS}
+
+            doc = odml.Document()
+            _ = odml.Section(name="test_subclass", type="cell", parent=doc)
+            _ = odml.Section(name="test_regular_class", type="regular", parent=doc)
+
+            rdf_writer = RDFWriter([doc])
+            _ = rdf_writer.get_rdf_str()
+
+            use_graph = rdf_writer.graph
+            DeductiveClosure(RDFS_Semantics).expand(use_graph)
+
+            q_string = "SELECT * WHERE {?s rdf:type odml:Section .}"
+            curr_query = prepareQuery(q_string, initNs=namespace_map)
+
+            # Make sure the query finds two sections
+            self.assertEqual(len(use_graph.query(curr_query)), 2)
+
+            # Collect the subjects of all Sections found by the query
+            result_section = []
+            for row in use_graph.query(curr_query):
+                result_section.append(row.s)
+
+            q_string = "SELECT * WHERE {?s rdf:type odml:Cell .}"
+            curr_query = prepareQuery(q_string, initNs=namespace_map)
+
+            self.assertEqual(len(use_graph.query(curr_query)), 1)
+            for row in use_graph.query(curr_query):
+                self.assertIn(row.s, result_section)
+
+            # -- Test custom subclassing queries
+            type_custom_class = "species"
+            type_overwrite_class = "cell"
+            custom_class_dict = {type_custom_class: "Species", type_overwrite_class: "Neuron"}
+
+            doc = odml.Document()
+            sec = odml.Section(name="test_subclass", type="species", parent=doc)
+            _ = odml.Section(name="test_subclass_overwrite", type="cell", parent=sec)
+            _ = odml.Section(name="test_regular_class", type="regular", parent=sec)
+
+            rdf_writer = RDFWriter([doc], custom_subclasses=custom_class_dict)
+            _ = rdf_writer.get_rdf_str()
+
+            use_graph = rdf_writer.graph
+            DeductiveClosure(RDFS_Semantics).expand(use_graph)
+
+            q_string = "SELECT * WHERE {?s rdf:type odml:Section .}"
+            curr_query = prepareQuery(q_string, initNs=namespace_map)
+
+            # Make sure the query finds three sections
+            self.assertEqual(len(use_graph.query(curr_query)), 3)
+
+            # Collect the subjects of all Sections found by the query
+            result_section = []
+            for row in use_graph.query(curr_query):
+                result_section.append(row.s)
+
+            # Custom class 'Species' should be found.
+            q_string = "SELECT * WHERE {?s rdf:type odml:Species .}"
+            curr_query = prepareQuery(q_string, initNs=namespace_map)
+
+            self.assertEqual(len(use_graph.query(curr_query)), 1)
+            for row in use_graph.query(curr_query):
+                self.assertIn(row.s, result_section)
+
+            # Custom class 'Neuron' should be found.
+            q_string = "SELECT * WHERE {?s rdf:type odml:Neuron .}"
+            curr_query = prepareQuery(q_string, initNs=namespace_map)
+
+            self.assertEqual(len(use_graph.query(curr_query)), 1)
+            for row in use_graph.query(curr_query):
+                self.assertIn(row.s, result_section)
+
+            # Default class 'Cell' was replaced and should not return any result.
+            q_string = "SELECT * WHERE {?s rdf:type odml:Cell .}"
+            curr_query = prepareQuery(q_string, initNs=namespace_map)
+
+            self.assertEqual(len(use_graph.query(curr_query)), 0)
+
+            # -- Test inactivated subclassing
+            doc = odml.Document()
+            _ = odml.Section(name="test_regular_class", type="regular", parent=doc)
+            _ = odml.Section(name="test_subclass", type="cell", parent=doc)
+
+            rdf_writer = RDFWriter([doc], rdf_subclassing=False)
+            _ = rdf_writer.get_rdf_str()
+
+            use_graph = rdf_writer.graph
+            DeductiveClosure(RDFS_Semantics).expand(use_graph)
+
+            q_string = "SELECT * WHERE {?s rdf:type odml:Section .}"
+            curr_query = prepareQuery(q_string, initNs=namespace_map)
+
+            self.assertEqual(len(use_graph.query(curr_query)), 2)
+
+            q_string = "SELECT * WHERE {?s rdf:type odml:Cell .}"
+            curr_query = prepareQuery(q_string, initNs=namespace_map)
+
+            self.assertEqual(len(use_graph.query(curr_query)), 0)