def test_csv():
    """Test parsing a csv file, accessing it as dicts, writing it back
    to csv and serialising it to a triplestore."""
    from dataset_paths import indir, outdir  # pylint: disable=import-error

    pytest.importorskip("rdflib")

    from tripper import Triplestore
    from tripper.dataset import TableDoc

    # Read csv file
    td = TableDoc.parse_csv(
        indir / "semdata.csv",
        delimiter=";",
        prefixes={
            "sem": "https://w3id.com/emmo/domain/sem/0.1#",
            "semdata": "https://he-matchmaker.eu/data/sem/",
            "sample": "https://he-matchmaker.eu/sample/",
            "mat": "https://he-matchmaker.eu/material/",
            "dm": "http://onto-ns.com/meta/characterisation/0.1/SEMImage#",
            "parser": "http://sintef.no/dlite/parser#",
            "gen": "http://sintef.no/dlite/generator#",
        },
    )

    # pylint: disable=unused-variable,unbalanced-tuple-unpacking
    img, series, batch, sample = td.asdicts()

    assert img["@id"] == (
        "https://he-matchmaker.eu/data/sem/SEM_cement_batch2/"
        "77600-23-001/77600-23-001_5kV_400x_m001"
    )
    assert img.distribution.downloadURL == (
        "https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/"
        "tests/input/77600-23-001_5kV_400x_m001.tif"
    )

    # Write the table to a new csv file
    td.write_csv(outdir / "semdata.csv")

    # Print serialised KB
    ts = Triplestore(backend="rdflib")
    td.save(ts)
    ts.serialize(outdir / "semdata.ttl")
    print(ts.serialize())
@id;@type;title;description;creator;contactPoint;inSeries;datamodel;datamodelStorage;distribution.downloadURL;distribution.mediaType;distribution.parser;fromSample;isDescriptionOf -semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner ;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1 -semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM image of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner ;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;; -semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;…;Sigurd Wenner;Sigurd Wenner ; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;; -mple:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;; +semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner 
;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1 +semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM image of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner ;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;; +semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;...;Sigurd Wenner;Sigurd Wenner ; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;; +sample:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;; diff --git a/tests/output/.gitignore b/tests/output/.gitignore index c26b5163..613dbf65 100644 --- a/tests/output/.gitignore +++ b/tests/output/.gitignore @@ -3,3 +3,4 @@ *.ttl *.png *.tiff +*.csv diff --git a/tripper/dataset/tabledoc.py b/tripper/dataset/tabledoc.py index 9fd5d988..6dbf8b32 100644 --- a/tripper/dataset/tabledoc.py +++ b/tripper/dataset/tabledoc.py @@ -1,5 +1,7 @@ """Basic interface for tabular documentation of datasets.""" +import csv +from pathlib import Path from typing import TYPE_CHECKING from tripper import Triplestore @@ -26,6 +28,7 @@ class TableDoc: prefixes: Dict with prefixes in addition to those included in the JSON-LD context. Should map namespace prefixes to IRIs. context: Dict with user-defined JSON-LD context. + strip: Whether to strip leading and trailing whitespaces from cells. 
""" @@ -38,12 +41,14 @@ def __init__( type: "Optional[str]" = "dataset", prefixes: "Optional[dict]" = None, context: "Optional[Union[dict, list]]" = None, + strip: bool = True, ): self.header = header self.data = data self.type = type self.prefixes = prefixes self.context = context + self.strip = strip def asdicts(self) -> "List[dict]": """Return the table as a list of dicts.""" @@ -53,9 +58,11 @@ def asdicts(self) -> "List[dict]": for row in self.data: d = AttrDict() for i, colname in enumerate(self.header): - cell = row[i] + cell = row[i].strip() if row[i] and self.strip else row[i] if cell: - addnested(d, colname, cell) + addnested( + d, colname.strip() if self.strip else colname, cell + ) jsonld = as_jsonld( d, type=self.type, prefixes=self.prefixes, **kw # type: ignore ) @@ -66,3 +73,80 @@ def save(self, ts: Triplestore) -> None: """Save tabular datadocumentation to triplestore.""" for d in self.asdicts(): save_dict(ts, d) + + @staticmethod + def parse_csv( + csvfile: "Union[Path, str]", + type: "Optional[str]" = "dataset", + prefixes: "Optional[dict]" = None, + context: "Optional[Union[dict, list]]" = None, + encoding: str = "utf-8", + dialect: "Union[csv.Dialect, str]" = "excel", + **kwargs, + ) -> "TableDoc": + # pylint: disable=line-too-long + """Parse a csv file using the standard library csv module. + + Arguments: + csvfile: CSV file to parse. + type: Type of data to save (applies to all rows). Should + either be one of the pre-defined names: "dataset", + "distribution", "accessService", "parser" and "generator" + or an IRI to a class in an ontology. Defaults to + "dataset". + prefixes: Dict with prefixes in addition to those included in the + JSON-LD context. Should map namespace prefixes to IRIs. + context: Dict with user-defined JSON-LD context. + encoding: The encoding of the csv file. Note that Excel may + encode as "ISO-8859" (commonly used in 1990th). 
+ dialect: A subclass of csv.Dialect, or the name of the dialect, + specifying how the `csvfile` is formatted. For more details, + see [Dialects and Formatting Parameters]. + kwargs: Additional keyword arguments overriding individual + formatting parameters. For more details, see + [Dialects and Formatting Parameters]. + + References: + [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters + """ + with open(csvfile, mode="rt", encoding=encoding) as f: + reader = csv.reader(f, dialect=dialect, **kwargs) + header = next(reader) + data = list(reader) + + return TableDoc( + header=header, + data=data, + type=type, + prefixes=prefixes, + context=context, + ) + + def write_csv( + self, + csvfile: "Union[Path, str]", + encoding: str = "utf-8", + dialect: "Union[csv.Dialect, str]" = "excel", + **kwargs, + ) -> None: + # pylint: disable=line-too-long + """Write the table to a csv file using the standard library csv module. + + Arguments: + csvfile: CSV file to parse. + encoding: The encoding of the csv file. + dialect: A subclass of csv.Dialect, or the name of the dialect, + specifying how the `csvfile` is formatted. For more details, + see [Dialects and Formatting Parameters]. + kwargs: Additional keyword arguments overriding individual + formatting parameters. For more details, see + [Dialects and Formatting Parameters]. + + References: + [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters + """ + with open(csvfile, mode="wt", encoding=encoding) as f: + writer = csv.writer(f, dialect=dialect, **kwargs) + writer.writerow(self.header) + for row in self.data: + writer.writerow(row)