Added csv parser to Tabledoc (#275)
# Description
Added csv parser to Tabledoc (using the built-in csv module).
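
Below is a minimal usage sketch of the new API (file names are illustrative; formatting parameters such as `delimiter` are forwarded to the built-in `csv` module):

```python
from tripper import Triplestore
from tripper.dataset import TableDoc

# Parse a semicolon-delimited table documenting datasets
td = TableDoc.parse_csv("semdata.csv", delimiter=";")

# Save the documentation to a triplestore
ts = Triplestore(backend="rdflib")
td.save(ts)

# Round-trip: write the table back to a new csv file
td.write_csv("semdata_copy.csv")
```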

---------

Co-authored-by: Tor S. Haugland <torshaugland@gmail.com>
jesper-friis and torhaugl authored Jan 3, 2025
1 parent c84bae1 commit 415fa4f
Showing 5 changed files with 139 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -18,3 +18,4 @@ dist/
 
 # Test output
 route.svg
+coverage.xml
47 changes: 47 additions & 0 deletions tests/dataset/test_tabledoc.py
@@ -81,3 +81,50 @@ def test_as_dicts():
     ts = Triplestore(backend="rdflib")
     td.save(ts)
     print(ts.serialize())
+
+
+# if True:
+def test_csv():
+    """Test parsing a csv file."""
+    from dataset_paths import indir, outdir  # pylint: disable=import-error
+
+    pytest.importorskip("rdflib")
+
+    from tripper import Triplestore
+    from tripper.dataset import TableDoc
+
+    # Read csv file
+    td = TableDoc.parse_csv(
+        indir / "semdata.csv",
+        delimiter=";",
+        prefixes={
+            "sem": "https://w3id.com/emmo/domain/sem/0.1#",
+            "semdata": "https://he-matchmaker.eu/data/sem/",
+            "sample": "https://he-matchmaker.eu/sample/",
+            "mat": "https://he-matchmaker.eu/material/",
+            "dm": "http://onto-ns.com/meta/characterisation/0.1/SEMImage#",
+            "parser": "http://sintef.no/dlite/parser#",
+            "gen": "http://sintef.no/dlite/generator#",
+        },
+    )
+
+    # pylint: disable=unused-variable,unbalanced-tuple-unpacking
+    img, series, batch, sample = td.asdicts()
+
+    assert img["@id"] == (
+        "https://he-matchmaker.eu/data/sem/SEM_cement_batch2/"
+        "77600-23-001/77600-23-001_5kV_400x_m001"
+    )
+    assert img.distribution.downloadURL == (
+        "https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/"
+        "tests/input/77600-23-001_5kV_400x_m001.tif"
+    )
+
+    # Write the table to a new csv file
+    td.write_csv(outdir / "semdata.csv")
+
+    # Print serialised KB
+    ts = Triplestore(backend="rdflib")
+    td.save(ts)
+    ts.serialize(outdir / "semdata.ttl")
+    print(ts.serialize())
8 changes: 4 additions & 4 deletions tests/input/semdata.csv
@@ -1,5 +1,5 @@
 @id;@type;title;description;creator;contactPoint;inSeries;datamodel;datamodelStorage;distribution.downloadURL;distribution.mediaType;distribution.parser;fromSample;isDescriptionOf
-semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 �m diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1
-semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM image of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 �m diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;;
-semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;;
-mple:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;;
+semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1
+semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM image of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;;
+semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;...;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;;
+sample:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;;
1 change: 1 addition & 0 deletions tests/output/.gitignore
@@ -3,3 +3,4 @@
 *.ttl
 *.png
 *.tiff
+*.csv
88 changes: 86 additions & 2 deletions tripper/dataset/tabledoc.py
@@ -1,5 +1,7 @@
 """Basic interface for tabular documentation of datasets."""
 
+import csv
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 from tripper import Triplestore
@@ -26,6 +28,7 @@ class TableDoc:
         prefixes: Dict with prefixes in addition to those included in the
             JSON-LD context. Should map namespace prefixes to IRIs.
         context: Dict with user-defined JSON-LD context.
+        strip: Whether to strip leading and trailing whitespace from cells.
     """
 
@@ -38,12 +41,14 @@ def __init__(
         type: "Optional[str]" = "dataset",
         prefixes: "Optional[dict]" = None,
         context: "Optional[Union[dict, list]]" = None,
+        strip: bool = True,
    ):
         self.header = header
         self.data = data
         self.type = type
         self.prefixes = prefixes
         self.context = context
+        self.strip = strip
 
     def asdicts(self) -> "List[dict]":
         """Return the table as a list of dicts."""
@@ -53,9 +58,11 @@ def asdicts(self) -> "List[dict]":
         for row in self.data:
             d = AttrDict()
             for i, colname in enumerate(self.header):
-                cell = row[i]
+                cell = row[i].strip() if row[i] and self.strip else row[i]
                 if cell:
-                    addnested(d, colname, cell)
+                    addnested(
+                        d, colname.strip() if self.strip else colname, cell
+                    )
             jsonld = as_jsonld(
                 d, type=self.type, prefixes=self.prefixes, **kw  # type: ignore
             )
@@ -66,3 +73,80 @@ def save(self, ts: Triplestore) -> None:
         """Save tabular data documentation to triplestore."""
         for d in self.asdicts():
             save_dict(ts, d)
+
+    @staticmethod
+    def parse_csv(
+        csvfile: "Union[Path, str]",
+        type: "Optional[str]" = "dataset",
+        prefixes: "Optional[dict]" = None,
+        context: "Optional[Union[dict, list]]" = None,
+        encoding: str = "utf-8",
+        dialect: "Union[csv.Dialect, str]" = "excel",
+        **kwargs,
+    ) -> "TableDoc":
+        # pylint: disable=line-too-long
+        """Parse a csv file using the standard library csv module.
+
+        Arguments:
+            csvfile: CSV file to parse.
+            type: Type of data to save (applies to all rows). Should
+                either be one of the pre-defined names: "dataset",
+                "distribution", "accessService", "parser" and "generator"
+                or an IRI to a class in an ontology. Defaults to
+                "dataset".
+            prefixes: Dict with prefixes in addition to those included in the
+                JSON-LD context. Should map namespace prefixes to IRIs.
+            context: Dict with user-defined JSON-LD context.
+            encoding: The encoding of the csv file. Note that Excel may
+                encode as "ISO-8859" (commonly used in the 1990s).
+            dialect: A subclass of csv.Dialect, or the name of the dialect,
+                specifying how the `csvfile` is formatted. For more details,
+                see [Dialects and Formatting Parameters].
+            kwargs: Additional keyword arguments overriding individual
+                formatting parameters. For more details, see
+                [Dialects and Formatting Parameters].
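+
+        Example:
+            A minimal, illustrative sketch (the file name and the
+            `delimiter` keyword below are assumptions for illustration
+            only):
+
+                td = TableDoc.parse_csv("semdata.csv", delimiter=";")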
+
+        References:
+            [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
+        """
+        with open(csvfile, mode="rt", encoding=encoding) as f:
+            reader = csv.reader(f, dialect=dialect, **kwargs)
+            header = next(reader)
+            data = list(reader)
+
+        return TableDoc(
+            header=header,
+            data=data,
+            type=type,
+            prefixes=prefixes,
+            context=context,
+        )
+
+    def write_csv(
+        self,
+        csvfile: "Union[Path, str]",
+        encoding: str = "utf-8",
+        dialect: "Union[csv.Dialect, str]" = "excel",
+        **kwargs,
+    ) -> None:
+        # pylint: disable=line-too-long
+        """Write the table to a csv file using the standard library csv module.
+
+        Arguments:
+            csvfile: CSV file to write to.
+            encoding: The encoding of the csv file.
+            dialect: A subclass of csv.Dialect, or the name of the dialect,
+                specifying how the `csvfile` is formatted. For more details,
+                see [Dialects and Formatting Parameters].
+            kwargs: Additional keyword arguments overriding individual
+                formatting parameters. For more details, see
+                [Dialects and Formatting Parameters].
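+
+        Example:
+            A minimal, illustrative sketch (the output file name is an
+            assumption for illustration only):
+
+                td.write_csv("semdata_copy.csv", delimiter=";")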
+
+        References:
+            [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
+        """
+        # newline="" lets the csv module handle line endings itself
+        with open(csvfile, mode="wt", encoding=encoding, newline="") as f:
+            writer = csv.writer(f, dialect=dialect, **kwargs)
+            writer.writerow(self.header)
+            for row in self.data:
+                writer.writerow(row)
