Added csv parser to Tabledoc (#275)
# Description
Added csv parser to Tabledoc (using the built-in csv module).
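
Below is a minimal usage sketch of the new API (file names are illustrative; formatting parameters such as `delimiter` are forwarded to the built-in `csv` module):

```python
from tripper import Triplestore
from tripper.dataset import TableDoc

# Parse a semicolon-delimited table documenting datasets
td = TableDoc.parse_csv("semdata.csv", delimiter=";")

# Save the documentation to a triplestore
ts = Triplestore(backend="rdflib")
td.save(ts)

# Round-trip: write the table back to a new csv file
td.write_csv("semdata_copy.csv")
```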

---------

Co-authored-by: Tor S. Haugland <torshaugland@gmail.com>
jesper-friis and torhaugl authored Jan 3, 2025
1 parent c84bae1 commit 415fa4f
Showing 5 changed files with 139 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -18,3 +18,4 @@ dist/
 
 # Test output
 route.svg
+coverage.xml
47 changes: 47 additions & 0 deletions tests/dataset/test_tabledoc.py
@@ -81,3 +81,50 @@ def test_as_dicts():
     ts = Triplestore(backend="rdflib")
     td.save(ts)
     print(ts.serialize())
+
+
+# if True:
+def test_csv():
+    """Test parsing a csv file."""
+    from dataset_paths import indir, outdir  # pylint: disable=import-error
+
+    pytest.importorskip("rdflib")
+
+    from tripper import Triplestore
+    from tripper.dataset import TableDoc
+
+    # Read csv file
+    td = TableDoc.parse_csv(
+        indir / "semdata.csv",
+        delimiter=";",
+        prefixes={
+            "sem": "https://w3id.com/emmo/domain/sem/0.1#",
+            "semdata": "https://he-matchmaker.eu/data/sem/",
+            "sample": "https://he-matchmaker.eu/sample/",
+            "mat": "https://he-matchmaker.eu/material/",
+            "dm": "http://onto-ns.com/meta/characterisation/0.1/SEMImage#",
+            "parser": "http://sintef.no/dlite/parser#",
+            "gen": "http://sintef.no/dlite/generator#",
+        },
+    )
+
+    # pylint: disable=unused-variable,unbalanced-tuple-unpacking
+    img, series, batch, sample = td.asdicts()
+
+    assert img["@id"] == (
+        "https://he-matchmaker.eu/data/sem/SEM_cement_batch2/"
+        "77600-23-001/77600-23-001_5kV_400x_m001"
+    )
+    assert img.distribution.downloadURL == (
+        "https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/"
+        "tests/input/77600-23-001_5kV_400x_m001.tif"
+    )
+
+    # Write the table to a new csv file
+    td.write_csv(outdir / "semdata.csv")
+
+    # Print serialised KB
+    ts = Triplestore(backend="rdflib")
+    td.save(ts)
+    ts.serialize(outdir / "semdata.ttl")
+    print(ts.serialize())
8 changes: 4 additions & 4 deletions tests/input/semdata.csv
@@ -1,5 +1,5 @@
 @id;@type;title;description;creator;contactPoint;inSeries;datamodel;datamodelStorage;distribution.downloadURL;distribution.mediaType;distribution.parser;fromSample;isDescriptionOf
-semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 �m diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1
-semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM image of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 �m diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;;
-semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;;
-mple:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;;
+semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1
+semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM image of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;;
+semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;...;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;;
+sample:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;;
1 change: 1 addition & 0 deletions tests/output/.gitignore
@@ -3,3 +3,4 @@
 *.ttl
 *.png
 *.tiff
+*.csv
88 changes: 86 additions & 2 deletions tripper/dataset/tabledoc.py
@@ -1,5 +1,7 @@
 """Basic interface for tabular documentation of datasets."""
 
+import csv
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 from tripper import Triplestore
@@ -26,6 +28,7 @@ class TableDoc:
         prefixes: Dict with prefixes in addition to those included in the
             JSON-LD context. Should map namespace prefixes to IRIs.
         context: Dict with user-defined JSON-LD context.
+        strip: Whether to strip leading and trailing whitespace from cells.
     """
 
@@ -38,12 +41,14 @@ def __init__(
         type: "Optional[str]" = "dataset",
         prefixes: "Optional[dict]" = None,
         context: "Optional[Union[dict, list]]" = None,
+        strip: bool = True,
    ):
         self.header = header
         self.data = data
         self.type = type
         self.prefixes = prefixes
         self.context = context
+        self.strip = strip
 
     def asdicts(self) -> "List[dict]":
         """Return the table as a list of dicts."""
@@ -53,9 +58,11 @@ def asdicts(self) -> "List[dict]":
         for row in self.data:
             d = AttrDict()
             for i, colname in enumerate(self.header):
-                cell = row[i]
+                cell = row[i].strip() if row[i] and self.strip else row[i]
                 if cell:
-                    addnested(d, colname, cell)
+                    addnested(
+                        d, colname.strip() if self.strip else colname, cell
+                    )
             jsonld = as_jsonld(
                 d, type=self.type, prefixes=self.prefixes, **kw  # type: ignore
             )
@@ -66,3 +73,80 @@ def save(self, ts: Triplestore) -> None:
         """Save tabular data documentation to triplestore."""
         for d in self.asdicts():
             save_dict(ts, d)
+
+    @staticmethod
+    def parse_csv(
+        csvfile: "Union[Path, str]",
+        type: "Optional[str]" = "dataset",
+        prefixes: "Optional[dict]" = None,
+        context: "Optional[Union[dict, list]]" = None,
+        encoding: str = "utf-8",
+        dialect: "Union[csv.Dialect, str]" = "excel",
+        **kwargs,
+    ) -> "TableDoc":
+        # pylint: disable=line-too-long
+        """Parse a csv file using the standard library csv module.
+
+        Arguments:
+            csvfile: CSV file to parse.
+            type: Type of data to save (applies to all rows). Should
+                either be one of the pre-defined names: "dataset",
+                "distribution", "accessService", "parser" and "generator"
+                or an IRI to a class in an ontology. Defaults to
+                "dataset".
+            prefixes: Dict with prefixes in addition to those included in the
+                JSON-LD context. Should map namespace prefixes to IRIs.
+            context: Dict with user-defined JSON-LD context.
+            encoding: The encoding of the csv file. Note that Excel may
+                encode as "ISO-8859" (commonly used in the 1990s).
+            dialect: A subclass of csv.Dialect, or the name of the dialect,
+                specifying how the `csvfile` is formatted. For more details,
+                see [Dialects and Formatting Parameters].
+            kwargs: Additional keyword arguments overriding individual
+                formatting parameters. For more details, see
+                [Dialects and Formatting Parameters].
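+
+        Example:
+            A minimal, illustrative sketch (the file name and the
+            `delimiter` keyword below are assumptions for illustration
+            only):
+
+                td = TableDoc.parse_csv("semdata.csv", delimiter=";")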
+
+        References:
+            [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
+        """
+        with open(csvfile, mode="rt", encoding=encoding) as f:
+            reader = csv.reader(f, dialect=dialect, **kwargs)
+            header = next(reader)
+            data = list(reader)
+
+        return TableDoc(
+            header=header,
+            data=data,
+            type=type,
+            prefixes=prefixes,
+            context=context,
+        )
+
+    def write_csv(
+        self,
+        csvfile: "Union[Path, str]",
+        encoding: str = "utf-8",
+        dialect: "Union[csv.Dialect, str]" = "excel",
+        **kwargs,
+    ) -> None:
+        # pylint: disable=line-too-long
+        """Write the table to a csv file using the standard library csv module.
+
+        Arguments:
+            csvfile: CSV file to write to.
+            encoding: The encoding of the csv file.
+            dialect: A subclass of csv.Dialect, or the name of the dialect,
+                specifying how the `csvfile` is formatted. For more details,
+                see [Dialects and Formatting Parameters].
+            kwargs: Additional keyword arguments overriding individual
+                formatting parameters. For more details, see
+                [Dialects and Formatting Parameters].
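+
+        Example:
+            A minimal, illustrative sketch (the output file name is an
+            assumption for illustration only):
+
+                td.write_csv("semdata_copy.csv", delimiter=";")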
+
+        References:
+            [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
+        """
+        # newline="" lets the csv module handle line endings itself
+        with open(csvfile, mode="wt", encoding=encoding, newline="") as f:
+            writer = csv.writer(f, dialect=dialect, **kwargs)
+            writer.writerow(self.header)
+            for row in self.data:
+                writer.writerow(row)
