Save pandas.DataFrame in Apache Parquet format in OMMX Artifact (#60)

- Annotation keys for NumPy layers are now prefixed with `org.ommx.user.`. For example, `title` is stored as `org.ommx.user.title`. The new `Descriptor.user_annotations` property returns only these annotations, with the prefix stripped from their keys.
- Add `ArtifactBuilder.add_dataframe`, which uses the same annotation mechanism
Jij-Inc · Jun 10, 2024 · 1870608 · 1870608
1 parent 38845fd
commit 1870608
Show file tree

Hide file tree

Showing 5 changed files with 86 additions and 15 deletions.
diff --git a/ARTIFACT.md b/ARTIFACT.md
@@ -26,17 +26,21 @@ OCI Artifact is represented as an [OCI Image manifest](https://github.com/openco
 OMMX Artifact is a collection of `config`, `layers`, and annotations.
 
 - `config` is a JSON blob with the following media types:
-    - `application/org.ommx.v1.config+json`
+  - `application/org.ommx.v1.config+json`
 - `layers` consists of the following blobs:
-    - `application/org.ommx.v1.solution` blob with the following annotations:
-        - `org.ommx.v1.solution.instance`: (digest) The corresponding instance of the solution
-        - `org.ommx.v1.solution.solver`: (JSON) The solver information which generated the solution as a JSON
-        - `org.ommx.v1.solution.parameters`: (JSON) Solver parameters used to generate the solution as a JSON
-        - `org.ommx.v1.solution.start`: (RFC3339) The start time of the solution as a RFC3339 string
-        - `org.ommx.v1.solution.end`: (RFC3339) The end time of the solution as a RFC3339 string
-    - `application/org.ommx.v1.instance` blob with the following annotations:
-        - `org.ommx.v1.instance.title`: (Free string) The title of this instance
-        - `org.ommx.v1.instance.created`: (RFC3339) When this instance was created
+  - `application/org.ommx.v1.solution` blob with the following annotations:
+    - `org.ommx.v1.solution.instance`: (digest) The corresponding instance of the solution
+    - `org.ommx.v1.solution.solver`: (JSON) The solver information which generated the solution as a JSON
+    - `org.ommx.v1.solution.parameters`: (JSON) Solver parameters used to generate the solution as a JSON
+    - `org.ommx.v1.solution.start`: (RFC3339) The start time of the solution as a RFC3339 string
+    - `org.ommx.v1.solution.end`: (RFC3339) The end time of the solution as a RFC3339 string
+  - `application/org.ommx.v1.instance` blob with the following annotations:
+    - `org.ommx.v1.instance.title`: (Free string) The title of this instance
+    - `org.ommx.v1.instance.created`: (RFC3339) When this instance was created
+  - `application/vnd.numpy`: NumPy ndarray in NPY format
+  - `application/vnd.apache.parquet`: pandas DataFrame in Apache Parquet format
+  - And other blobs with appropriate media types. The media type SHOULD be registered in the [IANA media type registry](https://www.iana.org/assignments/media-types/media-types.xhtml).
+    - Users can add arbitrary annotations to any layer. The `org.ommx.user.` prefix is reserved for user-defined annotations.
 - Annotations in manifest:
   - TBA
 

diff --git a/python/ommx/_ommx_rust.pyi b/python/ommx/_ommx_rust.pyi
@@ -8,6 +8,8 @@ class Descriptor:
     @property
     def annotations(self) -> dict[str, str]: ...
     @property
+    def user_annotations(self) -> dict[str, str]: ...
+    @property
     def media_type(self) -> str: ...
     def __str__(self) -> str: ...
     def to_dict(self) -> dict[str, str | int | dict[str, str]]: ...

diff --git a/python/ommx/artifact.py b/python/ommx/artifact.py
@@ -2,6 +2,7 @@
 
 import io
 import json
+import pandas
 import numpy
 from dataclasses import dataclass
 from pathlib import Path
@@ -196,6 +197,14 @@ def get_ndarray(self, descriptor: Descriptor) -> numpy.ndarray:
         f = io.BytesIO(blob)
         return numpy.load(f)
 
+    def get_dataframe(self, descriptor: Descriptor) -> pandas.DataFrame:
+        """
+        Get a pandas DataFrame from an artifact layer stored by :py:meth:`ArtifactBuilder.add_dataframe`
+        """
+        assert descriptor.media_type == "application/vnd.apache.parquet"
+        blob = self.get_blob(descriptor)
+        return pandas.read_parquet(io.BytesIO(blob))
+
 
 @dataclass(frozen=True)
 class ArtifactBuilder:
@@ -369,9 +378,7 @@ def add_solution(self, solution: Solution) -> Descriptor:
             annotations["org.ommx.v1.solution.end"] = solution.end.isoformat()
         return self.add_layer("application/org.ommx.v1.solution", blob, annotations)
 
-    def add_ndarray(
-        self, array: numpy.ndarray, annotations: dict[str, str] = {}
-    ) -> Descriptor:
+    def add_ndarray(self, array: numpy.ndarray, /, **annotations: str) -> Descriptor:
         """
         Add a numpy ndarray to the artifact with npy format
 
@@ -381,16 +388,22 @@ def add_ndarray(
         >>> import numpy as np
         >>> array = np.array([1, 2, 3])
 
-        Store the array in the artifact with `application/vnd.numpy` media type
+        Store the array in the artifact with `application/vnd.numpy` media type. We can also add annotations to the layer.
 
         >>> import uuid
         >>> builder = ArtifactBuilder.new_archive_unnamed(f"data/test_array.ommx.{uuid.uuid4()}")
-        >>> _desc = builder.add_ndarray(array)
+        >>> _desc = builder.add_ndarray(array, title="test_array")
         >>> artifact = builder.build()
 
+        The `title` annotation is stored as `org.ommx.user.title` in the artifact, which can be accessed by :py:attr:`Descriptor.annotations` or :py:attr:`Descriptor.user_annotations`.
+
         >>> layer = artifact.layers[0]
         >>> print(layer.media_type)
         application/vnd.numpy
+        >>> print(layer.annotations)
+        {'org.ommx.user.title': 'test_array'}
+        >>> print(layer.user_annotations)
+        {'title': 'test_array'}
 
         Load the array from the artifact by :py:meth:`Artifact.get_ndarray`
 
@@ -402,8 +415,43 @@ def add_ndarray(
         f = io.BytesIO()
         numpy.save(f, array)
         blob = f.getvalue()
+        annotations = {"org.ommx.user." + k: v for k, v in annotations.items()}
         return self.add_layer("application/vnd.numpy", blob, annotations)
 
+    def add_dataframe(self, df: pandas.DataFrame, /, **annotations: str) -> Descriptor:
+        """
+        Add a pandas DataFrame to the artifact with parquet format
+
+        Example
+        ========
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+
+        Store the DataFrame in the artifact with `application/vnd.apache.parquet` media type.
+
+        >>> import uuid
+        >>> builder = ArtifactBuilder.new_archive_unnamed(f"data/test_dataframe.ommx.{uuid.uuid4()}")
+        >>> _desc = builder.add_dataframe(df, title="test_dataframe")
+        >>> artifact = builder.build()
+
+        The `title` annotation is stored as `org.ommx.user.title` in the artifact, which can be accessed by :py:attr:`Descriptor.annotations` or :py:attr:`Descriptor.user_annotations`.
+
+        >>> layer = artifact.layers[0]
+        >>> print(layer.media_type)
+        application/vnd.apache.parquet
+        >>> print(layer.annotations)
+        {'org.ommx.user.title': 'test_dataframe'}
+        >>> print(layer.user_annotations)
+        {'title': 'test_dataframe'}
+
+        >>> df2 = artifact.get_dataframe(layer)
+        >>> assert df.equals(df2)
+
+        """
+        blob = df.to_parquet()
+        annotations = {"org.ommx.user." + k: v for k, v in annotations.items()}
+        return self.add_layer("application/vnd.apache.parquet", blob, annotations)
+
     def add_layer(
         self, media_type: str, blob: bytes, annotations: dict[str, str] = {}
     ) -> Descriptor:

diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -28,6 +28,7 @@ requires-python = ">=3.8"
 dependencies = [
     "numpy>=1.23.0, <2.0.0",
     "pandas>=2.0.0, <3.0.0",
+    "pyarrow>=16.0.0, <17.0.0",
     "protobuf>=5.26.1, <6.0.0",
     "python-dateutil>=2.9.0, <3.0.0",
 ]

diff --git a/python/src/descriptor.rs b/python/src/descriptor.rs
@@ -48,4 +48,20 @@ impl PyDescriptor {
             HashMap::new()
         }
     }
+
+    /// Return the annotations whose keys start with "org.ommx.user.", with that prefix stripped from the keys
+    #[getter]
+    pub fn user_annotations(&self) -> HashMap<String, String> {
+        if let Some(annotations) = self.0.annotations() {
+            annotations
+                .iter()
+                .flat_map(|(k, v)| {
+                    k.strip_prefix("org.ommx.user.")
+                        .map(|k| (k.to_string(), v.clone()))
+                })
+                .collect()
+        } else {
+            HashMap::new()
+        }
+    }
 }