Skip to content

Commit

Permalink
Save pandas.DataFrame as Apache Parquet format in OMMX Artifact (#60)
Browse files Browse the repository at this point in the history
- Annotations for NumPy layers are now stored with the `org.ommx.user.` prefix.
For example, `title` is stored as `org.ommx.user.title`. The new
`Descriptor.user_annotations` property returns only these annotations, with the prefix stripped.
- Add `ArtifactBuilder.add_dataframe`, which uses the same annotation mechanism as above
  • Loading branch information
termoshtt authored Jun 10, 2024
1 parent 38845fd commit 1870608
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 15 deletions.
24 changes: 14 additions & 10 deletions ARTIFACT.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,21 @@ OCI Artifact is represented as an [OCI Image manifest](https://github.com/openco
OMMX Artifact is a collection of `config`, `layers`, and annotations.

- `config` is a JSON blob with the following media types:
- `application/org.ommx.v1.config+json`
- `application/org.ommx.v1.config+json`
- `layers` consists of the following blobs:
- `application/org.ommx.v1.solution` blob with the following annotations:
- `org.ommx.v1.solution.instance`: (digest) The corresponding instance of the solution
- `org.ommx.v1.solution.solver`: (JSON) The solver information which generated the solution as a JSON
- `org.ommx.v1.solution.parameters`: (JSON) Solver parameters used to generate the solution as a JSON
- `org.ommx.v1.solution.start`: (RFC3339) The start time of the solution as a RFC3339 string
- `org.ommx.v1.solution.end`: (RFC3339) The end time of the solution as a RFC3339 string
- `application/org.ommx.v1.instance` blob with the following annotations:
- `org.ommx.v1.instance.title`: (Free string) The title of this instance
- `org.ommx.v1.instance.created`: (RFC3339) When this instance was created
- `application/org.ommx.v1.solution` blob with the following annotations:
- `org.ommx.v1.solution.instance`: (digest) The corresponding instance of the solution
- `org.ommx.v1.solution.solver`: (JSON) The solver information which generated the solution as a JSON
- `org.ommx.v1.solution.parameters`: (JSON) Solver parameters used to generate the solution as a JSON
- `org.ommx.v1.solution.start`: (RFC3339) The start time of the solution as a RFC3339 string
- `org.ommx.v1.solution.end`: (RFC3339) The end time of the solution as a RFC3339 string
- `application/org.ommx.v1.instance` blob with the following annotations:
- `org.ommx.v1.instance.title`: (Free string) The title of this instance
- `org.ommx.v1.instance.created`: (RFC3339) When this instance was created
- `application/vnd.numpy`: NumPy's ndarray with NPY format
- `application/vnd.apache.parquet`: DataFrame with Parquet format
- And other blobs with appropriate media types. The media type SHOULD be registered in the [IANA media type registry](https://www.iana.org/assignments/media-types/media-types.xhtml).
- Users can add arbitrary annotations to any layer. The `org.ommx.user.` prefix is reserved for user-defined annotations.
- Annotations in manifest:
- TBA

Expand Down
2 changes: 2 additions & 0 deletions python/ommx/_ommx_rust.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ class Descriptor:
@property
def annotations(self) -> dict[str, str]: ...
@property
def user_annotations(self) -> dict[str, str]: ...
@property
def media_type(self) -> str: ...
def __str__(self) -> str: ...
def to_dict(self) -> dict[str, str | int | dict[str, str]]: ...
Expand Down
58 changes: 53 additions & 5 deletions python/ommx/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import io
import json
import pandas
import numpy
from dataclasses import dataclass
from pathlib import Path
Expand Down Expand Up @@ -196,6 +197,14 @@ def get_ndarray(self, descriptor: Descriptor) -> numpy.ndarray:
f = io.BytesIO(blob)
return numpy.load(f)

def get_dataframe(self, descriptor: Descriptor) -> pandas.DataFrame:
    """
    Decode a Parquet layer of the artifact into a pandas DataFrame.

    The descriptor must carry the `application/vnd.apache.parquet` media type
    (checked with an assertion); such layers are produced by
    :py:meth:`ArtifactBuilder.add_dataframe`.
    """
    assert descriptor.media_type == "application/vnd.apache.parquet"
    # Wrap the raw layer bytes in a file-like object, since the blob is held in memory.
    buffer = io.BytesIO(self.get_blob(descriptor))
    return pandas.read_parquet(buffer)


@dataclass(frozen=True)
class ArtifactBuilder:
Expand Down Expand Up @@ -369,9 +378,7 @@ def add_solution(self, solution: Solution) -> Descriptor:
annotations["org.ommx.v1.solution.end"] = solution.end.isoformat()
return self.add_layer("application/org.ommx.v1.solution", blob, annotations)

def add_ndarray(
self, array: numpy.ndarray, annotations: dict[str, str] = {}
) -> Descriptor:
def add_ndarray(self, array: numpy.ndarray, /, **annotations: str) -> Descriptor:
"""
Add a numpy ndarray to the artifact with npy format
Expand All @@ -381,16 +388,22 @@ def add_ndarray(
>>> import numpy as np
>>> array = np.array([1, 2, 3])
Store the array in the artifact with `application/vnd.numpy` media type
Store the array in the artifact with `application/vnd.numpy` media type. We can also add annotations to the layer.
>>> import uuid
>>> builder = ArtifactBuilder.new_archive_unnamed(f"data/test_array.ommx.{uuid.uuid4()}")
>>> _desc = builder.add_ndarray(array)
>>> _desc = builder.add_ndarray(array, title="test_array")
>>> artifact = builder.build()
The `title` annotation is stored as `org.ommx.user.title` in the artifact, which can be accessed by :py:attr:`Descriptor.annotations` or :py:attr:`Descriptor.user_annotations`.
>>> layer = artifact.layers[0]
>>> print(layer.media_type)
application/vnd.numpy
>>> print(layer.annotations)
{'org.ommx.user.title': 'test_array'}
>>> print(layer.user_annotations)
{'title': 'test_array'}
Load the array from the artifact by :py:meth:`Artifact.get_ndarray`
Expand All @@ -402,8 +415,43 @@ def add_ndarray(
f = io.BytesIO()
numpy.save(f, array)
blob = f.getvalue()
annotations = {"org.ommx.user." + k: v for k, v in annotations.items()}
return self.add_layer("application/vnd.numpy", blob, annotations)

def add_dataframe(self, df: pandas.DataFrame, /, **annotations: str) -> Descriptor:
    """
    Store a pandas DataFrame as a Parquet-encoded layer of the artifact.

    The layer is added with the `application/vnd.apache.parquet` media type.
    Every keyword argument is recorded as a layer annotation whose key is
    prefixed with `org.ommx.user.`.

    Example
    ========

    >>> import pandas as pd
    >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    Store the DataFrame in the artifact with `application/vnd.apache.parquet` media type.

    >>> import uuid
    >>> builder = ArtifactBuilder.new_archive_unnamed(f"data/test_dataframe.ommx.{uuid.uuid4()}")
    >>> _desc = builder.add_dataframe(df, title="test_dataframe")
    >>> artifact = builder.build()

    The `title` annotation is stored as `org.ommx.user.title` in the artifact, which can be
    accessed by :py:attr:`Descriptor.annotations` or :py:attr:`Descriptor.user_annotations`.

    >>> layer = artifact.layers[0]
    >>> print(layer.media_type)
    application/vnd.apache.parquet
    >>> print(layer.annotations)
    {'org.ommx.user.title': 'test_dataframe'}
    >>> print(layer.user_annotations)
    {'title': 'test_dataframe'}

    Load the DataFrame back from the artifact by :py:meth:`Artifact.get_dataframe`

    >>> df2 = artifact.get_dataframe(layer)
    >>> assert df.equals(df2)

    """
    # With no path argument, `to_parquet` returns the serialized bytes.
    parquet_bytes = df.to_parquet()
    # Namespace the caller-supplied keys so they are stored as user annotations.
    prefixed = {}
    for name, value in annotations.items():
        prefixed["org.ommx.user." + name] = value
    return self.add_layer("application/vnd.apache.parquet", parquet_bytes, prefixed)

def add_layer(
self, media_type: str, blob: bytes, annotations: dict[str, str] = {}
) -> Descriptor:
Expand Down
1 change: 1 addition & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ requires-python = ">=3.8"
dependencies = [
"numpy>=1.23.0, <2.0.0",
"pandas>=2.0.0, <3.0.0",
"pyarrow>=16.0.0, <17.0.0",
"protobuf>=5.26.1, <6.0.0",
"python-dateutil>=2.9.0, <3.0.0",
]
Expand Down
16 changes: 16 additions & 0 deletions python/src/descriptor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,20 @@ impl PyDescriptor {
HashMap::new()
}
}

/// Return the annotations whose key starts with "org.ommx.user.",
/// keyed by the remainder of the key after that prefix.
///
/// Annotations without the prefix are skipped. An empty map is returned
/// when the descriptor has no annotations at all.
#[getter]
pub fn user_annotations(&self) -> HashMap<String, String> {
    match self.0.annotations() {
        Some(all) => all
            .iter()
            .filter_map(|(key, value)| {
                // `strip_prefix` yields `None` for keys outside the user namespace.
                let trimmed = key.strip_prefix("org.ommx.user.")?;
                Some((trimmed.to_string(), value.clone()))
            })
            .collect(),
        None => HashMap::new(),
    }
}
}

0 comments on commit 1870608

Please # to comment.