Skip to content
This repository was archived by the owner on Sep 13, 2023. It is now read-only.

Commit

Permalink
art sizes and hashes (#131)
Browse files Browse the repository at this point in the history
closes #90
  • Loading branch information
mike0sv authored Dec 10, 2021
1 parent 2103fa0 commit 0070e31
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 20 deletions.
23 changes: 17 additions & 6 deletions mlem/contrib/dvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
from fsspec.implementations.github import GithubFileSystem
from fsspec.implementations.local import LocalFileSystem

from mlem.core.artifacts import LocalArtifact, LocalStorage, Storage
from mlem.core.artifacts import (
LocalArtifact,
LocalStorage,
Storage,
get_local_file_info,
)
from mlem.core.meta_io import get_fs

BATCH_SIZE = 10 ** 5
Expand Down Expand Up @@ -37,12 +42,18 @@ class DVCStorage(LocalStorage):
uri: str = ""

def upload(self, local_path: str, target_path: str) -> "DVCArtifact":
return DVCArtifact(uri=super().upload(local_path, target_path).uri)
return DVCArtifact(
uri=super().upload(local_path, target_path).uri,
**get_local_file_info(local_path),
)

@contextlib.contextmanager
def open(self, path) -> Iterator[Tuple[IO, "DVCArtifact"]]:
with super().open(path) as (io, _):
yield io, DVCArtifact(uri=path)
with super().open(path) as (io, art):
dvc_art = DVCArtifact(uri=path, size=-1, hash="")
yield io, dvc_art
dvc_art.size = art.size
dvc_art.hash = art.hash

def relative(self, fs: AbstractFileSystem, path: str) -> Storage:
storage = super().relative(fs, path)
Expand All @@ -65,7 +76,7 @@ def _download(self, target_path: str) -> LocalArtifact:
while batch:
fout.write(batch)
batch = fin.read(BATCH_SIZE)
return LocalArtifact(uri=target_path)
return LocalArtifact(uri=target_path, size=self.size, hash=self.hash)

@contextlib.contextmanager
def open(self) -> Iterator[IO]:
Expand Down Expand Up @@ -96,4 +107,4 @@ def open(self) -> Iterator[IO]:

def relative(self, fs: AbstractFileSystem, path: str) -> "DVCArtifact":
relative = super().relative(fs, path)
return DVCArtifact(uri=relative.uri) # pylint: disable=no-member
return DVCArtifact(uri=relative.uri, size=self.size, hash=self.hash)
69 changes: 59 additions & 10 deletions mlem/core/artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
such as model binaries or .csv files
"""
import contextlib
import hashlib
import os
import posixpath
import tempfile
Expand All @@ -22,17 +23,26 @@
import fsspec
from fsspec import AbstractFileSystem
from fsspec.implementations.local import LocalFileSystem
from typing_extensions import Literal
from typing_extensions import Literal, TypedDict

from mlem.core.base import MlemObject
from mlem.core.meta_io import get_fs, get_path_by_fs_path

CHUNK_SIZE = 2 ** 20 # 1 mb


class ArtifactInfo(TypedDict):
size: int
hash: str


class Artifact(MlemObject, ABC):
__type_root__ = True
__default_type__: ClassVar = "local"
abs_name: ClassVar = "artifact"
uri: str
size: int
hash: str

@overload
def materialize(
Expand Down Expand Up @@ -60,7 +70,11 @@ def materialize(
target_path, posixpath.basename(tmp.uri)
)
target_fs.upload(tmp.uri, target_path)
return FSSpecArtifact(uri=get_path_by_fs_path(target_fs, target_path))
return FSSpecArtifact(
uri=get_path_by_fs_path(target_fs, target_path),
size=self.size,
hash=self.hash,
)

@abstractmethod
def _download(self, target_path: str) -> "LocalArtifact":
Expand Down Expand Up @@ -90,7 +104,7 @@ def _download(self, target_path: str) -> "LocalArtifact":
if os.path.isdir(target_path):
target_path = posixpath.join(target_path, posixpath.basename(path))
fs.download(path, target_path)
return LocalArtifact(uri=target_path)
return LocalArtifact(uri=target_path, size=self.size, hash=self.hash)

@contextlib.contextmanager
def open(self) -> Iterator[IO]:
Expand Down Expand Up @@ -143,15 +157,21 @@ def upload(self, local_path: str, target_path: str) -> FSSpecArtifact:
path = posixpath.join(self.base_path, target_path)
fs.makedirs(posixpath.dirname(path), exist_ok=True)
fs.upload(local_path, path)
return FSSpecArtifact(uri=self.create_uri(target_path))
return FSSpecArtifact(
uri=self.create_uri(target_path), **get_local_file_info(local_path)
)

@contextlib.contextmanager
def open(self, path) -> Iterator[Tuple[IO, FSSpecArtifact]]:
fs = self.get_fs()
fullpath = posixpath.join(self.base_path, path)
fs.makedirs(posixpath.dirname(fullpath), exist_ok=True)
art = FSSpecArtifact(uri=(self.create_uri(path)), size=-1, hash="")
with fs.open(fullpath, "wb") as f:
yield f, FSSpecArtifact(uri=(self.create_uri(path)))
yield f, art
file_info = get_file_info(fullpath, fs)
art.size = file_info["size"]
art.hash = file_info["hash"]

def relative(
self,
Expand Down Expand Up @@ -205,12 +225,17 @@ def relative(self, fs: AbstractFileSystem, path: str) -> "Storage":

def upload(self, local_path: str, target_path: str) -> "LocalArtifact":
super().upload(local_path, target_path)
return LocalArtifact(uri=target_path)
return LocalArtifact(
uri=target_path, **get_local_file_info(local_path)
)

@contextlib.contextmanager
def open(self, path) -> Iterator[Tuple[IO, "LocalArtifact"]]:
with super().open(path) as (io, _):
yield io, LocalArtifact(uri=path)
with super().open(path) as (io, art):
local_art = LocalArtifact(uri=path, size=-1, hash="")
yield io, local_art
local_art.size = art.size
local_art.hash = art.hash


class LocalArtifact(FSSpecArtifact):
Expand All @@ -219,13 +244,37 @@ class LocalArtifact(FSSpecArtifact):
def relative(self, fs: AbstractFileSystem, path: str) -> "FSSpecArtifact":

if isinstance(fs, LocalFileSystem):
return LocalArtifact(uri=posixpath.join(path, self.uri))
return LocalArtifact(
uri=posixpath.join(path, self.uri),
size=self.size,
hash=self.hash,
)

return FSSpecArtifact(
uri=get_path_by_fs_path(fs, posixpath.join(path, self.uri))
uri=get_path_by_fs_path(fs, posixpath.join(path, self.uri)),
size=self.size,
hash=self.hash,
)


def md5_fileobj(fobj):
hash_md5 = hashlib.md5()
for chunk in iter(lambda: fobj.read(CHUNK_SIZE), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()


def get_file_info(path: str, fs: AbstractFileSystem) -> ArtifactInfo:
info = fs.info(path)
with fs.open(path) as fobj:
hash_value = md5_fileobj(fobj)
return {"size": info["size"], "hash": hash_value}


def get_local_file_info(path: str):
return get_file_info(path, LocalFileSystem())


LOCAL_STORAGE = LocalStorage(uri="")

Artifacts = List[Artifact]
4 changes: 3 additions & 1 deletion mlem/core/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,9 @@ def clone(
)
if isinstance(download, FSSpecArtifact):
download = LocalArtifact(
uri=posixpath.relpath(download.uri, make_posix(path))
uri=posixpath.relpath(download.uri, make_posix(path)),
size=download.size,
hash=download.hash,
)
new.artifacts.append(download)
new._write_meta(location, link) # pylint: disable=protected-access
Expand Down
3 changes: 2 additions & 1 deletion mlem/utils/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@ class ISortModuleFinder:
def __init__(self):
config = default.copy()
config["known_first_party"].append("mlem")
config["known_third_party"].append("xgboost")
# TODO: https://github.com/iterative/mlem/issues/45
config["known_third_party"].extend(["xgboost", "lightgbm", "catboost"])
config["known_standard_library"].extend(
[
"opcode",
Expand Down
3 changes: 2 additions & 1 deletion tests/core/custom_requirements/test_remote_custom_model.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import posixpath

from mlem.core.metadata import load_meta
from tests.conftest import MLEM_TEST_REPO, need_test_repo_auth
from tests.conftest import MLEM_TEST_REPO, long, need_test_repo_auth


@long
@need_test_repo_auth
def test_remote_custom_model(current_test_branch):
model_meta = load_meta(
Expand Down
9 changes: 8 additions & 1 deletion tests/core/test_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ def test_fsspec_backend_s3_upload(tmpdir, s3_tmp_path, s3_storage):
resource = resource_path(__file__, "file.txt")
artifact = s3_storage.upload(resource, target)
assert isinstance(artifact, FSSpecArtifact)
assert artifact.hash != ""
assert artifact.size > 0
local_target = str(tmpdir / "file.txt")
artifact.materialize(local_target)
with open(local_target, "r", encoding="utf8") as actual, open(
Expand All @@ -35,7 +37,8 @@ def test_fsspec_backend_s3_open(s3_tmp_path, s3_storage):
with s3_storage.open(target) as (f, artifact):
f.write(b"a")
assert isinstance(artifact, FSSpecArtifact)

assert artifact.hash != ""
assert artifact.size > 0
with artifact.open() as f:
assert f.read() == b"a"

Expand Down Expand Up @@ -68,10 +71,14 @@ def test_local_storage_relative(tmpdir):
with rstorage.open("file2") as (f, open_art):
f.write(b"1")
assert isinstance(open_art, LocalArtifact)
assert open_art.hash != ""
assert open_art.size > 0
assert open_art.uri == "file2"
assert os.path.isfile(os.path.join(tmpdir, "subdir", open_art.uri))

upload_art = rstorage.upload(__file__, "file")
assert isinstance(upload_art, LocalArtifact)
assert upload_art.uri == "file"
assert upload_art.hash != ""
assert upload_art.size > 0
assert os.path.isfile(os.path.join(tmpdir, "subdir", upload_art.uri))
2 changes: 2 additions & 0 deletions tests/core/test_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ def _check_cloned_model(cloned_model_meta: MlemMeta, path, fs=None):
assert len(cloned_model_meta.artifacts) == 1
art = cloned_model_meta.artifacts[0]
assert isinstance(art, LocalArtifact)
assert art.hash != ""
assert art.size > 0
assert art.uri == posixpath.join(ART_DIR, "data.pkl")
assert not os.path.isabs(art.uri)
assert fs.isfile(posixpath.join(path, art.uri))
Expand Down

0 comments on commit 0070e31

Please # to comment.