diff --git a/mlem/contrib/dvc.py b/mlem/contrib/dvc.py index 4214ea9e..de54b5c0 100644 --- a/mlem/contrib/dvc.py +++ b/mlem/contrib/dvc.py @@ -8,7 +8,12 @@ from fsspec.implementations.github import GithubFileSystem from fsspec.implementations.local import LocalFileSystem -from mlem.core.artifacts import LocalArtifact, LocalStorage, Storage +from mlem.core.artifacts import ( + LocalArtifact, + LocalStorage, + Storage, + get_local_file_info, +) from mlem.core.meta_io import get_fs BATCH_SIZE = 10 ** 5 @@ -37,12 +42,18 @@ class DVCStorage(LocalStorage): uri: str = "" def upload(self, local_path: str, target_path: str) -> "DVCArtifact": - return DVCArtifact(uri=super().upload(local_path, target_path).uri) + return DVCArtifact( + uri=super().upload(local_path, target_path).uri, + **get_local_file_info(local_path), + ) @contextlib.contextmanager def open(self, path) -> Iterator[Tuple[IO, "DVCArtifact"]]: - with super().open(path) as (io, _): - yield io, DVCArtifact(uri=path) + with super().open(path) as (io, art): + dvc_art = DVCArtifact(uri=path, size=-1, hash="") + yield io, dvc_art + dvc_art.size = art.size + dvc_art.hash = art.hash def relative(self, fs: AbstractFileSystem, path: str) -> Storage: storage = super().relative(fs, path) @@ -65,7 +76,7 @@ def _download(self, target_path: str) -> LocalArtifact: while batch: fout.write(batch) batch = fin.read(BATCH_SIZE) - return LocalArtifact(uri=target_path) + return LocalArtifact(uri=target_path, size=self.size, hash=self.hash) @contextlib.contextmanager def open(self) -> Iterator[IO]: @@ -96,4 +107,4 @@ def open(self) -> Iterator[IO]: def relative(self, fs: AbstractFileSystem, path: str) -> "DVCArtifact": relative = super().relative(fs, path) - return DVCArtifact(uri=relative.uri) # pylint: disable=no-member + return DVCArtifact(uri=relative.uri, size=self.size, hash=self.hash) diff --git a/mlem/core/artifacts.py b/mlem/core/artifacts.py index cf25b332..11b633b5 100644 --- a/mlem/core/artifacts.py +++ b/mlem/core/artifacts.py @@ -3,6 +3,7 @@ such as model binaries or .csv files """ import contextlib +import hashlib import os import posixpath import tempfile @@ -22,17 +23,26 @@ import fsspec from fsspec import AbstractFileSystem from fsspec.implementations.local import LocalFileSystem -from typing_extensions import Literal +from typing_extensions import Literal, TypedDict from mlem.core.base import MlemObject from mlem.core.meta_io import get_fs, get_path_by_fs_path +CHUNK_SIZE = 2 ** 20 # 1 mb + + +class ArtifactInfo(TypedDict): + size: int + hash: str + class Artifact(MlemObject, ABC): __type_root__ = True __default_type__: ClassVar = "local" abs_name: ClassVar = "artifact" uri: str + size: int + hash: str @overload def materialize( @@ -60,7 +70,11 @@ def materialize( target_path, posixpath.basename(tmp.uri) ) target_fs.upload(tmp.uri, target_path) - return FSSpecArtifact(uri=get_path_by_fs_path(target_fs, target_path)) + return FSSpecArtifact( + uri=get_path_by_fs_path(target_fs, target_path), + size=self.size, + hash=self.hash, + ) @abstractmethod def _download(self, target_path: str) -> "LocalArtifact": @@ -90,7 +104,7 @@ def _download(self, target_path: str) -> "LocalArtifact": if os.path.isdir(target_path): target_path = posixpath.join(target_path, posixpath.basename(path)) fs.download(path, target_path) - return LocalArtifact(uri=target_path) + return LocalArtifact(uri=target_path, size=self.size, hash=self.hash) @contextlib.contextmanager def open(self) -> Iterator[IO]: @@ -143,15 +157,21 @@ def upload(self, local_path: str, target_path: str) -> FSSpecArtifact: path = posixpath.join(self.base_path, target_path) fs.makedirs(posixpath.dirname(path), exist_ok=True) fs.upload(local_path, path) - return FSSpecArtifact(uri=self.create_uri(target_path)) + return FSSpecArtifact( + uri=self.create_uri(target_path), **get_local_file_info(local_path) + ) @contextlib.contextmanager def open(self, path) -> Iterator[Tuple[IO, FSSpecArtifact]]: fs = self.get_fs() fullpath = posixpath.join(self.base_path, path) fs.makedirs(posixpath.dirname(fullpath), exist_ok=True) + art = FSSpecArtifact(uri=(self.create_uri(path)), size=-1, hash="") with fs.open(fullpath, "wb") as f: - yield f, FSSpecArtifact(uri=(self.create_uri(path))) + yield f, art + file_info = get_file_info(fullpath, fs) + art.size = file_info["size"] + art.hash = file_info["hash"] def relative( self, @@ -205,12 +225,17 @@ def relative(self, fs: AbstractFileSystem, path: str) -> "Storage": def upload(self, local_path: str, target_path: str) -> "LocalArtifact": super().upload(local_path, target_path) - return LocalArtifact(uri=target_path) + return LocalArtifact( + uri=target_path, **get_local_file_info(local_path) + ) @contextlib.contextmanager def open(self, path) -> Iterator[Tuple[IO, "LocalArtifact"]]: - with super().open(path) as (io, _): - yield io, LocalArtifact(uri=path) + with super().open(path) as (io, art): + local_art = LocalArtifact(uri=path, size=-1, hash="") + yield io, local_art + local_art.size = art.size + local_art.hash = art.hash class LocalArtifact(FSSpecArtifact): @@ -219,13 +244,37 @@ class LocalArtifact(FSSpecArtifact): def relative(self, fs: AbstractFileSystem, path: str) -> "FSSpecArtifact": if isinstance(fs, LocalFileSystem): - return LocalArtifact(uri=posixpath.join(path, self.uri)) + return LocalArtifact( + uri=posixpath.join(path, self.uri), + size=self.size, + hash=self.hash, + ) return FSSpecArtifact( - uri=get_path_by_fs_path(fs, posixpath.join(path, self.uri)) + uri=get_path_by_fs_path(fs, posixpath.join(path, self.uri)), + size=self.size, + hash=self.hash, ) +def md5_fileobj(fobj): + hash_md5 = hashlib.md5() + for chunk in iter(lambda: fobj.read(CHUNK_SIZE), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + +def get_file_info(path: str, fs: AbstractFileSystem) -> ArtifactInfo: + info = fs.info(path) + with fs.open(path) as fobj: + hash_value = md5_fileobj(fobj) + return {"size": info["size"], "hash": hash_value} + + +def get_local_file_info(path: str): + return get_file_info(path, LocalFileSystem()) + + LOCAL_STORAGE = LocalStorage(uri="") Artifacts = List[Artifact] diff --git a/mlem/core/objects.py b/mlem/core/objects.py index d2dd7ec3..f8fe3ef7 100644 --- a/mlem/core/objects.py +++ b/mlem/core/objects.py @@ -443,7 +443,9 @@ def clone( ) if isinstance(download, FSSpecArtifact): download = LocalArtifact( - uri=posixpath.relpath(download.uri, make_posix(path)) + uri=posixpath.relpath(download.uri, make_posix(path)), + size=download.size, + hash=download.hash, ) new.artifacts.append(download) new._write_meta(location, link) # pylint: disable=protected-access diff --git a/mlem/utils/module.py b/mlem/utils/module.py index 4cd5cb0b..8f0ec46b 100644 --- a/mlem/utils/module.py +++ b/mlem/utils/module.py @@ -155,7 +155,8 @@ class ISortModuleFinder: def __init__(self): config = default.copy() config["known_first_party"].append("mlem") - config["known_third_party"].append("xgboost") + # TODO: https://github.com/iterative/mlem/issues/45 + config["known_third_party"].extend(["xgboost", "lightgbm", "catboost"]) config["known_standard_library"].extend( [ "opcode", diff --git a/tests/core/custom_requirements/test_remote_custom_model.py b/tests/core/custom_requirements/test_remote_custom_model.py index 7cd4287a..4a83058d 100644 --- a/tests/core/custom_requirements/test_remote_custom_model.py +++ b/tests/core/custom_requirements/test_remote_custom_model.py @@ -1,9 +1,10 @@ import posixpath from mlem.core.metadata import load_meta -from tests.conftest import MLEM_TEST_REPO, need_test_repo_auth +from tests.conftest import MLEM_TEST_REPO, long, need_test_repo_auth +@long @need_test_repo_auth def test_remote_custom_model(current_test_branch): model_meta = load_meta( diff --git a/tests/core/test_artifacts.py b/tests/core/test_artifacts.py index fb9f9754..445f6394 100644 --- a/tests/core/test_artifacts.py +++ b/tests/core/test_artifacts.py @@ -20,6 +20,8 @@ def test_fsspec_backend_s3_upload(tmpdir, s3_tmp_path, s3_storage): resource = resource_path(__file__, "file.txt") artifact = s3_storage.upload(resource, target) assert isinstance(artifact, FSSpecArtifact) + assert artifact.hash != "" + assert artifact.size > 0 local_target = str(tmpdir / "file.txt") artifact.materialize(local_target) with open(local_target, "r", encoding="utf8") as actual, open( @@ -35,7 +37,8 @@ def test_fsspec_backend_s3_open(s3_tmp_path, s3_storage): with s3_storage.open(target) as (f, artifact): f.write(b"a") assert isinstance(artifact, FSSpecArtifact) - + assert artifact.hash != "" + assert artifact.size > 0 with artifact.open() as f: assert f.read() == b"a" @@ -68,10 +71,14 @@ def test_local_storage_relative(tmpdir): with rstorage.open("file2") as (f, open_art): f.write(b"1") assert isinstance(open_art, LocalArtifact) + assert open_art.hash != "" + assert open_art.size > 0 assert open_art.uri == "file2" assert os.path.isfile(os.path.join(tmpdir, "subdir", open_art.uri)) upload_art = rstorage.upload(__file__, "file") assert isinstance(upload_art, LocalArtifact) assert upload_art.uri == "file" + assert upload_art.hash != "" + assert upload_art.size > 0 assert os.path.isfile(os.path.join(tmpdir, "subdir", upload_art.uri)) diff --git a/tests/core/test_objects.py b/tests/core/test_objects.py index 7706badd..17033674 100644 --- a/tests/core/test_objects.py +++ b/tests/core/test_objects.py @@ -169,6 +169,8 @@ def _check_cloned_model(cloned_model_meta: MlemMeta, path, fs=None): assert len(cloned_model_meta.artifacts) == 1 art = cloned_model_meta.artifacts[0] assert isinstance(art, LocalArtifact) + assert art.hash != "" + assert art.size > 0 assert art.uri == posixpath.join(ART_DIR, "data.pkl") assert not os.path.isabs(art.uri) assert fs.isfile(posixpath.join(path, art.uri))