From 1ff6d20eca0e8712ea172eaa06b2d51bc6d6a6e9 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Fri, 9 Sep 2022 16:01:18 +0200 Subject: [PATCH 1/6] add timestamp stats from files in cache scanner --- src/huggingface_hub/commands/cache.py | 6 + src/huggingface_hub/utils/_cache_manager.py | 150 ++++++++++++++++++-- tests/test_utils_cache.py | 40 +++++- 3 files changed, 183 insertions(+), 13 deletions(-) diff --git a/src/huggingface_hub/commands/cache.py b/src/huggingface_hub/commands/cache.py index 7c0f9d09a6..16dd59cd44 100644 --- a/src/huggingface_hub/commands/cache.py +++ b/src/huggingface_hub/commands/cache.py @@ -81,6 +81,8 @@ def _print_hf_cache_info_as_table(self, hf_cache_info: HFCacheInfo) -> None: repo.repo_type, "{:>12}".format(repo.size_on_disk_str), repo.nb_files, + repo.last_accessed_str, + repo.last_modified_str, ", ".join(sorted(repo.refs)), str(repo.repo_path), ] @@ -93,6 +95,8 @@ def _print_hf_cache_info_as_table(self, hf_cache_info: HFCacheInfo) -> None: "REPO TYPE", "SIZE ON DISK", "NB FILES", + "LAST_ACCESSED", + "LAST_MODIFIED", "REFS", "LOCAL PATH", ], @@ -108,6 +112,7 @@ def _print_hf_cache_info_as_table(self, hf_cache_info: HFCacheInfo) -> None: revision.commit_hash, "{:>12}".format(revision.size_on_disk_str), revision.nb_files, + revision.last_modified_str, ", ".join(sorted(revision.refs)), str(revision.snapshot_path), ] @@ -124,6 +129,7 @@ def _print_hf_cache_info_as_table(self, hf_cache_info: HFCacheInfo) -> None: "REVISION", "SIZE ON DISK", "NB FILES", + "LAST_MODIFIED", "REFS", "LOCAL PATH", ], diff --git a/src/huggingface_hub/utils/_cache_manager.py b/src/huggingface_hub/utils/_cache_manager.py index 58f15b3344..76b9cb073e 100644 --- a/src/huggingface_hub/utils/_cache_manager.py +++ b/src/huggingface_hub/utils/_cache_manager.py @@ -15,6 +15,7 @@ """Contains utilities to manage the HF cache directory.""" import os import shutil +import time from collections import defaultdict from dataclasses import dataclass from pathlib import Path @@ -48,6 
+49,19 @@ class CachedFileInfo: Path of the blob file. This is equivalent to `file_path.resolve()`. size_on_disk (`int`): Size of the blob file in bytes. + blob_last_accessed (`float`): + Timestamp of the last time the blob file has been accessed (from any + revision). + blob_last_modified (`float`): + Timestamp of the last time the blob file has been modified/created. + + + + `blob_last_accessed` and `blob_last_modified` reliability can depend on the OS you + are using. See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result) + for more details. + + """ file_name: str @@ -55,6 +69,29 @@ class CachedFileInfo: blob_path: Path size_on_disk: int + blob_last_accessed: float + blob_last_modified: float + + @property + def blob_last_accessed_str(self) -> str: + """ + (property) Timestamp of the last time the blob file has been accessed (from any + revision), returned as a human-readable string. + + Example: "2 weeks ago". + """ + return _format_ts(self.blob_last_accessed) + + @property + def blob_last_modified_str(self) -> str: + """ + (property) Timestamp of the last time the blob file has been modified, returned + as a human-readable string. + + Example: "2 weeks ago". + """ + return _format_ts(self.blob_last_modified) + @property def size_on_disk_str(self) -> str: """ @@ -80,14 +117,23 @@ class CachedRevisionInfo: snapshot_path (`Path`): Path to the revision directory in the `snapshots` folder. It contains the exact tree structure as the repo on the Hub. - size_on_disk (`int`): - Sum of the blob file sizes that are symlink-ed by the revision. files: (`FrozenSet[CachedFileInfo]`): Set of [`~CachedFileInfo`] describing all files contained in the snapshot. refs (`FrozenSet[str]`): Set of `refs` pointing to this revision. If the revision has no `refs`, it is considered detached. Example: `{"main", "2.4.0"}` or `{"refs/pr/1"}`. + size_on_disk (`int`): + Sum of the blob file sizes that are symlink-ed by the revision. 
+ last_modified (`float`): + Timestamp of the last time the revision has been created/modified. + + + + `last_accessed` cannot be determined correctly on a single revision as blob files + are shared across revisions. + + @@ -104,6 +150,18 @@ class CachedRevisionInfo: files: FrozenSet[CachedFileInfo] refs: FrozenSet[str] + last_modified: float + + @property + def last_modified_str(self) -> str: + """ + (property) Timestamp of the last time the revision has been modified, returned + as a human-readable string. + + Example: "2 weeks ago". + """ + return _format_ts(self.last_modified) + @property def size_on_disk_str(self) -> str: """ @@ -138,6 +196,10 @@ class CachedRepoInfo: Total number of blob files in the cached repo. revisions (`FrozenSet[CachedRevisionInfo]`): Set of [`~CachedRevisionInfo`] describing all revisions cached in the repo. + last_accessed (`float`): + Timestamp of the last time a blob file of the repo has been accessed. + last_modified (`float`): + Timestamp of the last time a blob file of the repo has been modified/created. @@ -145,6 +207,14 @@ class CachedRepoInfo: duplicated files. Besides, only blobs are taken into account, not the (negligible) size of folders and symlinks. + + + + + `last_accessed` and `last_modified` reliability can depend on the OS you are using. + See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result) + for more details. + """ @@ -155,6 +225,29 @@ class CachedRepoInfo: nb_files: int revisions: FrozenSet[CachedRevisionInfo] + last_accessed: float + last_modified: float + + @property + def last_accessed_str(self) -> str: + """ + (property) Last time a blob file of the repo has been accessed, returned as a + human-readable string. + + Example: "2 weeks ago". + """ + return _format_ts(self.last_accessed) + + @property + def last_modified_str(self) -> str: + """ + (property) Last time a blob file of the repo has been modified, returned as a + human-readable string. + + Example: "2 weeks ago". 
+ """ + return _format_ts(self.last_modified) + @property def size_on_disk_str(self) -> str: """ @@ -534,7 +627,7 @@ def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo: f" ({repo_path})." ) - blob_sizes: Dict[Path, int] = {} # Key is blob_path, value is blob size (in bytes) + blob_stats: Dict[Path, os.stat_result] = {} # Key is blob_path, value is blob stats snapshots_path = repo_path / "snapshots" refs_path = repo_path / "refs" @@ -602,28 +695,33 @@ def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo: f"Blob symlink points outside of blob directory: {blob_path}" ) - if blob_path not in blob_sizes: - blob_sizes[blob_path] = blob_path.stat().st_size + if blob_path not in blob_stats: + blob_stats[blob_path] = blob_path.stat() cached_files.add( CachedFileInfo( file_name=file_path.name, file_path=file_path, - size_on_disk=blob_sizes[blob_path], + size_on_disk=blob_stats[blob_path].st_size, blob_path=blob_path, + blob_last_accessed=blob_stats[blob_path].st_atime, + blob_last_modified=blob_stats[blob_path].st_mtime, ) ) cached_revisions.add( CachedRevisionInfo( commit_hash=revision_path.name, + files=frozenset(cached_files), + refs=frozenset(refs_by_hash.pop(revision_path.name, set())), size_on_disk=sum( - blob_sizes[blob_path] + blob_stats[blob_path].st_size for blob_path in set(file.blob_path for file in cached_files) ), - files=frozenset(cached_files), snapshot_path=revision_path, - refs=frozenset(refs_by_hash.pop(revision_path.name, set())), + last_modified=max( + blob_stats[file.blob_path].st_mtime for file in cached_files + ), ) ) @@ -636,12 +734,14 @@ def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo: # Build and return frozen structure return CachedRepoInfo( + nb_files=len(blob_stats), repo_id=repo_id, + repo_path=repo_path, repo_type=repo_type, # type: ignore revisions=frozenset(cached_revisions), - repo_path=repo_path, - size_on_disk=sum(blob_sizes.values()), - nb_files=len(blob_sizes), + size_on_disk=sum(stat.st_size for stat in 
blob_stats.values()), + last_accessed=max(stat.st_atime for stat in blob_stats.values()), + last_modified=max(stat.st_mtime for stat in blob_stats.values()), ) @@ -658,6 +758,32 @@ def _format_size(num: int) -> str: return f"{num_f:.1f}Y" +_TIMESINCE_CHUNKS = ( + # Label, divider, max value + ("second", 1, 60), + ("minute", 60, 60), + ("hour", 60 * 60, 24), + ("day", 60 * 60 * 24, 6), + ("week", 60 * 60 * 24 * 7, 6), + ("month", 60 * 60 * 24 * 30, 11), + ("year", 60 * 60 * 24 * 365, None), +) + + +def _format_ts(ts: float) -> str: + """Format timestamp in seconds into a human-readable string, relative to now. + + Vaguely inspired by Django's `timesince` formatter. + """ + delta = time.time() - ts + if delta < 2: + return "now" + for label, divider, max_value in _TIMESINCE_CHUNKS: + value = round(delta / divider) + if max_value is None or value <= max_value: + return f"{value} {label}{'s' if value > 1 else ''} ago" + + def _try_delete_path(path: Path, path_type: str) -> None: """Try to delete a local file or folder. 
diff --git a/tests/test_utils_cache.py b/tests/test_utils_cache.py index d1110916db..7314336f48 100644 --- a/tests/test_utils_cache.py +++ b/tests/test_utils_cache.py @@ -1,6 +1,7 @@ import os import shutil import sys +import time import unittest from io import StringIO from pathlib import Path @@ -14,7 +15,11 @@ from huggingface_hub._snapshot_download import snapshot_download from huggingface_hub.commands.cache import ScanCacheCommand from huggingface_hub.utils import DeleteCacheStrategy, HFCacheInfo, scan_cache_dir -from huggingface_hub.utils._cache_manager import _try_delete_path +from huggingface_hub.utils._cache_manager import ( + _format_size, + _format_ts, + _try_delete_path, +) from .testing_constants import TOKEN @@ -649,3 +654,36 @@ def test_delete_path_on_local_folder_with_wrong_permission(self) -> None: # For proper cleanup dir_path.chmod(509) + + +class TestStringFormatters(unittest.TestCase): + SIZES = { + 16.0: "16.0", + 1000.0: "1.0K", + 1024 * 1024 * 1024: "1.1G", # not 1.0GB + } + + SINCE = { + 1: "now", + 5: "5 seconds ago", + 80: "1 minute ago", + 1000: "17 minutes ago", + 4000: "1 hour ago", + 8000: "2 hours ago", + 3600 * 24 * 13: "2 weeks ago", + 3600 * 24 * 30 * 8.2: "8 months ago", + 3600 * 24 * 365: "1 year ago", + 3600 * 24 * 365 * 9.6: "10 years ago", + } + + def test_format_size(self) -> None: + """Test `_format_size` formatter.""" + for size, expected in self.SIZES.items(): + self.assertEqual(_format_size(size), expected) + + def test_format_ts(self) -> None: + """Test `_format_ts` formatter.""" + for ts, expected in self.SINCE.items(): + self.assertEqual( + _format_ts(time.time() - ts), expected, msg=f"Wrong formatting for {ts}" + ) From 9ffc29557224bc1c8f9a8efb5e44c913eeffd0d8 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 12 Sep 2022 10:00:04 +0200 Subject: [PATCH 2/6] fix tests with timing --- src/huggingface_hub/utils/_cache_manager.py | 4 +-- tests/test_utils_cache.py | 27 +++++++++++---------- 2 files changed, 16 
insertions(+), 15 deletions(-) diff --git a/src/huggingface_hub/utils/_cache_manager.py b/src/huggingface_hub/utils/_cache_manager.py index 76b9cb073e..00fadbf461 100644 --- a/src/huggingface_hub/utils/_cache_manager.py +++ b/src/huggingface_hub/utils/_cache_manager.py @@ -776,8 +776,8 @@ def _format_ts(ts: float) -> str: Vaguely inspired by Django's `timesince` formatter. """ delta = time.time() - ts - if delta < 2: - return "now" + if delta < 20: + return "a few seconds ago" for label, divider, max_value in _TIMESINCE_CHUNKS: value = round(delta / divider) if max_value is None or value <= max_value: diff --git a/tests/test_utils_cache.py b/tests/test_utils_cache.py index 7314336f48..cc8f2d9ec9 100644 --- a/tests/test_utils_cache.py +++ b/tests/test_utils_cache.py @@ -201,10 +201,10 @@ def test_cli_scan_cache_quiet(self) -> None: sys.stdout = previous_output expected_output = f""" - REPO ID REPO TYPE SIZE ON DISK NB FILES REFS LOCAL PATH - ----------------------------- --------- ------------ -------- --------------- ------------------------------------------------------------------------------------------------------------- - valid_org/test_scan_dataset_b dataset 2.2K 2 main {self.cache_dir}/datasets--valid_org--test_scan_dataset_b - valid_org/test_scan_repo_a model 1.4K 4 main, refs/pr/1 {self.cache_dir}/models--valid_org--test_scan_repo_a + REPO ID REPO TYPE SIZE ON DISK NB FILES LAST_ACCESSED LAST_MODIFIED REFS LOCAL PATH + ----------------------------- --------- ------------ -------- ----------------- ----------------- --------------- --------------------------------------------------------- + valid_org/test_scan_dataset_b dataset 2.2K 2 a few seconds ago a few seconds ago main {self.cache_dir}/datasets--valid_org--test_scan_dataset_b + valid_org/test_scan_repo_a model 1.4K 4 a few seconds ago a few seconds ago main, refs/pr/1 {self.cache_dir}/models--valid_org--test_scan_repo_a Done in 0.0s. Scanned 2 repo(s) for a total of \x1b[1m\x1b[31m3.5K\x1b[0m. 
""" @@ -231,12 +231,12 @@ def test_cli_scan_cache_verbose(self) -> None: sys.stdout = previous_output expected_output = f""" - REPO ID REPO TYPE REVISION SIZE ON DISK NB FILES REFS LOCAL PATH - ----------------------------- --------- ---------------------------------------- ------------ -------- --------- ---------------------------------------------------------------------------------------------------------------------------------------------------------------- - valid_org/test_scan_dataset_b dataset 1ac47c6f707cbc4825c2aa431ad5ab8cf09e60ed 2.2K 2 main {self.cache_dir}/datasets--valid_org--test_scan_dataset_b/snapshots/1ac47c6f707cbc4825c2aa431ad5ab8cf09e60ed - valid_org/test_scan_repo_a model 1da18ebd9185d146bcf84e308de53715d97d67d1 1.3K 1 {self.cache_dir}/models--valid_org--test_scan_repo_a/snapshots/1da18ebd9185d146bcf84e308de53715d97d67d1 - valid_org/test_scan_repo_a model 401874e6a9c254a8baae85edd8a073921ecbd7f5 1.4K 3 main {self.cache_dir}/models--valid_org--test_scan_repo_a/snapshots/401874e6a9c254a8baae85edd8a073921ecbd7f5 - valid_org/test_scan_repo_a model fc674b0d440d3ea6f94bc4012e33ebd1dfc11b5b 1.4K 4 refs/pr/1 {self.cache_dir}/models--valid_org--test_scan_repo_a/snapshots/fc674b0d440d3ea6f94bc4012e33ebd1dfc11b5b + REPO ID REPO TYPE REVISION SIZE ON DISK NB FILES LAST_MODIFIED REFS LOCAL PATH + ----------------------------- --------- ---------------------------------------- ------------ -------- ----------------- --------- ------------------------------------------------------------------------------------------------------------ + valid_org/test_scan_dataset_b dataset 1ac47c6f707cbc4825c2aa431ad5ab8cf09e60ed 2.2K 2 a few seconds ago main {self.cache_dir}/datasets--valid_org--test_scan_dataset_b/snapshots/1ac47c6f707cbc4825c2aa431ad5ab8cf09e60ed + valid_org/test_scan_repo_a model 1da18ebd9185d146bcf84e308de53715d97d67d1 1.3K 1 a few seconds ago {self.cache_dir}/models--valid_org--test_scan_repo_a/snapshots/1da18ebd9185d146bcf84e308de53715d97d67d1 + 
valid_org/test_scan_repo_a model 401874e6a9c254a8baae85edd8a073921ecbd7f5 1.4K 3 a few seconds ago main {self.cache_dir}/models--valid_org--test_scan_repo_a/snapshots/401874e6a9c254a8baae85edd8a073921ecbd7f5 + valid_org/test_scan_repo_a model fc674b0d440d3ea6f94bc4012e33ebd1dfc11b5b 1.4K 4 a few seconds ago refs/pr/1 {self.cache_dir}/models--valid_org--test_scan_repo_a/snapshots/fc674b0d440d3ea6f94bc4012e33ebd1dfc11b5b Done in 0.0s. Scanned 2 repo(s) for a total of \x1b[1m\x1b[31m3.5K\x1b[0m. """ @@ -660,12 +660,13 @@ class TestStringFormatters(unittest.TestCase): SIZES = { 16.0: "16.0", 1000.0: "1.0K", - 1024 * 1024 * 1024: "1.1G", # not 1.0GB + 1024 * 1024 * 1024: "1.1G", # not 1.0GiB } SINCE = { - 1: "now", - 5: "5 seconds ago", + 1: "a few seconds ago", + 15: "a few seconds ago", + 25: "25 seconds ago", 80: "1 minute ago", 1000: "17 minutes ago", 4000: "1 hour ago", From ea380f5ea683afa6ba93e14b87bea08cac6bad6e Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 12 Sep 2022 10:31:31 +0200 Subject: [PATCH 3/6] test time details in scan cache --- tests/test_utils_cache.py | 73 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/tests/test_utils_cache.py b/tests/test_utils_cache.py index cc8f2d9ec9..929eff499c 100644 --- a/tests/test_utils_cache.py +++ b/tests/test_utils_cache.py @@ -253,7 +253,7 @@ class TestCorruptedCacheUtils(unittest.TestCase): repo_path: Path def setUp(self) -> None: - """Setup a clean cache for tests that will get corrupted in tests.""" + """Setup a clean cache for tests that will get corrupted/modified in tests.""" # Download latest main snapshot_download( repo_id=VALID_MODEL_ID, @@ -352,6 +352,77 @@ def test_ref_to_missing_revision(self) -> None: + f"({self.repo_path }).", ) + def test_scan_cache_last_modified_and_last_accessed(self) -> None: + """Scan the last_modified and last_accessed properties when scanning.""" + TIME_GAP = 0.1 + + # Make a first scan + report_1 = 
scan_cache_dir(self.cache_dir) + + # Values from first report + repo_1 = list(report_1.repos)[0] + revision_1 = list(repo_1.revisions)[0] + readme_file_1 = [ + file for file in revision_1.files if file.file_name == "README.md" + ][0] + another_file_1 = [ + file for file in revision_1.files if file.file_name == "Another file.md" + ][0] + + # Comparison of last_accessed/last_modified between file and repo + self.assertLessEqual(readme_file_1.blob_last_accessed, repo_1.last_accessed) + self.assertLessEqual(readme_file_1.blob_last_modified, repo_1.last_modified) + self.assertEqual(revision_1.last_modified, repo_1.last_modified) + + # Sleep and write new readme + time.sleep(TIME_GAP) + readme_file_1.file_path.write_text("modified readme") + + # Sleep and read content from readme + time.sleep(TIME_GAP) + with readme_file_1.file_path.open("r") as f: + _ = f.read() + + # Sleep and re-scan + time.sleep(TIME_GAP) + report_2 = scan_cache_dir(self.cache_dir) + + # Values from second report + repo_2 = list(report_2.repos)[0] + revision_2 = list(repo_2.revisions)[0] + readme_file_2 = [ + file for file in revision_2.files if file.file_name == "README.md" + ][0] + another_file_2 = [ + file for file in revision_1.files if file.file_name == "Another file.md" + ][0] + + # Report 1 is not updated when cache changes + self.assertLess(repo_1.last_accessed, repo_2.last_accessed) + self.assertLess(repo_1.last_modified, repo_2.last_modified) + + # "Another_file.md" did not change + self.assertEqual(another_file_1, another_file_2) + + # Readme.md has been modified and then accessed more recently + self.assertGreaterEqual( + readme_file_2.blob_last_modified - readme_file_1.blob_last_modified, + TIME_GAP * 0.999, # 0.999 factor because not exactly precise + ) + self.assertGreaterEqual( + readme_file_2.blob_last_accessed - readme_file_1.blob_last_accessed, + 2 * TIME_GAP * 0.999, # 0.999 factor because not exactly precise + ) + self.assertGreaterEqual( + readme_file_2.blob_last_accessed - 
readme_file_2.blob_last_modified, + TIME_GAP * 0.999, # 0.999 factor because not exactly precise + ) + + # Comparison of last_accessed/last_modified between file and repo + self.assertEqual(readme_file_2.blob_last_accessed, repo_2.last_accessed) + self.assertEqual(readme_file_2.blob_last_modified, repo_2.last_modified) + self.assertEqual(revision_2.last_modified, repo_2.last_modified) + class TestDeleteRevisionsDryRun(unittest.TestCase): cache_info: Mock # Mocked HFCacheInfo From 6a3e7b6f3100b16a94036c4cbfad5a4c1b97bff0 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 12 Sep 2022 10:33:14 +0200 Subject: [PATCH 4/6] renamed to _format_timesince --- src/huggingface_hub/utils/_cache_manager.py | 12 ++++++------ tests/test_utils_cache.py | 10 ++++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/huggingface_hub/utils/_cache_manager.py b/src/huggingface_hub/utils/_cache_manager.py index 00fadbf461..a1c7d5f375 100644 --- a/src/huggingface_hub/utils/_cache_manager.py +++ b/src/huggingface_hub/utils/_cache_manager.py @@ -80,7 +80,7 @@ def blob_last_accessed_str(self) -> str: Example: "2 weeks ago". """ - return _format_ts(self.blob_last_accessed) + return _format_timesince(self.blob_last_accessed) @property def blob_last_modified_str(self) -> str: @@ -90,7 +90,7 @@ def blob_last_modified_str(self) -> str: Example: "2 weeks ago". """ - return _format_ts(self.blob_last_modified) + return _format_timesince(self.blob_last_modified) @property def size_on_disk_str(self) -> str: @@ -160,7 +160,7 @@ def last_modified_str(self) -> str: Example: "2 weeks ago". """ - return _format_ts(self.last_modified) + return _format_timesince(self.last_modified) @property def size_on_disk_str(self) -> str: @@ -236,7 +236,7 @@ def last_accessed_str(self) -> str: Example: "2 weeks ago". 
""" - return _format_ts(self.last_accessed) + return _format_timesince(self.last_accessed) @property def last_modified_str(self) -> str: @@ -246,7 +246,7 @@ def last_modified_str(self) -> str: Example: "2 weeks ago". """ - return _format_ts(self.last_modified) + return _format_timesince(self.last_modified) @property def size_on_disk_str(self) -> str: @@ -770,7 +770,7 @@ def _format_size(num: int) -> str: ) -def _format_ts(ts: float) -> str: +def _format_timesince(ts: float) -> str: """Format timestamp in seconds into a human-readable string, relative to now. Vaguely inspired by Django's `timesince` formatter. diff --git a/tests/test_utils_cache.py b/tests/test_utils_cache.py index 929eff499c..f4b15b3940 100644 --- a/tests/test_utils_cache.py +++ b/tests/test_utils_cache.py @@ -17,7 +17,7 @@ from huggingface_hub.utils import DeleteCacheStrategy, HFCacheInfo, scan_cache_dir from huggingface_hub.utils._cache_manager import ( _format_size, - _format_ts, + _format_timesince, _try_delete_path, ) @@ -753,9 +753,11 @@ def test_format_size(self) -> None: for size, expected in self.SIZES.items(): self.assertEqual(_format_size(size), expected) - def test_format_ts(self) -> None: - """Test `_format_ts` formatter.""" + def test_format_timesince(self) -> None: + """Test `_format_timesince` formatter.""" for ts, expected in self.SINCE.items(): self.assertEqual( - _format_ts(time.time() - ts), expected, msg=f"Wrong formatting for {ts}" + _format_timesince(time.time() - ts), + expected, + msg=f"Wrong formatting for {ts}", ) From e56bd00ac0e77fb17a11eec4e27f4ad4f85cb548 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 12 Sep 2022 10:45:35 +0200 Subject: [PATCH 5/6] add documentation --- docs/source/how-to-cache.mdx | 54 ++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/docs/source/how-to-cache.mdx b/docs/source/how-to-cache.mdx index a065176584..8fff32bd7c 100644 --- a/docs/source/how-to-cache.mdx +++ b/docs/source/how-to-cache.mdx 
@@ -128,14 +128,14 @@ cached. ```text ➜ huggingface-cli scan-cache -REPO ID REPO TYPE SIZE ON DISK NB FILES REFS LOCAL PATH ---------------------------- --------- ------------ -------- ------------------- ------------------------------------------------------------------------- -glue dataset 116.3K 15 2.4.0, main, 1.17.0 /Users/lucain/.cache/huggingface/hub/datasets--glue -google/fleurs dataset 64.9M 6 refs/pr/1, main /Users/lucain/.cache/huggingface/hub/datasets--google--fleurs -Jean-Baptiste/camembert-ner model 441.0M 7 main /Users/lucain/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner -bert-base-cased model 1.9G 13 main /Users/lucain/.cache/huggingface/hub/models--bert-base-cased -t5-base model 10.1K 3 main /Users/lucain/.cache/huggingface/hub/models--t5-base -t5-small model 970.7M 11 refs/pr/1, main /Users/lucain/.cache/huggingface/hub/models--t5-small +REPO ID REPO TYPE SIZE ON DISK NB FILES LAST_ACCESSED LAST_MODIFIED REFS LOCAL PATH +--------------------------- --------- ------------ -------- ------------- ------------- ------------------- ------------------------------------------------------------------------- +glue dataset 116.3K 15 4 days ago 4 days ago 2.4.0, main, 1.17.0 /home/wauplin/.cache/huggingface/hub/datasets--glue +google/fleurs dataset 64.9M 6 1 week ago 1 week ago refs/pr/1, main /home/wauplin/.cache/huggingface/hub/datasets--google--fleurs +Jean-Baptiste/camembert-ner model 441.0M 7 2 weeks ago 16 hours ago main /home/wauplin/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner +bert-base-cased model 1.9G 13 1 week ago 2 years ago /home/wauplin/.cache/huggingface/hub/models--bert-base-cased +t5-base model 10.1K 3 3 months ago 3 months ago main /home/wauplin/.cache/huggingface/hub/models--t5-base +t5-small model 970.7M 11 3 days ago 3 days ago refs/pr/1, main /home/wauplin/.cache/huggingface/hub/models--t5-small Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G. Got 1 warning(s) while scanning. Use -vvv to print details. 
@@ -150,19 +150,19 @@ usage is only 1.9G. ```text ➜ huggingface-cli scan-cache -v -REPO ID REPO TYPE REVISION SIZE ON DISK NB FILES REFS LOCAL PATH ---------------------------- --------- ---------------------------------------- ------------ -------- ----------- ---------------------------------------------------------------------------------------------------------------------------- -glue dataset 9338f7b671827df886678df2bdd7cc7b4f36dffd 97.7K 14 main, 2.4.0 /Users/lucain/.cache/huggingface/hub/datasets--glue/snapshots/9338f7b671827df886678df2bdd7cc7b4f36dffd -glue dataset f021ae41c879fcabcf823648ec685e3fead91fe7 97.8K 14 1.17.0 /Users/lucain/.cache/huggingface/hub/datasets--glue/snapshots/f021ae41c879fcabcf823648ec685e3fead91fe7 -google/fleurs dataset 129b6e96cf1967cd5d2b9b6aec75ce6cce7c89e8 25.4K 3 refs/pr/1 /Users/lucain/.cache/huggingface/hub/datasets--google--fleurs/snapshots/129b6e96cf1967cd5d2b9b6aec75ce6cce7c89e8 -google/fleurs dataset 24f85a01eb955224ca3946e70050869c56446805 64.9M 4 main /Users/lucain/.cache/huggingface/hub/datasets--google--fleurs/snapshots/24f85a01eb955224ca3946e70050869c56446805 -Jean-Baptiste/camembert-ner model dbec8489a1c44ecad9da8a9185115bccabd799fe 441.0M 7 main /Users/lucain/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner/snapshots/dbec8489a1c44ecad9da8a9185115bccabd799fe -bert-base-cased model 378aa1bda6387fd00e824948ebe3488630ad8565 1.5G 9 /Users/lucain/.cache/huggingface/hub/models--bert-base-cased/snapshots/378aa1bda6387fd00e824948ebe3488630ad8565 -bert-base-cased model a8d257ba9925ef39f3036bfc338acf5283c512d9 1.4G 9 main /Users/lucain/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9 -t5-base model 23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9 10.1K 3 main /Users/lucain/.cache/huggingface/hub/models--t5-base/snapshots/23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9 -t5-small model 98ffebbb27340ec1b1abd7c45da12c253ee1882a 726.2M 6 refs/pr/1 
/Users/lucain/.cache/huggingface/hub/models--t5-small/snapshots/98ffebbb27340ec1b1abd7c45da12c253ee1882a -t5-small model d0a119eedb3718e34c648e594394474cf95e0617 485.8M 6 /Users/lucain/.cache/huggingface/hub/models--t5-small/snapshots/d0a119eedb3718e34c648e594394474cf95e0617 -t5-small model d78aea13fa7ecd06c29e3e46195d6341255065d5 970.7M 9 main /Users/lucain/.cache/huggingface/hub/models--t5-small/snapshots/d78aea13fa7ecd06c29e3e46195d6341255065d5 +REPO ID REPO TYPE REVISION SIZE ON DISK NB FILES LAST_MODIFIED REFS LOCAL PATH +--------------------------- --------- ---------------------------------------- ------------ -------- ------------- ----------- ---------------------------------------------------------------------------------------------------------------------------- +glue dataset 9338f7b671827df886678df2bdd7cc7b4f36dffd 97.7K 14 4 days ago main, 2.4.0 /home/wauplin/.cache/huggingface/hub/datasets--glue/snapshots/9338f7b671827df886678df2bdd7cc7b4f36dffd +glue dataset f021ae41c879fcabcf823648ec685e3fead91fe7 97.8K 14 1 week ago 1.17.0 /home/wauplin/.cache/huggingface/hub/datasets--glue/snapshots/f021ae41c879fcabcf823648ec685e3fead91fe7 +google/fleurs dataset 129b6e96cf1967cd5d2b9b6aec75ce6cce7c89e8 25.4K 3 2 weeks ago refs/pr/1 /home/wauplin/.cache/huggingface/hub/datasets--google--fleurs/snapshots/129b6e96cf1967cd5d2b9b6aec75ce6cce7c89e8 +google/fleurs dataset 24f85a01eb955224ca3946e70050869c56446805 64.9M 4 1 week ago main /home/wauplin/.cache/huggingface/hub/datasets--google--fleurs/snapshots/24f85a01eb955224ca3946e70050869c56446805 +Jean-Baptiste/camembert-ner model dbec8489a1c44ecad9da8a9185115bccabd799fe 441.0M 7 16 hours ago main /home/wauplin/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner/snapshots/dbec8489a1c44ecad9da8a9185115bccabd799fe +bert-base-cased model 378aa1bda6387fd00e824948ebe3488630ad8565 1.5G 9 2 years ago /home/wauplin/.cache/huggingface/hub/models--bert-base-cased/snapshots/378aa1bda6387fd00e824948ebe3488630ad8565 
+bert-base-cased model a8d257ba9925ef39f3036bfc338acf5283c512d9 1.4G 9 3 days ago main /home/wauplin/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9 +t5-base model 23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9 10.1K 3 1 week ago main /home/wauplin/.cache/huggingface/hub/models--t5-base/snapshots/23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9 +t5-small model 98ffebbb27340ec1b1abd7c45da12c253ee1882a 726.2M 6 1 week ago refs/pr/1 /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/98ffebbb27340ec1b1abd7c45da12c253ee1882a +t5-small model d0a119eedb3718e34c648e594394474cf95e0617 485.8M 6 4 weeks ago /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/d0a119eedb3718e34c648e594394474cf95e0617 +t5-small model d78aea13fa7ecd06c29e3e46195d6341255065d5 970.7M 9 1 week ago main /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/d78aea13fa7ecd06c29e3e46195d6341255065d5 Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G. Got 1 warning(s) while scanning. Use -vvv to print details. @@ -176,9 +176,9 @@ model on a Unix-based machine. 
```text ➜ eval "huggingface-cli scan-cache -v" | grep "t5-small" -t5-small model 98ffebbb27340ec1b1abd7c45da12c253ee1882a 726.2M 6 refs/pr/1 /Users/lucain/.cache/huggingface/hub/models--t5-small/snapshots/98ffebbb27340ec1b1abd7c45da12c253ee1882a -t5-small model d0a119eedb3718e34c648e594394474cf95e0617 485.8M 6 /Users/lucain/.cache/huggingface/hub/models--t5-small/snapshots/d0a119eedb3718e34c648e594394474cf95e0617 -t5-small model d78aea13fa7ecd06c29e3e46195d6341255065d5 970.7M 9 main /Users/lucain/.cache/huggingface/hub/models--t5-small/snapshots/d78aea13fa7ecd06c29e3e46195d6341255065d5 +t5-small model 98ffebbb27340ec1b1abd7c45da12c253ee1882a 726.2M 6 1 week ago refs/pr/1 /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/98ffebbb27340ec1b1abd7c45da12c253ee1882a +t5-small model d0a119eedb3718e34c648e594394474cf95e0617 485.8M 6 4 weeks ago /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/d0a119eedb3718e34c648e594394474cf95e0617 +t5-small model d78aea13fa7ecd06c29e3e46195d6341255065d5 970.7M 9 1 week ago main /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/d78aea13fa7ecd06c29e3e46195d6341255065d5 ``` ### From Python @@ -208,17 +208,23 @@ HFCacheInfo( repo_path=PosixPath(...), size_on_disk=970726914, nb_files=11, + last_accessed=1662971707.3567169, + last_modified=1662971107.3567169, revisions=frozenset({ CachedRevisionInfo( commit_hash='d78aea13fa7ecd06c29e3e46195d6341255065d5', size_on_disk=970726339, snapshot_path=PosixPath(...), + # No `last_accessed` as blobs are shared among revisions + last_modified=1662971107.3567169, files=frozenset({ CachedFileInfo( file_name='config.json', size_on_disk=1197 file_path=PosixPath(...), blob_path=PosixPath(...), + blob_last_accessed=1662971707.3567169, + blob_last_modified=1662971107.3567169, ), CachedFileInfo(...), ... 
From 56c41875cecc801e91ec660e8258b6163753781c Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 12 Sep 2022 10:54:51 +0200 Subject: [PATCH 6/6] add information in failing test --- tests/test_utils_cache.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_utils_cache.py b/tests/test_utils_cache.py index f4b15b3940..9d8c25ad71 100644 --- a/tests/test_utils_cache.py +++ b/tests/test_utils_cache.py @@ -751,7 +751,11 @@ class TestStringFormatters(unittest.TestCase): def test_format_size(self) -> None: """Test `_format_size` formatter.""" for size, expected in self.SIZES.items(): - self.assertEqual(_format_size(size), expected) + self.assertEqual( + _format_size(size), + expected, + msg=f"Wrong formatting for {size} == '{expected}'", + ) def test_format_timesince(self) -> None: """Test `_format_timesince` formatter.""" @@ -759,5 +763,5 @@ def test_format_timesince(self) -> None: self.assertEqual( _format_timesince(time.time() - ts), expected, - msg=f"Wrong formatting for {ts}", + msg=f"Wrong formatting for {ts} == '{expected}'", )