From ce1ee0870ce3db70f1a54691299ea1b7087d4f6f Mon Sep 17 00:00:00 2001 From: mdepristo Date: Wed, 19 Sep 2018 14:45:19 -0700 Subject: [PATCH] Improve the documentation of resolve_filespecs. Also have it return 0 for num_shards instead of None when there are no shards; update all call sites to reflect this change. PiperOrigin-RevId: 213698824 --- nucleus/util/io_utils.py | 13 ++++--- nucleus/util/io_utils_test.py | 72 ++++++++++++++++++++++++++--------- 2 files changed, 61 insertions(+), 24 deletions(-) diff --git a/nucleus/util/io_utils.py b/nucleus/util/io_utils.py index 25db56c..467eca6 100644 --- a/nucleus/util/io_utils.py +++ b/nucleus/util/io_utils.py @@ -52,9 +52,9 @@ def ParseShardedFileSpec(spec): # pylint:disable=invalid-name 'gs://some/file@200.txt'. Here, '@200' specifies the number of shards. Returns: - basename: The basename for the files. - num_shards: The number of shards. - suffix: The suffix if there is one, or '' if not. + basename: str. The basename for the files. + num_shards: int >= 0. The number of shards. + suffix: str. The suffix if there is one, or '' if not. Raises: ShardError: If the spec is not a valid sharded specification. """ @@ -188,8 +188,9 @@ def resolve_filespecs(shard, *filespecs): shard-specific file paths. Returns: - A list. The first element is the number of shards, followed by the - shard-specific paths for each filespec, in order. + A list. The first element is the number of shards, which is an int >= 1 when + filespecs contains sharded paths and 0 if none do. All subsequent + returned values follow the shard-specific paths for each filespec, in order. Raises: ValueError: if any filespecs are inconsistent. @@ -200,7 +201,7 @@ def resolve_filespecs(shard, *filespecs): master = filespecs[0] master_is_sharded = IsShardedFileSpec(master) - master_num_shards = None + master_num_shards = 0 if master_is_sharded: _, master_num_shards, _ = ParseShardedFileSpec(master) if shard >= master_num_shards or shard < 0: diff --git a/nucleus/util/io_utils_test.py b/nucleus/util/io_utils_test.py index 53832ec..1482b1b 100644 --- a/nucleus/util/io_utils_test.py +++ b/nucleus/util/io_utils_test.py @@ -34,27 +34,63 @@ class IOTest(parameterized.TestCase): @parameterized.parameters( # Unsharded outputs pass through as expected. - (0, ['foo.txt'], [None, 'foo.txt']), - (0, ['foo.txt', 'bar.txt'], [None, 'foo.txt', 'bar.txt']), - (0, ['bar.txt', 'foo.txt'], [None, 'bar.txt', 'foo.txt']), + dict(task_id=0, filespecs=['foo.txt'], expected=[0, 'foo.txt']), + dict( + task_id=0, + filespecs=['foo.txt', 'bar.txt'], + expected=[0, 'foo.txt', 'bar.txt']), + dict( + task_id=0, + filespecs=['bar.txt', 'foo.txt'], + expected=[0, 'bar.txt', 'foo.txt']), # It's ok to have False values for other bindings. - (0, ['foo.txt', None], [None, 'foo.txt', None]), - (0, ['foo.txt', ''], [None, 'foo.txt', '']), - (0, ['foo@10.txt', None], [10, 'foo-00000-of-00010.txt', None]), - (0, ['foo@10.txt', ''], [10, 'foo-00000-of-00010.txt', '']), + dict( + task_id=0, filespecs=['foo.txt', None], expected=[0, 'foo.txt', + None]), + dict(task_id=0, filespecs=['foo.txt', ''], expected=[0, 'foo.txt', '']), + dict( + task_id=0, + filespecs=['foo@10.txt', None], + expected=[10, 'foo-00000-of-00010.txt', None]), + dict( + task_id=0, + filespecs=['foo@10.txt', ''], + expected=[10, 'foo-00000-of-00010.txt', '']), # Simple check that master behaves as expected. - (0, ['foo@10.txt', None], [10, 'foo-00000-of-00010.txt', None]), - (0, ['foo@10', None], [10, 'foo-00000-of-00010', None]), - (1, ['foo@10', None], [10, 'foo-00001-of-00010', None]), - (9, ['foo@10', None], [10, 'foo-00009-of-00010', None]), - # Make sure we handle sharding of multiple outputs. - (0, ['foo@10', 'bar@10', 'baz@10'], - [10, 'foo-00000-of-00010', 'bar-00000-of-00010', 'baz-00000-of-00010']), - (9, ['foo@10', 'bar@10', 'baz@10'], - [10, 'foo-00009-of-00010', 'bar-00009-of-00010', 'baz-00009-of-00010']), + dict( + task_id=0, + filespecs=['foo@10.txt', None], + expected=[10, 'foo-00000-of-00010.txt', None]), + dict( + task_id=0, + filespecs=['foo@10', None], + expected=[10, 'foo-00000-of-00010', None]), + dict( + task_id=1, + filespecs=['foo@10', None], + expected=[10, 'foo-00001-of-00010', None]), + dict( + task_id=9, + filespecs=['foo@10', None], + expected=[10, 'foo-00009-of-00010', None]), + # Make sure we handle sharding of multiple filespecs. + dict( + task_id=0, + filespecs=['foo@10', 'bar@10', 'baz@10'], + expected=[ + 10, 'foo-00000-of-00010', 'bar-00000-of-00010', + 'baz-00000-of-00010' + ]), + dict( + task_id=9, + filespecs=['foo@10', 'bar@10', 'baz@10'], + expected=[ + 10, 'foo-00009-of-00010', 'bar-00009-of-00010', + 'baz-00009-of-00010' + ]), ) - def test_resolve_filespecs(self, task_id, outputs, expected): - self.assertEqual(io.resolve_filespecs(task_id, *outputs), expected) + def test_resolve_filespecs(self, task_id, filespecs, expected): + self.assertEqual(io.resolve_filespecs(task_id, *filespecs), expected) @parameterized.parameters( # shard >= num_shards.