From 9fb86c641757af03bde502eefe7ae0c7cf9e760b Mon Sep 17 00:00:00 2001
From: Jiarui XU
Date: Wed, 14 Jul 2021 22:38:31 -0700
Subject: [PATCH 1/4] [Bug fix] Fix efficient test for multi-node

---
 mmseg/apis/test.py                  | 100 ++++------------------------
 mmseg/core/evaluation/eval_hooks.py |   5 ++
 tools/test.py                       |   2 +-
 3 files changed, 19 insertions(+), 88 deletions(-)
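Note on the failure mode: with efficient_test=True, np2tmp() used to write
every per-image result into the node-local default temp directory, and the
parts were merged by the hand-rolled collect_results_cpu/collect_results_gpu
removed below, so on multi-node runs rank 0 could not read temp files created
on other nodes. This patch threads a `tmpdir` through np2tmp() and switches to
the maintained collectors in mmcv.engine. A minimal sketch of the patched
np2tmp() round trip; the shared mount path is an assumption for illustration,
not something the patch prescribes:

    import os

    import numpy as np

    from mmseg.apis.test import np2tmp  # the module patched below

    # any directory visible to every node works (NFS, Lustre, ...)
    shared_dir = '/mnt/shared/.efficient_test'  # hypothetical mount
    os.makedirs(shared_dir, exist_ok=True)  # NamedTemporaryFile needs it

    seg_map = np.zeros((512, 512), dtype=np.uint8)  # dummy prediction
    tmp_file = np2tmp(seg_map, tmpdir=shared_dir)  # -> <shared_dir>/xxx.npy
    assert (np.load(tmp_file) == seg_map).all()  # readable from any rank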
diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index 9728de4c68..ffab4ba44f 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -1,17 +1,15 @@
 import os.path as osp
-import pickle
-import shutil
 import tempfile
 
 import mmcv
 import numpy as np
 import torch
-import torch.distributed as dist
+from mmcv.engine import collect_results_cpu, collect_results_gpu
 from mmcv.image import tensor2imgs
 from mmcv.runner import get_dist_info
 
 
-def np2tmp(array, temp_file_name=None):
+def np2tmp(array, temp_file_name=None, tmpdir=None):
     """Save ndarray to local numpy file.
 
     Args:
@@ -19,6 +17,7 @@ def np2tmp(array, temp_file_name=None):
         temp_file_name (str): Numpy file name. If 'temp_file_name=None', this
             function will generate a file name with tempfile.NamedTemporaryFile
             to save ndarray. Default: None.
+        tmpdir (str): Temporary directory to save Ndarray files. Default: None.
 
     Returns:
         str: The numpy file name.
@@ -26,7 +25,7 @@ def np2tmp(array, temp_file_name=None):
 
     if temp_file_name is None:
         temp_file_name = tempfile.NamedTemporaryFile(
-            suffix='.npy', delete=False).name
+            suffix='.npy', delete=False, dir=tmpdir).name
     np.save(temp_file_name, array)
     return temp_file_name
 
@@ -36,7 +35,8 @@ def single_gpu_test(model,
                     show=False,
                     out_dir=None,
                     efficient_test=False,
-                    opacity=0.5):
+                    opacity=0.5,
+                    tmpdir=None):
     """Test with single GPU.
 
     Args:
@@ -50,6 +50,8 @@ def single_gpu_test(model,
         opacity(float): Opacity of painted segmentation map.
             Default 0.5.
             Must be in (0, 1] range.
+        tmpdir (str): Path of directory to save the temporary results for
+            efficient test.
     Returns:
         list: The prediction results.
     """
@@ -90,7 +92,7 @@ def single_gpu_test(model,
 
         if isinstance(result, list):
             if efficient_test:
-                result = [np2tmp(_) for _ in result]
+                result = [np2tmp(_, tmpdir=tmpdir) for _ in result]
             results.extend(result)
         else:
             if efficient_test:
@@ -120,7 +122,8 @@ def multi_gpu_test(model,
         model (nn.Module): Model to be tested.
         data_loader (utils.data.Dataloader): Pytorch data loader.
         tmpdir (str): Path of directory to save the temporary results from
-            different gpus under cpu mode.
+            different gpus under cpu mode. The same path is used for efficient
+            test.
         gpu_collect (bool): Option to use either gpu or cpu to collect results.
         efficient_test (bool): Whether save the results as local numpy files to
             save CPU memory during evaluation. Default: False.
@@ -141,11 +144,11 @@ def multi_gpu_test(model,
 
         if isinstance(result, list):
             if efficient_test:
-                result = [np2tmp(_) for _ in result]
+                result = [np2tmp(_, tmpdir=tmpdir) for _ in result]
             results.extend(result)
         else:
             if efficient_test:
-                result = np2tmp(result)
+                result = np2tmp(result, tmpdir=tmpdir)
             results.append(result)
 
         if rank == 0:
@@ -159,80 +162,3 @@ def multi_gpu_test(model,
     else:
         results = collect_results_cpu(results, len(dataset), tmpdir)
     return results
-
-
-def collect_results_cpu(result_part, size, tmpdir=None):
-    """Collect results with CPU."""
-    rank, world_size = get_dist_info()
-    # create a tmp dir if it is not specified
-    if tmpdir is None:
-        MAX_LEN = 512
-        # 32 is whitespace
-        dir_tensor = torch.full((MAX_LEN, ),
-                                32,
-                                dtype=torch.uint8,
-                                device='cuda')
-        if rank == 0:
-            tmpdir = tempfile.mkdtemp()
-            tmpdir = torch.tensor(
-                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
-            dir_tensor[:len(tmpdir)] = tmpdir
-        dist.broadcast(dir_tensor, 0)
-        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
-    else:
-        mmcv.mkdir_or_exist(tmpdir)
-    # dump the part result to the dir
-    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
-    dist.barrier()
-    # collect all parts
-    if rank != 0:
-        return None
-    else:
-        # load results of all parts from tmp dir
-        part_list = []
-        for i in range(world_size):
-            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
-            part_list.append(mmcv.load(part_file))
-        # sort the results
-        ordered_results = []
-        for res in zip(*part_list):
-            ordered_results.extend(list(res))
-        # the dataloader may pad some samples
-        ordered_results = ordered_results[:size]
-        # remove tmp dir
-        shutil.rmtree(tmpdir)
-        return ordered_results
-
-
-def collect_results_gpu(result_part, size):
-    """Collect results with GPU."""
-    rank, world_size = get_dist_info()
-    # dump result part to tensor with pickle
-    part_tensor = torch.tensor(
-        bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
-    # gather all result part tensor shape
-    shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
-    shape_list = [shape_tensor.clone() for _ in range(world_size)]
-    dist.all_gather(shape_list, shape_tensor)
-    # padding result part tensor to max length
-    shape_max = torch.tensor(shape_list).max()
-    part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
-    part_send[:shape_tensor[0]] = part_tensor
-    part_recv_list = [
-        part_tensor.new_zeros(shape_max) for _ in range(world_size)
-    ]
-    # gather all result part
-    dist.all_gather(part_recv_list, part_send)
-
-    if rank == 0:
-        part_list = []
-        for recv, shape in zip(part_recv_list, shape_list):
-            part_list.append(
-                pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()))
-        # sort the results
-        ordered_results = []
-        for res in zip(*part_list):
-            ordered_results.extend(list(res))
-        # the dataloader may pad some samples
-        ordered_results = ordered_results[:size]
-        return ordered_results
diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py
index 928f2ba612..6563cc90c4 100644
--- a/mmseg/core/evaluation/eval_hooks.py
+++ b/mmseg/core/evaluation/eval_hooks.py
@@ -30,10 +30,15 @@ def _do_evaluate(self, runner):
         if not self._should_evaluate(runner):
             return
 
+        tmpdir = self.tmpdir
+        if tmpdir is None:
+            tmpdir = osp.join(runner.work_dir, '.eval_hook')
+
         from mmseg.apis import single_gpu_test
         results = single_gpu_test(
             runner.model,
             self.dataloader,
+            tmpdir=tmpdir,
             show=False,
             efficient_test=self.efficient_test)
         runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
diff --git a/tools/test.py b/tools/test.py
index ab2bd60175..27fe2cfcdd 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -140,7 +140,7 @@ def main():
     if not distributed:
         model = MMDataParallel(model, device_ids=[0])
         outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
-                                  efficient_test, args.opacity)
+                                  efficient_test, args.opacity, args.tmpdir)
     else:
         model = MMDistributedDataParallel(
             model.cuda(),

From 9d4e224284cba3bbd96317714e7e32f120a24489 Mon Sep 17 00:00:00 2001
From: Jiarui XU
Date: Wed, 14 Jul 2021 22:55:23 -0700
Subject: [PATCH 2/4] Fix CI

---
 mmseg/core/evaluation/eval_hooks.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
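Note: the base EvalHook in mmcv that this single-GPU hook extends does not
define a `tmpdir` attribute, so `self.tmpdir` presumably raised an
AttributeError in CI (an assumption; the failing log is not part of this
series). The fix derives the path from the runner unconditionally, as in this
small self-contained sketch of the same logic:

    import os.path as osp

    def eval_hook_tmpdir(work_dir):
        # mirrors the patched _do_evaluate(): temporary .npy results of
        # single-GPU evaluation always live under the run's work_dir
        return osp.join(work_dir, '.eval_hook')

    print(eval_hook_tmpdir('work_dirs/pspnet'))  # work_dirs/pspnet/.eval_hook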
diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py
index 6563cc90c4..c5184f0ed0 100644
--- a/mmseg/core/evaluation/eval_hooks.py
+++ b/mmseg/core/evaluation/eval_hooks.py
@@ -30,9 +30,7 @@ def _do_evaluate(self, runner):
         if not self._should_evaluate(runner):
             return
 
-        tmpdir = self.tmpdir
-        if tmpdir is None:
-            tmpdir = osp.join(runner.work_dir, '.eval_hook')
+        tmpdir = osp.join(runner.work_dir, '.eval_hook')
 
         from mmseg.apis import single_gpu_test
         results = single_gpu_test(

From cb360fe39b278543d92e85a199dd3d48a35efa67 Mon Sep 17 00:00:00 2001
From: Jiarui XU
Date: Thu, 15 Jul 2021 09:45:46 -0700
Subject: [PATCH 3/4] Add efficient test dir

---
 mmseg/apis/test.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index ffab4ba44f..afcfad82a1 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -60,6 +60,8 @@ def single_gpu_test(model,
     results = []
     dataset = data_loader.dataset
     prog_bar = mmcv.ProgressBar(len(dataset))
+    if efficient_test:
+        mmcv.mkdir_or_exist('.efficient_test')
     for i, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, **data)
@@ -92,11 +94,11 @@ def single_gpu_test(model,
 
         if isinstance(result, list):
             if efficient_test:
-                result = [np2tmp(_, tmpdir=tmpdir) for _ in result]
+                result = [np2tmp(_, tmpdir='.efficient_test') for _ in result]
             results.extend(result)
         else:
             if efficient_test:
-                result = np2tmp(result)
+                result = np2tmp(result, tmpdir='.efficient_test')
             results.append(result)
 
         batch_size = len(result)
@@ -138,17 +140,19 @@ def multi_gpu_test(model,
     rank, world_size = get_dist_info()
     if rank == 0:
         prog_bar = mmcv.ProgressBar(len(dataset))
+    if efficient_test:
+        mmcv.mkdir_or_exist('.efficient_test')
     for i, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, rescale=True, **data)
 
         if isinstance(result, list):
             if efficient_test:
-                result = [np2tmp(_, tmpdir=tmpdir) for _ in result]
+                result = [np2tmp(_, tmpdir='.efficient_test') for _ in result]
             results.extend(result)
         else:
             if efficient_test:
-                result = np2tmp(result, tmpdir=tmpdir)
+                result = np2tmp(result, tmpdir='.efficient_test')
             results.append(result)
 
         if rank == 0:

From 053967860aaacbf3fb7f3082c1660af8f93254e0 Mon Sep 17 00:00:00 2001
From: Jiarui XU
Date: Thu, 15 Jul 2021 11:59:13 -0700
Subject: [PATCH 4/4] Remove unused args

---
 mmseg/apis/test.py                  | 5 +----
 mmseg/core/evaluation/eval_hooks.py | 3 ---
 tools/test.py                       | 2 +-
 3 files changed, 2 insertions(+), 8 deletions(-)
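Note: patch 3 hardcoded '.efficient_test' as the dump directory in both test
loops, so the `tmpdir` plumbing that patch 1 added to single_gpu_test() and
to the eval hook is now dead code; this patch reverts those signatures. A
self-contained simulation of what one loop iteration keeps in memory when
efficient_test=True (dummy data, no real model; same tempfile pattern as
np2tmp() above):

    import os
    import tempfile

    import numpy as np

    os.makedirs('.efficient_test', exist_ok=True)  # as mmcv.mkdir_or_exist
    pred = np.random.randint(0, 19, (512, 512)).astype(np.uint8)
    name = tempfile.NamedTemporaryFile(
        suffix='.npy', delete=False, dir='.efficient_test').name
    np.save(name, pred)  # the full map goes to disk...
    results = [name]     # ...only its file name stays in the results list
    print(results)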
diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index afcfad82a1..0034159689 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -35,8 +35,7 @@ def single_gpu_test(model,
                     show=False,
                     out_dir=None,
                     efficient_test=False,
-                    opacity=0.5,
-                    tmpdir=None):
+                    opacity=0.5):
     """Test with single GPU.
 
     Args:
@@ -50,8 +49,6 @@ def single_gpu_test(model,
         opacity(float): Opacity of painted segmentation map.
             Default 0.5.
             Must be in (0, 1] range.
-        tmpdir (str): Path of directory to save the temporary results for
-            efficient test.
     Returns:
         list: The prediction results.
     """
diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py
index c5184f0ed0..928f2ba612 100644
--- a/mmseg/core/evaluation/eval_hooks.py
+++ b/mmseg/core/evaluation/eval_hooks.py
@@ -30,13 +30,10 @@ def _do_evaluate(self, runner):
         if not self._should_evaluate(runner):
             return
 
-        tmpdir = osp.join(runner.work_dir, '.eval_hook')
-
         from mmseg.apis import single_gpu_test
         results = single_gpu_test(
             runner.model,
             self.dataloader,
-            tmpdir=tmpdir,
             show=False,
             efficient_test=self.efficient_test)
         runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
diff --git a/tools/test.py b/tools/test.py
index 27fe2cfcdd..ab2bd60175 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -140,7 +140,7 @@ def main():
     if not distributed:
         model = MMDataParallel(model, device_ids=[0])
         outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
-                                  efficient_test, args.opacity, args.tmpdir)
+                                  efficient_test, args.opacity)
     else:
         model = MMDistributedDataParallel(
             model.cuda(),
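Note on the end state of the series: with efficient_test=True both test loops
dump every per-image prediction into ./.efficient_test via np2tmp() and keep
only file names in `results`, while the `tmpdir` argument of multi_gpu_test()
remains solely for collect_results_cpu(). Nothing in the diffs above deletes
'.efficient_test' afterwards, so loading and cleaning up the cached
predictions is left to the caller, roughly like this (a sketch, not code from
the series):

    import glob
    import os.path as osp
    import shutil

    import numpy as np

    tmp_files = sorted(glob.glob(osp.join('.efficient_test', '*.npy')))
    seg_maps = [np.load(f) for f in tmp_files]  # one H x W map per image
    print('%d cached predictions' % len(seg_maps))
    shutil.rmtree('.efficient_test', ignore_errors=True)  # manual cleanup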