
[Feature] Support GPU Normalize #586

Merged · 14 commits merged on Feb 19, 2021
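This PR adds a `GPUNormalize` module hook so that mean/std normalization runs on the GPU in a forward pre-hook instead of per frame on the CPU inside the data pipeline. As a rough sketch of how a config opts in (keys as defined in `mmaction/utils/module_hooks.py` below; the values are the ImageNet statistics used by the new TSM config):

```python
# Sketch: enable GPU-side normalization through a module hook. The
# `Normalize` step is dropped from the data pipeline, so batches reach the
# backbone as uint8 tensors and are normalized on the GPU by the pre-hook.
module_hooks = [
    dict(
        type='GPUNormalize',
        hook_pos='forward_pre',   # 'forward' and 'backward' are also accepted
        input_format='NCHW',      # one of NCTHW / NCHW / NCHW_Flow / NPTCHW
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375])
]
```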
1 change: 1 addition & 0 deletions configs/recognition/tsm/README.md
@@ -32,6 +32,7 @@
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) |340x256|8| ResNet50| ImageNet |70.24|89.56|[70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|[89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|74.0 (8x1 frames)| 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log.json)|
|[tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet |70.59|89.52|x|x|x|7079|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/tsm_r50_256p_1x1x8_50e_kinetics400_rgb_20200726-020785e2.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log.json)|
|[tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet |70.48|89.40|x|x|x|7076|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219-bf96e6cc.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.json)|
|[tsm_r50_video_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet |70.25|89.66|[70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|[89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|74.0 (8x1 frames)| 7077 | [ckpt]( https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log.json)|
|[tsm_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py) |340x256|8x4| ResNet50 | ImageNet|72.9|90.44|[72.22](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#dense-sample)|[90.37](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#dense-sample)|11.5 (8x10 frames)| 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_1x1x8_100e_kinetics400_rgb_20200626-91a54551.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20200626_213415.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20200626_213415.log.json)|
|[tsm_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py) |short-side 256|8| ResNet50 | ImageNet|73.38|91.02|x|x|x|7079|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb_20200727-e1e0c785.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb/20200725_032043.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb/20200725_032043.log.json)|
93 changes: 93 additions & 0 deletions configs/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py
@@ -0,0 +1,93 @@
_base_ = [
'../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py',
'../../_base_/default_runtime.py'
]

module_hooks = [
dict(
type='GPUNormalize',
hook_pos='forward_pre',
input_format='NCHW',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375])
]

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'

train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1,
num_fixed_crops=13),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])

# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/tsm_r50_gpu_normalize_1x1x8_100e_kinetics400_rgb/'
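Unlike the baseline TSM configs, none of the pipelines above contain a `Normalize` step: frames stay uint8 through `FormatShape` and `ToTensor`, and the mean/std subtraction happens in the `GPUNormalize` pre-hook. For comparison, a CPU-side pipeline in the usual MMAction2 style would look roughly like this (a sketch of what this config removes, not part of the diff):

```python
# CPU-side alternative: normalization as a pipeline step, applied per sample
# on the CPU before batching (this is what the GPU-normalize config drops).
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)

train_pipeline = [
    # ... SampleFrames / RawFrameDecode / Resize / MultiScaleCrop / Flip ...
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
```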
2 changes: 2 additions & 0 deletions mmaction/datasets/pipelines/formating.py
@@ -276,6 +276,8 @@ def __call__(self, results):
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
if not isinstance(results['imgs'], np.ndarray):
results['imgs'] = np.array(results['imgs'])
imgs = results['imgs']
# [M x H x W x C]
# M = 1 * N_crops * N_clips * L
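The two added lines make `FormatShape` tolerant of receiving a plain Python list: with `Normalize` removed from the pipeline, nothing has stacked the decoded frames into a single array before this transform, so it now does the conversion itself. A minimal sketch of the effect, with hypothetical shapes:

```python
# Sketch: without a pipeline `Normalize` step, `results['imgs']` can still be
# a list of per-frame uint8 arrays when it reaches FormatShape.
import numpy as np

results = dict(imgs=[np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(8)])
if not isinstance(results['imgs'], np.ndarray):
    results['imgs'] = np.array(results['imgs'])  # -> shape (8, 224, 224, 3), dtype uint8
```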
3 changes: 2 additions & 1 deletion mmaction/utils/__init__.py
@@ -3,10 +3,11 @@
from .gradcam_utils import GradCAM
from .logger import get_root_logger
from .misc import get_random_string, get_shm_dir, get_thread_id
from .module_hooks import register_module_hooks
from .precise_bn import PreciseBNHook

__all__ = [
'get_root_logger', 'collect_env', 'get_random_string', 'get_thread_id',
'get_shm_dir', 'GradCAM', 'PreciseBNHook', 'import_module_error_class',
'import_module_error_func'
'import_module_error_func', 'register_module_hooks'
]
77 changes: 77 additions & 0 deletions mmaction/utils/module_hooks.py
@@ -0,0 +1,77 @@
import torch
from mmcv.utils import Registry, build_from_cfg

MODULE_HOOKS = Registry('module_hooks')


def register_module_hooks(Module, module_hooks_list):
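    """Register hooks on a module from a list of hook configs.

    Args:
        Module (nn.Module): The module (e.g. a recognizer backbone) to
            register the hooks on.
        module_hooks_list (list[dict]): Hook configs; each may carry an
            optional ``hook_pos`` key ('forward_pre', 'forward' or
            'backward', default 'forward_pre').

    Returns:
        list: The removable handles returned by the register calls.
    """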
handles = []
for module_hook_cfg in module_hooks_list:
hook_pos = module_hook_cfg.pop('hook_pos', 'forward_pre')
if hook_pos == 'forward_pre':
handle = Module.register_forward_pre_hook(
build_from_cfg(module_hook_cfg, MODULE_HOOKS).hook_func())
elif hook_pos == 'forward':
handle = Module.register_forward_hook(
build_from_cfg(module_hook_cfg, MODULE_HOOKS).hook_func())
elif hook_pos == 'backward':
handle = Module.register_backward_hook(
build_from_cfg(module_hook_cfg, MODULE_HOOKS).hook_func())
else:
raise ValueError(
f'hook_pos must be `forward_pre`, `forward` or `backward`, '
f'but get {hook_pos}')
handles.append(handle)
return handles


@MODULE_HOOKS.register_module()
class GPUNormalize:
"""Normalize images with the given mean and std value on GPUs.

    Calling the member function ``hook_func`` returns the forward pre-hook
    function for module registration.

    Args:
        input_format (str): Data format of the input, one of 'NCTHW',
            'NCHW', 'NCHW_Flow' or 'NPTCHW'.
        mean (Sequence[float]): Mean values of different channels.
        std (Sequence[float]): Std values of different channels.
    """

def __init__(self, input_format, mean, std):
if input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']:
raise ValueError(f'The input format {input_format} is invalid.')
self.input_format = input_format
_mean = torch.tensor(mean)
_std = torch.tensor(std)
if input_format == 'NCTHW':
self._mean = _mean[None, :, None, None, None]
self._std = _std[None, :, None, None, None]
elif input_format == 'NCHW':
self._mean = _mean[None, :, None, None]
self._std = _std[None, :, None, None]
elif input_format == 'NCHW_Flow':
self._mean = _mean[None, :, None, None]
self._std = _std[None, :, None, None]
elif input_format == 'NPTCHW':
self._mean = _mean[None, None, None, :, None, None]
self._std = _std[None, None, None, :, None, None]
else:
raise ValueError(f'The input format {input_format} is invalid.')

def hook_func(self):

def normalize_hook(Module, input):
x = input[0]
assert x.dtype == torch.uint8, (
f'The previous augmentation should use uint8 data type to '
f'speed up computation, but get {x.dtype}')

mean = self._mean.to(x.device)
std = self._std.to(x.device)

with torch.no_grad():
x = x.float().sub_(mean).div_(std)

return (x, *input[1:])

return normalize_hook
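Putting the pieces together, a hedged usage sketch (mirroring what the new unit tests below do): register the hook on a torchvision backbone from a config list and feed it a uint8 batch; the pre-hook casts to float and normalizes on whatever device the tensor lives on.

```python
# Usage sketch (torchvision is assumed available, as in the new tests).
import torch
import torchvision.models as models

from mmaction.utils import register_module_hooks

module_hooks = [
    dict(
        type='GPUNormalize',
        hook_pos='forward_pre',
        input_format='NCHW',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375])
]

backbone = models.resnet50()
handles = register_module_hooks(backbone, module_hooks)

imgs = torch.randint(0, 256, (2, 3, 224, 224), dtype=torch.uint8)
out = backbone(imgs)  # the pre-hook normalizes `imgs` before the forward pass

for handle in handles:  # hooks can be detached again via the handles
    handle.remove()
```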
2 changes: 1 addition & 1 deletion setup.cfg
@@ -19,6 +19,6 @@ line_length = 79
multi_line_output = 0
known_standard_library = pkg_resources,setuptools
known_first_party = mmaction
known_third_party = cv2,joblib,matplotlib,mmcv,numpy,pandas,pytest,scipy,seaborn,titlecase,torch,tqdm
known_third_party = cv2,joblib,matplotlib,mmcv,numpy,pandas,pytest,scipy,seaborn,titlecase,torch,torchvision,tqdm
no_lines_before = STDLIB,LOCALFOLDER
default_section = THIRDPARTY
2 changes: 1 addition & 1 deletion tests/test_runtime/test_train.py
@@ -73,7 +73,7 @@ def test_train_model():
load_from=None,
workflow=[('train', 1)],
total_epochs=5,
evaluation=dict(interval=1, key_indicator='acc'),
evaluation=dict(interval=1, save_best='acc'),
data=dict(
videos_per_gpu=1,
workers_per_gpu=0,
103 changes: 103 additions & 0 deletions tests/test_utils/test_module_hooks.py
@@ -0,0 +1,103 @@
import copy

import numpy as np
import pytest
import torch
import torchvision.models as models

from mmaction.utils import register_module_hooks
from mmaction.utils.module_hooks import GPUNormalize


def test_register_module_hooks():
_module_hooks = [
dict(
type='GPUNormalize',
hook_pos='forward_pre',
input_format='NCHW',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375])
]

# case 1
module_hooks = copy.deepcopy(_module_hooks)
module_hooks[0]['hook_pos'] = 'forward_pre'
resnet = models.resnet50()
handles = register_module_hooks(resnet, module_hooks)
assert resnet._forward_pre_hooks[
handles[0].id].__name__ == 'normalize_hook'

# case 2
module_hooks = copy.deepcopy(_module_hooks)
module_hooks[0]['hook_pos'] = 'forward'
resnet = models.resnet50()
handles = register_module_hooks(resnet, module_hooks)
assert resnet._forward_hooks[handles[0].id].__name__ == 'normalize_hook'

# case 3
module_hooks = copy.deepcopy(_module_hooks)
module_hooks[0]['hook_pos'] = 'backward'
resnet = models.resnet50()
handles = register_module_hooks(resnet, module_hooks)
assert resnet._backward_hooks[handles[0].id].__name__ == 'normalize_hook'

# case 4
module_hooks = copy.deepcopy(_module_hooks)
module_hooks[0]['hook_pos'] = '_other_pos'
resnet = models.resnet50()
with pytest.raises(ValueError):
handles = register_module_hooks(resnet, module_hooks)


def test_gpu_normalize():

def check_normalize(origin_imgs, result_imgs, norm_cfg):
"""Check if the origin_imgs are normalized correctly into result_imgs
in a given norm_cfg."""
from numpy.testing import assert_array_almost_equal
target_imgs = result_imgs.copy()
target_imgs *= norm_cfg['std']
target_imgs += norm_cfg['mean']
assert_array_almost_equal(origin_imgs, target_imgs, decimal=4)

_gpu_normalize_cfg = dict(
input_format='NCTHW',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375])

# case 1
gpu_normalize_cfg = copy.deepcopy(_gpu_normalize_cfg)
gpu_normalize_cfg['input_format'] = 'NCHW'
gpu_normalize = GPUNormalize(**gpu_normalize_cfg)
assert gpu_normalize._mean.shape == (1, 3, 1, 1)
imgs = np.random.randint(256, size=(2, 240, 320, 3), dtype=np.uint8)
_input = (torch.tensor(imgs).permute(0, 3, 1, 2), )
resnet = models.resnet50()
normalize_hook = gpu_normalize.hook_func()
_input = normalize_hook(resnet, _input)
result_imgs = np.array(_input[0].permute(0, 2, 3, 1))
check_normalize(imgs, result_imgs, gpu_normalize_cfg)

# case 2
gpu_normalize_cfg = copy.deepcopy(_gpu_normalize_cfg)
gpu_normalize_cfg['input_format'] = 'NCTHW'
gpu_normalize = GPUNormalize(**gpu_normalize_cfg)
assert gpu_normalize._mean.shape == (1, 3, 1, 1, 1)

# case 3
gpu_normalize_cfg = copy.deepcopy(_gpu_normalize_cfg)
gpu_normalize_cfg['input_format'] = 'NCHW_Flow'
gpu_normalize = GPUNormalize(**gpu_normalize_cfg)
assert gpu_normalize._mean.shape == (1, 3, 1, 1)

# case 4
gpu_normalize_cfg = copy.deepcopy(_gpu_normalize_cfg)
gpu_normalize_cfg['input_format'] = 'NPTCHW'
gpu_normalize = GPUNormalize(**gpu_normalize_cfg)
assert gpu_normalize._mean.shape == (1, 1, 1, 3, 1, 1)

# case 5
gpu_normalize_cfg = copy.deepcopy(_gpu_normalize_cfg)
gpu_normalize_cfg['input_format'] = '_format'
with pytest.raises(ValueError):
gpu_normalize = GPUNormalize(**gpu_normalize_cfg)
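Assuming a standard development setup with `pytest` and `torchvision` installed, these tests can be run in isolation with `pytest tests/test_utils/test_module_hooks.py`.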
7 changes: 7 additions & 0 deletions tools/test.py
@@ -15,6 +15,7 @@
from mmaction.apis import multi_gpu_test, single_gpu_test
from mmaction.datasets import build_dataloader, build_dataset
from mmaction.models import build_model
from mmaction.utils import register_module_hooks


def parse_args():
@@ -155,6 +156,9 @@ def main():
distributed = True
init_dist(args.launcher, **cfg.dist_params)

# The flag is used to register module's hooks
cfg.setdefault('module_hooks', [])

# build the dataloader
dataset = build_dataset(cfg.data.test, dict(test_mode=True))
dataloader_setting = dict(
@@ -168,6 +172,9 @@

# build the model and load checkpoint
model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)

register_module_hooks(model.backbone, cfg.module_hooks)

fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
7 changes: 6 additions & 1 deletion tools/train.py
@@ -15,7 +15,7 @@
from mmaction.apis import train_model
from mmaction.datasets import build_dataset
from mmaction.models import build_model
from mmaction.utils import collect_env, get_root_logger
from mmaction.utils import collect_env, get_root_logger, register_module_hooks


def parse_args():
@@ -103,6 +103,9 @@ def main():
# The flag is used to determine whether it is omnisource training
cfg.setdefault('omnisource', False)

# The flag is used to register module's hooks
cfg.setdefault('module_hooks', [])

# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# dump config
@@ -140,6 +143,8 @@
model = build_model(
cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)

register_module_hooks(model.backbone, cfg.module_hooks)

if cfg.omnisource:
# If omnisource flag is set, cfg.data.train should be a list
assert type(cfg.data.train) is list
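In both entry points the wiring is the same: the new config key defaults to an empty list so existing configs keep working, and any configured hooks are registered on the model backbone right after the model is built. A condensed sketch of the shared flow (argument details vary slightly between `tools/train.py` and `tools/test.py`):

```python
# Condensed sketch of the shared flow in tools/train.py and tools/test.py.
cfg.setdefault('module_hooks', [])            # no-op for configs without hooks

model = build_model(cfg.model, train_cfg=cfg.get('train_cfg'),
                    test_cfg=cfg.get('test_cfg'))
register_module_hooks(model.backbone, cfg.module_hooks)
```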