[Features] Support PV_RCNN modules (open-mmlab#1957)

* add pvrcnn module code * add voxelsa * fix * fix comments * fix comments * fix comments * add stack sa * fix * fix comments * fix comments * fix * add ut * fix comments
ZwwWayne · Dec 3, 2022 · aa544d4 · aa544d4
1 parent 1fd7153
commit aa544d4
Show file tree

Hide file tree

Showing 18 changed files with 2,310 additions and 11 deletions.
diff --git a/configs/pvrcnn/pvrcnn_8xb2-80e_kitti-3d-3class.py b/configs/pvrcnn/pvrcnn_8xb2-80e_kitti-3d-3class.py
@@ -0,0 +1,353 @@
+_base_ = [
+    '../_base_/datasets/kitti-3d-3class.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
+
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(CLASSES=class_names)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+    points_loader=dict(
+        type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4))
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler, use_ground_plane=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+model = dict(
+    type='PointVoxelRCNN',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=5,  # max_points_per_voxel
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(type='HardSimpleVFE'),
+    middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=4,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act'),
+        encoder_paddings=((0, 0, 0), ((1, 1, 1), 0, 0), ((1, 1, 1), 0, 0),
+                          ((0, 1, 1), 0, 0)),
+        return_middle_feats=True),
+    points_encoder=dict(
+        type='VoxelSetAbstraction',
+        num_keypoints=2048,
+        fused_out_channel=128,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        voxel_sa_cfgs_list=[
+            dict(
+                type='StackedSAModuleMSG',
+                in_channels=16,
+                scale_factor=1,
+                radius=(0.4, 0.8),
+                sample_nums=(16, 16),
+                mlp_channels=((16, 16), (16, 16)),
+                use_xyz=True),
+            dict(
+                type='StackedSAModuleMSG',
+                in_channels=32,
+                scale_factor=2,
+                radius=(0.8, 1.2),
+                sample_nums=(16, 32),
+                mlp_channels=((32, 32), (32, 32)),
+                use_xyz=True),
+            dict(
+                type='StackedSAModuleMSG',
+                in_channels=64,
+                scale_factor=4,
+                radius=(1.2, 2.4),
+                sample_nums=(16, 32),
+                mlp_channels=((64, 64), (64, 64)),
+                use_xyz=True),
+            dict(
+                type='StackedSAModuleMSG',
+                in_channels=64,
+                scale_factor=8,
+                radius=(2.4, 4.8),
+                sample_nums=(16, 32),
+                mlp_channels=((64, 64), (64, 64)),
+                use_xyz=True)
+        ],
+        rawpoints_sa_cfgs=dict(
+            type='StackedSAModuleMSG',
+            in_channels=1,
+            radius=(0.4, 0.8),
+            sample_nums=(16, 16),
+            mlp_channels=((16, 16), (16, 16)),
+            use_xyz=True),
+        bev_feat_channel=256,
+        bev_scale_factor=8),
+    backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    rpn_head=dict(
+        type='PartA2RPNHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        dir_offset=0.78539,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -1.78, 70.4, 40.0, -1.78]],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        assigner_per_size=True,
+        assign_per_class=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    roi_head=dict(
+        type='PVRCNNRoiHead',
+        num_classes=3,
+        semantic_head=dict(
+            type='ForegroundSegmentationHead',
+            in_channels=640,
+            extra_width=0.1,
+            loss_seg=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                gamma=2.0,
+                alpha=0.25,
+                activated=True,
+                loss_weight=1.0)),
+        bbox_roi_extractor=dict(
+            type='Batch3DRoIGridExtractor',
+            grid_size=6,
+            roi_layer=dict(
+                type='StackedSAModuleMSG',
+                in_channels=128,
+                radius=(0.8, 1.6),
+                sample_nums=(16, 16),
+                mlp_channels=((64, 64), (64, 64)),
+                use_xyz=True,
+                pool_mod='max'),
+        ),
+        bbox_head=dict(
+            type='PVRCNNBBoxHead',
+            in_channels=128,
+            grid_size=6,
+            num_classes=3,
+            class_agnostic=True,
+            shared_fc_channels=(256, 256),
+            reg_channels=(256, 256),
+            cls_channels=(256, 256),
+            dropout_ratio=0.3,
+            with_corner_loss=True,
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss',
+                beta=1.0 / 9.0,
+                reduction='sum',
+                loss_weight=1.0),
+            loss_cls=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1)
+            ],
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=9000,
+            nms_post=512,
+            max_num=512,
+            nms_thr=0.8,
+            score_thr=0,
+            use_rotate_nms=True),
+        rcnn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1)
+            ],
+            sampler=dict(
+                type='IoUNegPiecewiseSampler',
+                num=128,
+                pos_fraction=0.5,
+                neg_piece_fractions=[0.8, 0.2],
+                neg_iou_piece_thrs=[0.55, 0.1],
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False,
+                return_iou=True),
+            cls_pos_thr=0.75,
+            cls_neg_thr=0.25)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1024,
+            nms_post=100,
+            max_num=100,
+            nms_thr=0.7,
+            score_thr=0,
+            use_rotate_nms=True),
+        rcnn=dict(
+            use_rotate_nms=True,
+            use_raw_score=True,
+            nms_thr=0.1,
+            score_thr=0.1)))
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    dataset=dict(dataset=dict(pipeline=train_pipeline, metainfo=metainfo)))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
+eval_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
+lr = 0.001
+optim_wrapper = dict(optimizer=dict(lr=lr))
+param_scheduler = [
+    # learning rate scheduler
+    # During the first 16 epochs, learning rate increases from 0 to lr * 10
+    # during the next 24 epochs, learning rate decreases from lr * 10 to
+    # lr * 1e-4
+    dict(
+        type='CosineAnnealingLR',
+        T_max=15,
+        eta_min=lr * 10,
+        begin=0,
+        end=15,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=25,
+        eta_min=lr * 1e-4,
+        begin=15,
+        end=40,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 16 epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next 24 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=15,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=15,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=25,
+        eta_min=1,
+        begin=15,
+        end=40,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
diff --git a/mmdet3d/apis/inference.py b/mmdet3d/apis/inference.py
@@ -67,7 +67,6 @@ def init_model(config: Union[str, Path, Config],
 
     if checkpoint is not None:
         checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
-
         dataset_meta = checkpoint['meta'].get('dataset_meta', None)
         # save the dataset_meta in the model for convenience
         if 'dataset_meta' in checkpoint.get('meta', {}):

diff --git a/mmdet3d/models/detectors/__init__.py b/mmdet3d/models/detectors/__init__.py
@@ -14,6 +14,7 @@
 from .mvx_two_stage import MVXTwoStageDetector
 from .parta2 import PartA2
 from .point_rcnn import PointRCNN
+from .pv_rcnn import PointVoxelRCNN
 from .sassd import SASSD
 from .single_stage_mono3d import SingleStageMono3DDetector
 from .smoke_mono3d import SMOKEMono3D
@@ -26,5 +27,6 @@
     'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet',
     'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector',
     'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet', 'PointRCNN', 'SMOKEMono3D',
-    'SASSD', 'MinkSingleStage3DDetector', 'MultiViewDfM', 'DfM'
+    'SASSD', 'MinkSingleStage3DDetector', 'MultiViewDfM', 'DfM',
+    'PointVoxelRCNN'
 ]