diff --git a/.gitignore b/.gitignore
index 27cb9c7cb..2fefc6a90 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,3 +134,4 @@ data/sunrgbd/OFFICIAL_SUNRGBD/
 # Waymo evaluation
 mmdet3d/evaluation/functional/waymo_utils/compute_detection_metrics_main
 mmdet3d/evaluation/functional/waymo_utils/compute_detection_let_metrics_main
+mmdet3d/evaluation/functional/waymo_utils/compute_segmentation_metrics_main
diff --git a/configs/_base_/datasets/waymoD5-3d-3class.py b/configs/_base_/datasets/waymoD5-3d-3class.py
index e5240b629..f8f14998d 100644
--- a/configs/_base_/datasets/waymoD5-3d-3class.py
+++ b/configs/_base_/datasets/waymoD5-3d-3class.py
@@ -89,7 +89,10 @@
             dict(
                 type='PointsRangeFilter', point_cloud_range=point_cloud_range)
         ]),
-    dict(type='Pack3DDetInputs', keys=['points'])
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]
 # construct a pipeline for data and gt loading in show function
 # please keep its loading function consistent with test_pipeline (e.g. client)
@@ -100,7 +103,10 @@
         load_dim=6,
         use_dim=5,
         backend_args=backend_args),
-    dict(type='Pack3DDetInputs', keys=['points']),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]
 
 train_dataloader = dict(
@@ -164,12 +170,7 @@
         backend_args=backend_args))
 
 val_evaluator = dict(
-    type='WaymoMetric',
-    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
-    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
-    data_root='./data/waymo/waymo_format',
-    backend_args=backend_args,
-    convert_kitti_format=False)
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
 test_evaluator = val_evaluator
 
 vis_backends = [dict(type='LocalVisBackend')]
diff --git a/configs/_base_/datasets/waymoD5-3d-car.py b/configs/_base_/datasets/waymoD5-3d-car.py
index f95ac1d81..972e9289b 100644
--- a/configs/_base_/datasets/waymoD5-3d-car.py
+++ b/configs/_base_/datasets/waymoD5-3d-car.py
@@ -62,7 +62,8 @@
     dict(type='PointShuffle'),
     dict(
         type='Pack3DDetInputs',
-        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]
 test_pipeline = [
     dict(
@@ -86,7 +87,10 @@
             dict(
                 type='PointsRangeFilter', point_cloud_range=point_cloud_range)
         ]),
-    dict(type='Pack3DDetInputs', keys=['points'])
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]
 # construct a pipeline for data and gt loading in show function
 # please keep its loading function consistent with test_pipeline (e.g. client)
@@ -161,12 +165,7 @@
         backend_args=backend_args))
 
 val_evaluator = dict(
-    type='WaymoMetric',
-    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
-    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
-    data_root='./data/waymo/waymo_format',
-    convert_kitti_format=False,
-    backend_args=backend_args)
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
 test_evaluator = val_evaluator
 
 vis_backends = [dict(type='LocalVisBackend')]
diff --git a/docs/en/advanced_guides/datasets/waymo.md b/docs/en/advanced_guides/datasets/waymo.md
index 2e52b9dd1..f28ca253b 100644
--- a/docs/en/advanced_guides/datasets/waymo.md
+++ b/docs/en/advanced_guides/datasets/waymo.md
@@ -7,12 +7,7 @@ This page provides specific tutorials about the usage of MMDetection3D for Waymo
 Before preparing Waymo dataset, if you only installed requirements in `requirements/build.txt` and `requirements/runtime.txt` before, please install the official package for this dataset at first by running
 
 ```
-# tf 2.1.0.
-pip install waymo-open-dataset-tf-2-1-0==1.2.0
-# tf 2.0.0
-# pip install waymo-open-dataset-tf-2-0-0==1.2.0
-# tf 1.15.0
-# pip install waymo-open-dataset-tf-1-15-0==1.2.0
+pip install waymo-open-dataset-tf-2-6-0
 ```
 
 or
@@ -38,15 +33,19 @@ mmdetection3d
 │   │   │   ├── validation
 │   │   │   ├── testing
 │   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
 │   │   ├── kitti_format
 │   │   │   ├── ImageSets
 
 ```
 
-You can download Waymo open dataset V1.2 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split txt files into `data/waymo/kitti_format/ImageSets`. Download ground truth bin files for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare Waymo data by running
+You can download Waymo open dataset V1.4 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split txt files into `data/waymo/kitti_format/ImageSets`. Download ground truth bin files for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put them into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare Waymo data by running
 
 ```bash
-python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo
+# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow.
+# The number of `--workers` depends on the maximum number of cores in your CPU.
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4
 ```
 
 Note that if your local disk does not have enough space for saving converted data, you can change the `--out-dir` to anywhere else. Just remember to create folders and prepare data there in advance and link them back to `data/waymo/kitti_format` after the data conversion.
@@ -65,22 +64,16 @@ mmdetection3d
 │   │   │   ├── validation
 │   │   │   ├── testing
 │   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
 │   │   ├── kitti_format
 │   │   │   ├── ImageSets
 │   │   │   ├── training
-│   │   │   │   ├── calib
 │   │   │   │   ├── image_0
 │   │   │   │   ├── image_1
 │   │   │   │   ├── image_2
 │   │   │   │   ├── image_3
 │   │   │   │   ├── image_4
-│   │   │   │   ├── label_0
-│   │   │   │   ├── label_1
-│   │   │   │   ├── label_2
-│   │   │   │   ├── label_3
-│   │   │   │   ├── label_4
-│   │   │   │   ├── label_all
-│   │   │   │   ├── pose
 │   │   │   │   ├── velodyne
 │   │   │   ├── testing
 │   │   │   │   ├── (the same as training)
@@ -93,7 +86,48 @@ mmdetection3d
 
 ```
 
-Here because there are several cameras, we store the corresponding image and labels that can be projected to that camera respectively and save pose for further usage of consecutive frames point clouds. We use a coding way `{a}{bbb}{ccc}` to name the data for each frame, where `a` is the prefix for different split (`0` for training, `1` for validation and `2` for testing), `bbb` for segment index and `ccc` for frame index. You can easily locate the required frame according to this naming rule. We gather the data for training and validation together as KITTI and store the indices for different set in the `ImageSet` files.
+- `kitti_format/training/image_{0-4}/{a}{bbb}{ccc}.jpg`: Because there are several cameras, we store the images of each camera in a separate folder. We use the naming scheme `{a}{bbb}{ccc}` for the data of each frame, where `a` is the prefix for the split (`0` for training, `1` for validation and `2` for testing), `bbb` is the segment index and `ccc` is the frame index. You can easily locate the required frame according to this naming rule. We gather the data for training and validation together as in KITTI and store the indices of the different sets in the `ImageSets` files.
+- `kitti_format/training/velodyne/{a}{bbb}{ccc}.bin`: Point cloud data of each frame.
+- `kitti_format/waymo_gt_database/xxx_{Car/Pedestrian/Cyclist}_x.bin`: Point cloud data included in each 3D bounding box of the training dataset. These point clouds are used in data augmentation, e.g. `ObjectSample`. `xxx` is the index of the training sample and `x` is the index of the object in this frame.
+- `kitti_format/waymo_infos_train.pkl`: Training dataset information, a dict containing two keys: `metainfo` and `data_list`. `metainfo` contains the basic information for the dataset itself, such as `dataset`, `version` and `info_version`, while `data_list` is a list of dicts, where each dict (hereinafter referred to as `info`) contains all the detailed information of a single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list).
+  - info\['timestamp'\]: Timestamp of the sample data.
+  - info\['context_name'\]: The context name of the sample, indicating which `*.tfrecord` segment it was extracted from.
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+  - info\['lidar_sweeps'\]: A list containing the sweep information of the lidar.
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar_path'\]: The lidar data path of i-th sweep.
+    - info\['lidar_sweeps'\]\[i\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+  - info\['images'\]: A dict containing five keys corresponding to each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. Each dict contains all data information related to the corresponding camera.
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
+    - info\['images'\]\['CAM_XXX'\]\['height'\]: The height of the image.
+    - info\['images'\]\['CAM_XXX'\]\['width'\]: The width of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points to each image plane. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from lidar sensor to this camera. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2img'\]: The transformation matrix from lidar sensor to each image plane. (4x4 list)
+  - info\['image_sweeps'\]: A list containing sweeps information of images.
+    - info\['image_sweeps'\]\[i\]\['images'\]\['CAM_XXX'\]\['img_path'\]: The image path of i-th sweep.
+    - info\['image_sweeps'\]\[i\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['image_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+  - info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, w, h, yaw) order.
+    - info\['instances'\]\[i\]\['bbox'\]: List of 4 numbers representing the 2D bounding box of the instance, in (x1, y1, x2, y2) order. (some instances may not have a corresponding 2D bounding box)
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: An int indicating the label of the instance; -1 means the instance is ignored.
+    - info\['instances'\]\[i\]\['bbox_label'\]: An int indicating the label of the instance; -1 means the instance is ignored.
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: Number of lidar points included in each 3D bounding box.
+    - info\['instances'\]\[i\]\['camera_id'\]: The index of the most visible camera for this instance.
+    - info\['instances'\]\[i\]\['group_id'\]: The index of this instance in this sample.
+  - info\['cam_sync_instances'\]: It is a list of dicts. Each dict contains all annotation information of a single instance. Its format is the same as \['instances'\]. However, \['cam_sync_instances'\] is only for the multi-view camera-based 3D object detection task.
+  - info\['cam_instances'\]: It is a dict containing keys `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. For monocular camera-based 3D Object Detection task, we split 3D annotations of the whole scenes according to the camera they belong to. For the i-th instance:
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D bounding box annotation (exterior rectangle of the projected 3D box), a list arranged as \[x1, y1, x2, y2\].
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: Projected center location on the image, a list with shape (2,).
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: The depth of projected center.
 
 ## Training
 
@@ -101,7 +135,7 @@ Considering there are many similar frames in the original dataset, we can basica
 
 ## Evaluation
 
-For evaluation on Waymo, please follow the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md) to build the binary file `compute_detection_metrics_main` for metrics computation and put it into `mmdet3d/core/evaluation/waymo_utils/`.  Basically, you can follow the commands below to install `bazel` and build the file.
+For evaluation on Waymo, please follow the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/r1.3/docs/quick_start.md) to build the binary file `compute_detection_metrics_main` for metrics computation and put it into `mmdet3d/evaluation/functional/waymo_utils/`.  Basically, you can follow the commands below to install `bazel` and build the file.
 
 ```shell
 # download the code and enter the base directory
diff --git a/docs/zh_cn/advanced_guides/datasets/waymo.md b/docs/zh_cn/advanced_guides/datasets/waymo.md
index 577ec1513..8c0f0dfc0 100644
--- a/docs/zh_cn/advanced_guides/datasets/waymo.md
+++ b/docs/zh_cn/advanced_guides/datasets/waymo.md
@@ -7,12 +7,7 @@
 在准备 Waymo 数据集之前，如果您之前只安装了 `requirements/build.txt` 和 `requirements/runtime.txt` 中的依赖，请通过运行如下指令额外安装 Waymo 数据集所依赖的官方包：
 
 ```
-# tf 2.1.0.
-pip install waymo-open-dataset-tf-2-1-0==1.2.0
-# tf 2.0.0
-# pip install waymo-open-dataset-tf-2-0-0==1.2.0
-# tf 1.15.0
-# pip install waymo-open-dataset-tf-1-15-0==1.2.0
+pip install waymo-open-dataset-tf-2-6-0
 ```
 
 或者
@@ -38,6 +33,8 @@ mmdetection3d
 │   │   │   ├── validation
 │   │   │   ├── testing
 │   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
 │   │   ├── kitti_format
 │   │   │   ├── ImageSets
 
@@ -46,7 +43,9 @@ mmdetection3d
 您可以在[这里](https://waymo.com/open/download/)下载 1.2 版本的 Waymo 公开数据集，并在[这里](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing)下载其训练/验证/测试集拆分文件。接下来，请将 `tfrecord` 文件放入 `data/waymo/waymo_format/` 下的对应文件夹，并将 txt 格式的数据集拆分文件放入 `data/waymo/kitti_format/ImageSets`。在[这里](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects)下载验证集使用的 bin 格式真实标注 (Ground Truth) 文件并放入 `data/waymo/waymo_format/`。小窍门：您可以使用 `gsutil` 来在命令行下载大规模数据集。您可以将该[工具](https://github.com/RalphMao/Waymo-Dataset-Tool) 作为一个例子来查看更多细节。之后，通过运行如下指令准备 Waymo 数据：
 
 ```bash
-python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo
+# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow.
+# The number of `--workers` depends on the maximum number of cores in your CPU.
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4
 ```
 
 请注意，如果您的本地磁盘没有足够空间保存转换后的数据，您可以将 `--out-dir` 改为其他目录；只要在创建文件夹、准备数据并转换格式后，将数据文件链接到 `data/waymo/kitti_format` 即可。
@@ -65,22 +64,16 @@ mmdetection3d
 │   │   │   ├── validation
 │   │   │   ├── testing
 │   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
 │   │   ├── kitti_format
 │   │   │   ├── ImageSets
 │   │   │   ├── training
-│   │   │   │   ├── calib
 │   │   │   │   ├── image_0
 │   │   │   │   ├── image_1
 │   │   │   │   ├── image_2
 │   │   │   │   ├── image_3
 │   │   │   │   ├── image_4
-│   │   │   │   ├── label_0
-│   │   │   │   ├── label_1
-│   │   │   │   ├── label_2
-│   │   │   │   ├── label_3
-│   │   │   │   ├── label_4
-│   │   │   │   ├── label_all
-│   │   │   │   ├── pose
 │   │   │   │   ├── velodyne
 │   │   │   ├── testing
 │   │   │   │   ├── (the same as training)
@@ -93,7 +86,48 @@ mmdetection3d
 
 ```
 
-因为 Waymo 数据的来源包含数个相机，这里我们将每个相机对应的图像和标签文件分别存储，并将相机位姿 (pose) 文件存储下来以供后续处理连续多帧的点云。我们使用 `{a}{bbb}{ccc}` 的名称编码方式为每帧数据命名，其中 `a` 是不同数据拆分的前缀（`0` 指代训练集，`1` 指代验证集，`2` 指代测试集），`bbb` 是分割部分 (segment) 的索引，而 `ccc` 是帧索引。您可以轻而易举地按照如上命名规则定位到所需的帧。我们将训练和验证所需数据按 KITTI 的方式集合在一起，然后将训练集/验证集/测试集的索引存储在 `ImageSet` 下的文件中。
+- `kitti_format/training/image_{0-4}/{a}{bbb}{ccc}.jpg` 因为 Waymo 数据的来源包含数个相机，这里我们将每个相机对应的图像分别存储。我们使用 `{a}{bbb}{ccc}` 的名称编码方式为每帧数据命名，其中 `a` 是不同数据拆分的前缀（`0` 指代训练集，`1` 指代验证集，`2` 指代测试集），`bbb` 是分割部分 (segment) 的索引，而 `ccc` 是帧索引。您可以轻而易举地按照如上命名规则定位到所需的帧。我们将训练和验证所需数据按 KITTI 的方式集合在一起，然后将训练集/验证集/测试集的索引存储在 `ImageSets` 下的文件中。
+- `kitti_format/training/velodyne/{a}{bbb}{ccc}.bin` 当前样本的点云数据
+- `kitti_format/waymo_gt_database/xxx_{Car/Pedestrian/Cyclist}_x.bin`. 训练数据集的每个 3D 包围框中包含的点云数据。这些点云会在数据增强中被使用，例如 `ObjectSample`。`xxx` 表示训练样本的索引，`x` 表示实例在当前样本中的索引。
+- `kitti_format/waymo_infos_train.pkl`. 训练数据集信息，该字典包含了两个键值：`metainfo` 和 `data_list`。`metainfo` 包含数据集的基本信息，例如 `dataset`、`version` 和 `info_version`。`data_list` 是由字典组成的列表，每个字典（以下简称 `info`）包含了单个样本的所有详细信息：
+  - info\['sample_idx'\]: 样本在整个数据集的索引。
+  - info\['ego2global'\]: 自车到全局坐标的变换矩阵。（4x4 列表）
+  - info\['timestamp'\]: 样本数据时间戳。
+  - info\['context_name'\]: 语境名，表示样本从哪个 `*.tfrecord` 片段中提取的。
+  - info\['lidar_points'\]: 是一个字典，包含了所有与激光雷达点相关的信息。
+    - info\['lidar_points'\]\['lidar_path'\]: 激光雷达点云数据的文件名。
+    - info\['lidar_points'\]\['num_pts_feats'\]: 点的特征维度。
+  - info\['lidar_sweeps'\]: 是一个列表，包含了历史帧信息。
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar_path'\]: 第 i 帧的激光雷达数据的文件路径。
+    - info\['lidar_sweeps'\]\[i\]\['ego2global'\]: 第 i 帧的自车到全局坐标的变换矩阵。（4x4 列表）
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: 第 i 帧的样本数据时间戳。
+  - info\['images'\]: 是一个字典，包含与每个相机对应的五个键值：`'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`。每个字典包含了对应相机的所有数据信息。
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: 图像的文件名。
+    - info\['images'\]\['CAM_XXX'\]\['height'\]: 图像的高
+    - info\['images'\]\['CAM_XXX'\]\['width'\]: 图像的宽
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: 当 3D 点投影到图像平面时需要的内参信息相关的变换矩阵。（4x4 列表）
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: 激光雷达传感器到该相机的变换矩阵。（4x4 列表）
+    - info\['images'\]\['CAM_XXX'\]\['lidar2img'\]: 激光雷达传感器到图像平面的变换矩阵。（4x4 列表）
+  - info\['image_sweeps'\]: 是一个列表，包含了历史帧信息。
+    - info\['image_sweeps'\]\[i\]\['images'\]\['CAM_XXX'\]\['img_path'\]: 第 i 帧的图像的文件名。
+    - info\['image_sweeps'\]\[i\]\['ego2global'\]: 第 i 帧的自车到全局坐标的变换矩阵。（4x4 列表）
+    - info\['image_sweeps'\]\[i\]\['timestamp'\]: 第 i 帧的样本数据时间戳。
+  - info\['instances'\]: 是一个字典组成的列表。每个字典包含单个实例的所有标注信息。对于其中的第 i 个实例，我们有：
+    - info\['instances'\]\[i\]\['bbox_3d'\]: 长度为 7 的列表，以 (x, y, z, l, w, h, yaw) 的顺序表示实例的 3D 边界框。
+    - info\['instances'\]\[i\]\['bbox'\]: 2D 边界框标注，顺序为 \[x1, y1, x2, y2\] 的列表。有些实例可能没有对应的 2D 边界框标注。
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: 整数表示实例的标签，-1 代表忽略。
+    - info\['instances'\]\[i\]\['bbox_label'\]: 整数表示实例的标签，-1 代表忽略。
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: 每个 3D 边界框内包含的激光雷达点数。
+    - info\['instances'\]\[i\]\['camera_id'\]: 当前实例最可见相机的索引。
+    - info\['instances'\]\[i\]\['group_id'\]: 当前实例在当前样本中的索引。
+  - info\['cam_sync_instances'\]: 是一个字典组成的列表。每个字典包含单个实例的所有标注信息。它的形式与 \['instances'\] 相同。但是，\['cam_sync_instances'\] 专门用于基于多视角相机的三维目标检测任务。
+  - info\['cam_instances'\]: 是一个字典，包含以下键值：`'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`。对于基于视觉的 3D 目标检测任务，我们将整个场景的 3D 标注划分至它们所属于的相应相机中。对于其中的第 i 个实例，我们有：
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: 长度为 7 的列表，以 (x, y, z, l, h, w, yaw) 的顺序表示实例的 3D 边界框。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D 边界框标注（3D 框投影的矩形框），顺序为 \[x1, y1, x2, y2\] 的列表。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: 实例标签。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: 实例标签。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: 3D 框投影到图像上的中心点，大小为 (2, ) 的列表。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: 3D 框投影中心的深度。
 
 ## 训练
 
diff --git a/mmdet3d/datasets/det3d_dataset.py b/mmdet3d/datasets/det3d_dataset.py
index 11caae472..c701a893f 100644
--- a/mmdet3d/datasets/det3d_dataset.py
+++ b/mmdet3d/datasets/det3d_dataset.py
@@ -113,7 +113,7 @@ def __init__(self,
                 ori_label = self.METAINFO['classes'].index(name)
                 self.label_mapping[ori_label] = label_idx
 
-            self.num_ins_per_cat = {name: 0 for name in metainfo['classes']}
+            self.num_ins_per_cat = [0] * len(metainfo['classes'])
         else:
             self.label_mapping = {
                 i: i
@@ -121,10 +121,7 @@ def __init__(self,
             }
             self.label_mapping[-1] = -1
 
-            self.num_ins_per_cat = {
-                name: 0
-                for name in self.METAINFO['classes']
-            }
+            self.num_ins_per_cat = [0] * len(self.METAINFO['classes'])
 
         super().__init__(
             ann_file=ann_file,
@@ -146,9 +143,12 @@ def __init__(self,
 
             # show statistics of this dataset
             print_log('-' * 30, 'current')
-            print_log(f'The length of the dataset: {len(self)}', 'current')
+            print_log(
+                f'The length of {"test" if self.test_mode else "training"} dataset: {len(self)}',  # noqa: E501
+                'current')
             content_show = [['category', 'number']]
-            for cat_name, num in self.num_ins_per_cat.items():
+            for label, num in enumerate(self.num_ins_per_cat):
+                cat_name = self.metainfo['classes'][label]
                 content_show.append([cat_name, num])
             table = AsciiTable(content_show)
             print_log(
@@ -256,8 +256,7 @@ def parse_ann_info(self, info: dict) -> Union[dict, None]:
 
             for label in ann_info['gt_labels_3d']:
                 if label != -1:
-                    cat_name = self.metainfo['classes'][label]
-                    self.num_ins_per_cat[cat_name] += 1
+                    self.num_ins_per_cat[label] += 1
 
         return ann_info
 
diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py
index 5b3a83824..cda27e42e 100644
--- a/mmdet3d/datasets/waymo_dataset.py
+++ b/mmdet3d/datasets/waymo_dataset.py
@@ -3,9 +3,11 @@
 from typing import Callable, List, Union
 
 import numpy as np
+from mmengine import print_log
+from mmengine.fileio import load
 
 from mmdet3d.registry import DATASETS
-from mmdet3d.structures import CameraInstance3DBoxes
+from mmdet3d.structures import CameraInstance3DBoxes, LiDARInstance3DBoxes
 from .det3d_dataset import Det3DDataset
 from .kitti_dataset import KittiDataset
 
@@ -163,13 +165,10 @@ def parse_ann_info(self, info: dict) -> dict:
             centers_2d = np.zeros((0, 2), dtype=np.float32)
             depths = np.zeros((0), dtype=np.float32)
 
-        # in waymo, lidar2cam = R0_rect @ Tr_velo_to_cam
-        # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam`
-        lidar2cam = np.array(info['images'][self.default_cam_key]['lidar2cam'])
-        gt_bboxes_3d = CameraInstance3DBoxes(
-            ann_info['gt_bboxes_3d']).convert_to(self.box_mode_3d,
-                                                 np.linalg.inv(lidar2cam))
-        ann_info['gt_bboxes_3d'] = gt_bboxes_3d
+        if self.load_type == 'frame_based':
+            gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d'])
+        else:
+            gt_bboxes_3d = CameraInstance3DBoxes(ann_info['gt_bboxes_3d'])
 
         anns_results = dict(
             gt_bboxes_3d=gt_bboxes_3d,
@@ -182,9 +181,58 @@ def parse_ann_info(self, info: dict) -> dict:
         return anns_results
 
     def load_data_list(self) -> List[dict]:
-        """Add the load interval."""
-        data_list = super().load_data_list()
-        data_list = data_list[::self.load_interval]
+        """Add the load interval.
+
+        Returns:
+            list[dict]: A list of annotation.
+        """  # noqa: E501
+        # `self.ann_file` denotes the absolute annotation file path if
+        # `self.root=None` or relative path if `self.root=/path/to/data/`.
+        annotations = load(self.ann_file)
+        if not isinstance(annotations, dict):
+            raise TypeError(f'The annotations loaded from annotation file '
+                            f'should be a dict, but got {type(annotations)}!')
+        if 'data_list' not in annotations or 'metainfo' not in annotations:
+            raise ValueError('Annotation must have data_list and metainfo '
+                             'keys')
+        metainfo = annotations['metainfo']
+        raw_data_list = annotations['data_list']
+        raw_data_list = raw_data_list[::self.load_interval]
+        if self.load_interval > 1:
+            print_log(
+                f'Sample size will be reduced to 1/{self.load_interval} of'
+                ' the original data sample',
+                logger='current')
+
+        # Meta information load from annotation file will not influence the
+        # existed meta information load from `BaseDataset.METAINFO` and
+        # `metainfo` arguments defined in constructor.
+        for k, v in metainfo.items():
+            self._metainfo.setdefault(k, v)
+
+        # load and parse data_infos.
+        data_list = []
+        for raw_data_info in raw_data_list:
+            # parse raw data information to target format
+            data_info = self.parse_data_info(raw_data_info)
+            if isinstance(data_info, dict):
+                # For image tasks, `data_info` should be the information of a
+                # single image, such as dict(img_path='xxx', width=360, ...)
+                data_list.append(data_info)
+            elif isinstance(data_info, list):
+                # For video tasks, `data_info` could contain image
+                # information of multiple frames, such as
+                # [dict(video_path='xxx', timestamps=...),
+                #  dict(video_path='xxx', timestamps=...)]
+                for item in data_info:
+                    if not isinstance(item, dict):
+                        raise TypeError('data_info must be list of dict, but '
+                                        f'got {type(item)}')
+                data_list.extend(data_info)
+            else:
+                raise TypeError('data_info should be a dict or list of dict, '
+                                f'but got {type(data_info)}')
+
         return data_list
 
     def parse_data_info(self, info: dict) -> Union[dict, List[dict]]:
@@ -203,44 +251,39 @@ def parse_data_info(self, info: dict) -> Union[dict, List[dict]]:
                 info['images'][self.default_cam_key]
             info['images'] = new_image_info
             info['instances'] = info['cam_instances'][self.default_cam_key]
-            return super().parse_data_info(info)
+            return Det3DDataset.parse_data_info(self, info)
         else:
             # in the mono3d, the instances is from cam sync.
+            # Convert frame-based infos to multi-view image-based
             data_list = []
-            if self.modality['use_lidar']:
-                info['lidar_points']['lidar_path'] =  \
-                    osp.join(
-                        self.data_prefix.get('pts', ''),
-                        info['lidar_points']['lidar_path'])
-
-            if self.modality['use_camera']:
-                for cam_key, img_info in info['images'].items():
-                    if 'img_path' in img_info:
-                        cam_prefix = self.data_prefix.get(cam_key, '')
-                        img_info['img_path'] = osp.join(
-                            cam_prefix, img_info['img_path'])
-
             for (cam_key, img_info) in info['images'].items():
                 camera_info = dict()
+                camera_info['sample_idx'] = info['sample_idx']
+                camera_info['timestamp'] = info['timestamp']
+                camera_info['context_name'] = info['context_name']
                 camera_info['images'] = dict()
                 camera_info['images'][cam_key] = img_info
-                if 'cam_instances' in info \
-                        and cam_key in info['cam_instances']:
-                    camera_info['instances'] = info['cam_instances'][cam_key]
+                if 'img_path' in img_info:
+                    cam_prefix = self.data_prefix.get(cam_key, '')
+                    camera_info['images'][cam_key]['img_path'] = osp.join(
+                        cam_prefix, img_info['img_path'])
+                if 'lidar2cam' in img_info:
+                    camera_info['lidar2cam'] = np.array(img_info['lidar2cam'])
+                if 'cam2img' in img_info:
+                    camera_info['cam2img'] = np.array(img_info['cam2img'])
+                if 'lidar2img' in img_info:
+                    camera_info['lidar2img'] = np.array(img_info['lidar2img'])
                 else:
-                    camera_info['instances'] = []
-                camera_info['ego2global'] = info['ego2global']
-                if 'image_sweeps' in info:
-                    camera_info['image_sweeps'] = info['image_sweeps']
-
-                # TODO check if need to modify the sample id
-                # TODO check when will use it except for evaluation.
-                camera_info['sample_idx'] = info['sample_idx']
+                    camera_info['lidar2img'] = camera_info[
+                        'cam2img'] @ camera_info['lidar2cam']
 
                 if not self.test_mode:
                     # used in training
+                    camera_info['instances'] = info['cam_instances'][cam_key]
                     camera_info['ann_info'] = self.parse_ann_info(camera_info)
                 if self.test_mode and self.load_eval_anns:
-                    info['eval_ann_info'] = self.parse_ann_info(info)
+                    camera_info['instances'] = info['cam_instances'][cam_key]
+                    camera_info['eval_ann_info'] = self.parse_ann_info(
+                        camera_info)
                 data_list.append(camera_info)
             return data_list
diff --git a/mmdet3d/engine/hooks/visualization_hook.py b/mmdet3d/engine/hooks/visualization_hook.py
index ffec1addc..9de46d969 100644
--- a/mmdet3d/engine/hooks/visualization_hook.py
+++ b/mmdet3d/engine/hooks/visualization_hook.py
@@ -78,11 +78,11 @@ def __init__(self,
                           'needs to be excluded.')
         self.vis_task = vis_task
 
-        if wait_time == -1:
+        if show and wait_time == -1:
             print_log(
                 'Manual control mode, press [Right] to next sample.',
                 logger='current')
-        else:
+        elif show:
             print_log(
                 'Autoplay mode, press [SPACE] to pause.', logger='current')
         self.wait_time = wait_time
diff --git a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
index b9da8043d..3c79d6f6c 100644
--- a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
+++ b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
@@ -4,7 +4,6 @@
 """
 
 try:
-    from waymo_open_dataset import dataset_pb2 as open_dataset
     from waymo_open_dataset import label_pb2
     from waymo_open_dataset.protos import metrics_pb2
     from waymo_open_dataset.protos.metrics_pb2 import Objects
@@ -14,13 +13,10 @@
         'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" '
         'to install the official devkit first.')
 
-from glob import glob
-from os.path import join
-from typing import List, Optional
+from typing import List
 
 import mmengine
-import numpy as np
-import tensorflow as tf
+from mmengine import print_log
 
 
 class Prediction2Waymo(object):
@@ -32,54 +28,22 @@ class Prediction2Waymo(object):
 
     Args:
         results (list[dict]): Prediction results.
-        waymo_tfrecords_dir (str): Directory to load waymo raw data.
         waymo_results_save_dir (str): Directory to save converted predictions
             in waymo format (.bin files).
         waymo_results_final_path (str): Path to save combined
             predictions in waymo format (.bin file), like 'a/b/c.bin'.
-        prefix (str): Prefix of filename. In general, 0 for training, 1 for
-            validation and 2 for testing.
-        classes (dict): A list of class name.
-        workers (str): Number of parallel processes. Defaults to 2.
-        backend_args (dict, optional): Arguments to instantiate the
-            corresponding backend. Defaults to None.
-        from_kitti_format (bool, optional): Whether the reuslts are kitti
-            format. Defaults to False.
-        idx2metainfo (Optional[dict], optional): The mapping from sample_idx to
-            metainfo. The metainfo must contain the keys: 'idx2contextname' and
-            'idx2timestamp'. Defaults to None.
+        num_workers (int): Number of parallel processes. Defaults to 4.
     """
 
     def __init__(self,
                  results: List[dict],
-                 waymo_tfrecords_dir: str,
-                 waymo_results_save_dir: str,
                  waymo_results_final_path: str,
-                 prefix: str,
                  classes: dict,
-                 workers: int = 2,
-                 backend_args: Optional[dict] = None,
-                 from_kitti_format: bool = False,
-                 idx2metainfo: Optional[dict] = None):
-
+                 num_workers: int = 4):
         self.results = results
-        self.waymo_tfrecords_dir = waymo_tfrecords_dir
-        self.waymo_results_save_dir = waymo_results_save_dir
         self.waymo_results_final_path = waymo_results_final_path
-        self.prefix = prefix
         self.classes = classes
-        self.workers = int(workers)
-        self.backend_args = backend_args
-        self.from_kitti_format = from_kitti_format
-        if idx2metainfo is not None:
-            self.idx2metainfo = idx2metainfo
-            # If ``fast_eval``, the metainfo does not need to be read from
-            # original data online. It's preprocessed offline.
-            self.fast_eval = True
-        else:
-            self.fast_eval = False
-
-        self.name2idx = {}
+        self.num_workers = num_workers
 
         self.k2w_cls_map = {
             'Car': label_pb2.Label.TYPE_VEHICLE,
@@ -88,213 +52,23 @@ def __init__(self,
             'Cyclist': label_pb2.Label.TYPE_CYCLIST,
         }
 
-        if self.from_kitti_format:
-            self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0],
-                                                [-1.0, 0.0, 0.0, 0.0],
-                                                [0.0, -1.0, 0.0, 0.0],
-                                                [0.0, 0.0, 0.0, 1.0]])
-            # ``sample_idx`` of the sample in kitti-format is an array
-            for idx, result in enumerate(results):
-                if len(result['sample_idx']) > 0:
-                    self.name2idx[str(result['sample_idx'][0])] = idx
-        else:
-            # ``sample_idx`` of the sample in the original prediction
-            # is an int value.
-            for idx, result in enumerate(results):
-                self.name2idx[str(result['sample_idx'])] = idx
-
-        if not self.fast_eval:
-            # need to read original '.tfrecord' file
-            self.get_file_names()
-            # turn on eager execution for older tensorflow versions
-            if int(tf.__version__.split('.')[0]) < 2:
-                tf.enable_eager_execution()
-
-        self.create_folder()
-
-    def get_file_names(self):
-        """Get file names of waymo raw data."""
-        if 'path_mapping' in self.backend_args:
-            for path in self.backend_args['path_mapping'].keys():
-                if path in self.waymo_tfrecords_dir:
-                    self.waymo_tfrecords_dir = \
-                        self.waymo_tfrecords_dir.replace(
-                            path, self.backend_args['path_mapping'][path])
-            from petrel_client.client import Client
-            client = Client()
-            contents = client.list(self.waymo_tfrecords_dir)
-            self.waymo_tfrecord_pathnames = list()
-            for content in sorted(list(contents)):
-                if content.endswith('tfrecord'):
-                    self.waymo_tfrecord_pathnames.append(
-                        join(self.waymo_tfrecords_dir, content))
-        else:
-            self.waymo_tfrecord_pathnames = sorted(
-                glob(join(self.waymo_tfrecords_dir, '*.tfrecord')))
-        print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.')
-
-    def create_folder(self):
-        """Create folder for data conversion."""
-        mmengine.mkdir_or_exist(self.waymo_results_save_dir)
-
-    def parse_objects(self, kitti_result, T_k2w, context_name,
-                      frame_timestamp_micros):
-        """Parse one prediction with several instances in kitti format and
-        convert them to `Object` proto.
-
-        Args:
-            kitti_result (dict): Predictions in kitti format.
-
-                - name (np.ndarray): Class labels of predictions.
-                - dimensions (np.ndarray): Height, width, length of boxes.
-                - location (np.ndarray): Bottom center of boxes (x, y, z).
-                - rotation_y (np.ndarray): Orientation of boxes.
-                - score (np.ndarray): Scores of predictions.
-            T_k2w (np.ndarray): Transformation matrix from kitti to waymo.
-            context_name (str): Context name of the frame.
-            frame_timestamp_micros (int): Frame timestamp.
-
-        Returns:
-            :obj:`Object`: Predictions in waymo dataset Object proto.
-        """
-
-        def parse_one_object(instance_idx):
-            """Parse one instance in kitti format and convert them to `Object`
-            proto.
-
-            Args:
-                instance_idx (int): Index of the instance to be converted.
-
-            Returns:
-                :obj:`Object`: Predicted instance in waymo dataset
-                    Object proto.
-            """
-            cls = kitti_result['name'][instance_idx]
-            length = round(kitti_result['dimensions'][instance_idx, 0], 4)
-            height = round(kitti_result['dimensions'][instance_idx, 1], 4)
-            width = round(kitti_result['dimensions'][instance_idx, 2], 4)
-            x = round(kitti_result['location'][instance_idx, 0], 4)
-            y = round(kitti_result['location'][instance_idx, 1], 4)
-            z = round(kitti_result['location'][instance_idx, 2], 4)
-            rotation_y = round(kitti_result['rotation_y'][instance_idx], 4)
-            score = round(kitti_result['score'][instance_idx], 4)
-
-            # y: downwards; move box origin from bottom center (kitti) to
-            # true center (waymo)
-            y -= height / 2
-            # frame transformation: kitti -> waymo
-            x, y, z = self.transform(T_k2w, x, y, z)
-
-            # different conventions
-            heading = -(rotation_y + np.pi / 2)
-            while heading < -np.pi:
-                heading += 2 * np.pi
-            while heading > np.pi:
-                heading -= 2 * np.pi
-
-            box = label_pb2.Label.Box()
-            box.center_x = x
-            box.center_y = y
-            box.center_z = z
-            box.length = length
-            box.width = width
-            box.height = height
-            box.heading = heading
-
-            o = metrics_pb2.Object()
-            o.object.box.CopyFrom(box)
-            o.object.type = self.k2w_cls_map[cls]
-            o.score = score
-
-            o.context_name = context_name
-            o.frame_timestamp_micros = frame_timestamp_micros
-
-            return o
-
-        objects = metrics_pb2.Objects()
-
-        for instance_idx in range(len(kitti_result['name'])):
-            o = parse_one_object(instance_idx)
-            objects.objects.append(o)
-
-        return objects
-
-    def convert_one(self, file_idx):
-        """Convert action for single file.
-
-        Args:
-            file_idx (int): Index of the file to be converted.
-        """
-        file_pathname = self.waymo_tfrecord_pathnames[file_idx]
-        if 's3://' in file_pathname and tf.__version__ >= '2.6.0':
-            try:
-                import tensorflow_io as tfio  # noqa: F401
-            except ImportError:
-                raise ImportError(
-                    "Please run 'pip install tensorflow-io' to install tensorflow_io first."  # noqa: E501
-                )
-        file_data = tf.data.TFRecordDataset(file_pathname, compression_type='')
-
-        for frame_num, frame_data in enumerate(file_data):
-            frame = open_dataset.Frame()
-            frame.ParseFromString(bytearray(frame_data.numpy()))
-
-            filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}'
-
-            context_name = frame.context.name
-            frame_timestamp_micros = frame.timestamp_micros
-
-            if filename in self.name2idx:
-                if self.from_kitti_format:
-                    for camera in frame.context.camera_calibrations:
-                        # FRONT = 1, see dataset.proto for details
-                        if camera.name == 1:
-                            T_front_cam_to_vehicle = np.array(
-                                camera.extrinsic.transform).reshape(4, 4)
-
-                    T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam
-
-                    kitti_result = \
-                        self.results[self.name2idx[filename]]
-                    objects = self.parse_objects(kitti_result, T_k2w,
-                                                 context_name,
-                                                 frame_timestamp_micros)
-                else:
-                    index = self.name2idx[filename]
-                    objects = self.parse_objects_from_origin(
-                        self.results[index], context_name,
-                        frame_timestamp_micros)
-
-            else:
-                print(filename, 'not found.')
-                objects = metrics_pb2.Objects()
-
-            with open(
-                    join(self.waymo_results_save_dir, f'{filename}.bin'),
-                    'wb') as f:
-                f.write(objects.SerializeToString())
-
-    def convert_one_fast(self, res_index: int):
+    def convert_one(self, res_idx: int):
         """Convert action for single file. It read the metainfo from the
         preprocessed file offline and will be faster.
 
         Args:
-            res_index (int): The indices of the results.
+            res_idx (int): The index of the result to convert.
         """
-        sample_idx = self.results[res_index]['sample_idx']
-        if len(self.results[res_index]['pred_instances_3d']) > 0:
+        sample_idx = self.results[res_idx]['sample_idx']
+        if len(self.results[res_idx]['labels_3d']) > 0:
             objects = self.parse_objects_from_origin(
-                self.results[res_index],
-                self.idx2metainfo[str(sample_idx)]['contextname'],
-                self.idx2metainfo[str(sample_idx)]['timestamp'])
+                self.results[res_idx], self.results[res_idx]['context_name'],
+                self.results[res_idx]['timestamp'])
         else:
             print(sample_idx, 'not found.')
             objects = metrics_pb2.Objects()
 
-        with open(
-                join(self.waymo_results_save_dir, f'{sample_idx}.bin'),
-                'wb') as f:
-            f.write(objects.SerializeToString())
+        return objects
 
     def parse_objects_from_origin(self, result: dict, contextname: str,
                                   timestamp: str) -> Objects:
@@ -308,112 +82,56 @@ def parse_objects_from_origin(self, result: dict, contextname: str,
         Returns:
             metrics_pb2.Objects: The parsed object.
         """
-        lidar_boxes = result['pred_instances_3d']['bboxes_3d'].tensor
-        scores = result['pred_instances_3d']['scores_3d']
-        labels = result['pred_instances_3d']['labels_3d']
-
-        def parse_one_object(index):
-            class_name = self.classes[labels[index].item()]
+        lidar_boxes = result['bboxes_3d']
+        scores = result['scores_3d']
+        labels = result['labels_3d']
 
+        objects = metrics_pb2.Objects()
+        for lidar_box, score, label in zip(lidar_boxes, scores, labels):
+            # Parse one object
             box = label_pb2.Label.Box()
-            height = lidar_boxes[index][5].item()
-            heading = lidar_boxes[index][6].item()
-
-            while heading < -np.pi:
-                heading += 2 * np.pi
-            while heading > np.pi:
-                heading -= 2 * np.pi
-
-            box.center_x = lidar_boxes[index][0].item()
-            box.center_y = lidar_boxes[index][1].item()
-            box.center_z = lidar_boxes[index][2].item() + height / 2
-            box.length = lidar_boxes[index][3].item()
-            box.width = lidar_boxes[index][4].item()
+            height = lidar_box[5]
+            heading = lidar_box[6]
+
+            box.center_x = lidar_box[0]
+            box.center_y = lidar_box[1]
+            box.center_z = lidar_box[2] + height / 2
+            box.length = lidar_box[3]
+            box.width = lidar_box[4]
             box.height = height
             box.heading = heading
 
-            o = metrics_pb2.Object()
-            o.object.box.CopyFrom(box)
-            o.object.type = self.k2w_cls_map[class_name]
-            o.score = scores[index].item()
-            o.context_name = contextname
-            o.frame_timestamp_micros = timestamp
+            object = metrics_pb2.Object()
+            object.object.box.CopyFrom(box)
 
-            return o
-
-        objects = metrics_pb2.Objects()
-        for i in range(len(lidar_boxes)):
-            objects.objects.append(parse_one_object(i))
+            class_name = self.classes[label]
+            object.object.type = self.k2w_cls_map[class_name]
+            object.score = score
+            object.context_name = contextname
+            object.frame_timestamp_micros = timestamp
+            objects.objects.append(object)
 
         return objects
 
     def convert(self):
         """Convert action."""
-        print('Start converting ...')
-        convert_func = self.convert_one_fast if self.fast_eval else \
-            self.convert_one
+        print_log('Start converting ...', logger='current')
 
-        # from torch.multiprocessing import set_sharing_strategy
-        # # Force using "file_system" sharing strategy for stability
-        # set_sharing_strategy("file_system")
+        # TODO: use parallel processes.
+        # objects_list = mmengine.track_parallel_progress(
+        #     self.convert_one, range(len(self)), self.num_workers)
 
-        # mmengine.track_parallel_progress(convert_func, range(len(self)),
-        #                                  self.workers)
+        objects_list = mmengine.track_progress(self.convert_one,
+                                               range(len(self)))
 
-        # TODO: Support multiprocessing. Now, multiprocessing evaluation will
-        # cause shared memory error in torch-1.10 and torch-1.11. Details can
-        # be seen in https://github.com/pytorch/pytorch/issues/67864.
-        prog_bar = mmengine.ProgressBar(len(self))
-        for i in range(len(self)):
-            convert_func(i)
-            prog_bar.update()
-
-        print('\nFinished ...')
-
-        # combine all files into one .bin
-        pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin')))
-        combined = self.combine(pathnames)
+        combined = metrics_pb2.Objects()
+        for objects in objects_list:
+            for o in objects.objects:
+                combined.objects.append(o)
 
         with open(self.waymo_results_final_path, 'wb') as f:
             f.write(combined.SerializeToString())
 
     def __len__(self):
         """Length of the filename list."""
-        return len(self.results) if self.fast_eval else len(
-            self.waymo_tfrecord_pathnames)
-
-    def transform(self, T, x, y, z):
-        """Transform the coordinates with matrix T.
-
-        Args:
-            T (np.ndarray): Transformation matrix.
-            x(float): Coordinate in x axis.
-            y(float): Coordinate in y axis.
-            z(float): Coordinate in z axis.
-
-        Returns:
-            list: Coordinates after transformation.
-        """
-        pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1)
-        pt_aft = np.matmul(T, pt_bef)
-        return pt_aft[:3].flatten().tolist()
-
-    def combine(self, pathnames):
-        """Combine predictions in waymo format for each sample together.
-
-        Args:
-            pathnames (str): Paths to save predictions.
-
-        Returns:
-            :obj:`Objects`: Combined predictions in Objects proto.
-        """
-        combined = metrics_pb2.Objects()
-
-        for pathname in pathnames:
-            objects = metrics_pb2.Objects()
-            with open(pathname, 'rb') as f:
-                objects.ParseFromString(f.read())
-            for o in objects.objects:
-                combined.objects.append(o)
-
-        return combined
+        return len(self.results)
diff --git a/mmdet3d/evaluation/metrics/waymo_metric.py b/mmdet3d/evaluation/metrics/waymo_metric.py
index 0dd69a5c2..cdbc4a58d 100644
--- a/mmdet3d/evaluation/metrics/waymo_metric.py
+++ b/mmdet3d/evaluation/metrics/waymo_metric.py
@@ -1,54 +1,30 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import tempfile
 from os import path as osp
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union
 
-import mmengine
 import numpy as np
 import torch
-from mmengine import Config, load
+from mmengine import Config
+from mmengine.device import get_device
+from mmengine.evaluator import BaseMetric
 from mmengine.logging import MMLogger, print_log
 
 from mmdet3d.models.layers import box3d_multiclass_nms
 from mmdet3d.registry import METRICS
 from mmdet3d.structures import (Box3DMode, CameraInstance3DBoxes,
-                                LiDARInstance3DBoxes, bbox3d2result,
-                                points_cam2img, xywhr2xyxyr)
-from .kitti_metric import KittiMetric
+                                LiDARInstance3DBoxes, points_cam2img,
+                                xywhr2xyxyr)
 
 
 @METRICS.register_module()
-class WaymoMetric(KittiMetric):
+class WaymoMetric(BaseMetric):
     """Waymo evaluation metric.
 
     Args:
-        ann_file (str): The path of the annotation file in kitti format.
         waymo_bin_file (str): The path of the annotation file in waymo format.
-        data_root (str): Path of dataset root. Used for storing waymo
-            evaluation programs.
-        split (str): The split of the evaluation set. Defaults to 'training'.
         metric (str or List[str]): Metrics to be evaluated. Defaults to 'mAP'.
-        pcd_limit_range (List[float]): The range of point cloud used to filter
-            invalid predicted boxes. Defaults to [-85, -85, -5, 85, 85, 5].
-        convert_kitti_format (bool): Whether to convert the results to kitti
-            format. Now, in order to be compatible with camera-based methods,
-            defaults to True.
-        prefix (str, optional): The prefix that will be added in the metric
-            names to disambiguate homonymous metrics of different evaluators.
-            If prefix is not provided in the argument, self.default_prefix will
-            be used instead. Defaults to None.
-        format_only (bool): Format the output results without perform
-            evaluation. It is useful when you want to format the result to a
-            specific format and submit it to the test server.
-            Defaults to False.
-        pklfile_prefix (str, optional): The prefix of pkl files, including the
-            file path and the prefix of filename, e.g., "a/b/prefix". If not
-            specified, a temp file will be created. Defaults to None.
-        submission_prefix (str, optional): The prefix of submission data. If
-            not specified, the submission data will not be generated.
-            Defaults to None.
         load_type (str): Type of loading mode during training.
-
             - 'frame_based': Load all of the instances in the frame.
             - 'mv_image_based': Load all of the instances in the frame and need
               to convert to the FOV-based data type to support image-based
@@ -56,73 +32,98 @@ class WaymoMetric(KittiMetric):
             - 'fov_image_based': Only load the instances inside the default cam
               and need to convert to the FOV-based data type to support image-
               based detector.
-        default_cam_key (str): The default camera for lidar to camera
-            conversion. By default, KITTI: 'CAM2', Waymo: 'CAM_FRONT'.
-            Defaults to 'CAM_FRONT'.
-        use_pred_sample_idx (bool): In formating results, use the sample index
-            from the prediction or from the load annotations. By default,
-            KITTI: True, Waymo: False, Waymo has a conversion process, which
-            needs to use the sample idx from load annotation.
-            Defaults to False.
-        collect_device (str): Device name used for collecting results from
-            different ranks during distributed training. Must be 'cpu' or
-            'gpu'. Defaults to 'cpu'.
-        backend_args (dict, optional): Arguments to instantiate the
-            corresponding backend. Defaults to None.
-        idx2metainfo (str, optional): The file path of the metainfo in waymo.
-            It stores the mapping from sample_idx to metainfo. The metainfo
-            must contain the keys: 'idx2contextname' and 'idx2timestamp'.
+        result_prefix (str, optional): The prefix of result '*.bin' file,
+            including the file path and the prefix of filename, e.g.,
+            "a/b/prefix". If not specified, a temp file will be created.
             Defaults to None.
+        format_only (bool): Format the output results without performing
+            evaluation. It is useful when you want to format the result to a
+            specific format and submit it to the test server.
+            Defaults to False.
+        nms_cfg (dict, optional): The configuration of non-maximum
+            suppression for merging multi-image predicted bboxes, only used
+            when load_type == 'mv_image_based'. Defaults to None.
     """
     num_cams = 5
+    default_prefix = 'Waymo metric'
 
     def __init__(self,
-                 ann_file: str,
                  waymo_bin_file: str,
-                 data_root: str,
-                 split: str = 'training',
                  metric: Union[str, List[str]] = 'mAP',
-                 pcd_limit_range: List[float] = [-85, -85, -5, 85, 85, 5],
-                 convert_kitti_format: bool = True,
-                 prefix: Optional[str] = None,
-                 format_only: bool = False,
-                 pklfile_prefix: Optional[str] = None,
-                 submission_prefix: Optional[str] = None,
                  load_type: str = 'frame_based',
-                 default_cam_key: str = 'CAM_FRONT',
-                 use_pred_sample_idx: bool = False,
-                 collect_device: str = 'cpu',
-                 backend_args: Optional[dict] = None,
-                 idx2metainfo: Optional[str] = None) -> None:
+                 result_prefix: Optional[str] = None,
+                 format_only: bool = False,
+                 nms_cfg=None,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
         self.waymo_bin_file = waymo_bin_file
-        self.data_root = data_root
-        self.split = split
+        self.metrics = metric if isinstance(metric, list) else [metric]
         self.load_type = load_type
-        self.use_pred_sample_idx = use_pred_sample_idx
-        self.convert_kitti_format = convert_kitti_format
-
-        if idx2metainfo is not None:
-            self.idx2metainfo = mmengine.load(idx2metainfo)
-        else:
-            self.idx2metainfo = None
-
-        super(WaymoMetric, self).__init__(
-            ann_file=ann_file,
-            metric=metric,
-            pcd_limit_range=pcd_limit_range,
-            prefix=prefix,
-            pklfile_prefix=pklfile_prefix,
-            submission_prefix=submission_prefix,
-            default_cam_key=default_cam_key,
-            collect_device=collect_device,
-            backend_args=backend_args)
+        self.result_prefix = result_prefix
         self.format_only = format_only
         if self.format_only:
-            assert pklfile_prefix is not None, 'pklfile_prefix must be not '
+            assert result_prefix is not None, 'result_prefix must be not '
             'None when format_only is True, otherwise the result files will '
             'be saved to a temp directory which will be cleaned up at the end.'
+        if nms_cfg is not None:
+            assert load_type == 'mv_image_based', 'nms_cfg in WaymoMetric '
+            'only use when load_type == \'mv_image_based\'.'
+            self.nms_cfg = Config(nms_cfg)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
 
-        self.default_prefix = 'Waymo metric'
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+
+        for data_sample in data_samples:
+            result = dict()
+            bboxes_3d = data_sample['pred_instances_3d']['bboxes_3d']
+            bboxes_3d.limit_yaw(offset=0.5, period=np.pi * 2)
+            scores_3d = data_sample['pred_instances_3d']['scores_3d']
+            labels_3d = data_sample['pred_instances_3d']['labels_3d']
+            # TODO: check lidar post-processing
+            if isinstance(bboxes_3d, CameraInstance3DBoxes):
+                box_corners = bboxes_3d.corners
+                cam2img = box_corners.new_tensor(
+                    np.array(data_sample['cam2img']))
+                box_corners_in_image = points_cam2img(box_corners, cam2img)
+                # box_corners_in_image: [N, 8, 2]
+                minxy = torch.min(box_corners_in_image, dim=1)[0]
+                maxxy = torch.max(box_corners_in_image, dim=1)[0]
+                # check minxy & maxxy
+                # if the projected 2d bbox has intersection
+                # with the image, we keep it, otherwise, we omit it.
+                img_shape = data_sample['img_shape']
+                valid_inds = ((minxy[:, 0] < img_shape[1]) &
+                              (minxy[:, 1] < img_shape[0]) & (maxxy[:, 0] > 0)
+                              & (maxxy[:, 1] > 0))
+
+                if valid_inds.sum() > 0:
+                    lidar2cam = data_sample['lidar2cam']
+                    bboxes_3d = bboxes_3d.convert_to(
+                        Box3DMode.LIDAR,
+                        np.linalg.inv(lidar2cam),
+                        correct_yaw=True)
+                    bboxes_3d = bboxes_3d[valid_inds]
+                    scores_3d = scores_3d[valid_inds]
+                    labels_3d = labels_3d[valid_inds]
+                else:
+                    bboxes_3d = torch.zeros([0, 7])
+                    scores_3d = torch.zeros([0])
+                    labels_3d = torch.zeros([0])
+            result['bboxes_3d'] = bboxes_3d.tensor.cpu().numpy()
+            result['scores_3d'] = scores_3d.cpu().numpy()
+            result['labels_3d'] = labels_3d.cpu().numpy()
+            result['sample_idx'] = data_sample['sample_idx']
+            result['context_name'] = data_sample['context_name']
+            result['timestamp'] = data_sample['timestamp']
+            self.results.append(result)
 
     def compute_metrics(self, results: List[dict]) -> Dict[str, float]:
         """Compute the metrics from processed results.
@@ -137,80 +138,49 @@ def compute_metrics(self, results: List[dict]) -> Dict[str, float]:
         logger: MMLogger = MMLogger.get_current_instance()
         self.classes = self.dataset_meta['classes']
 
-        # load annotations
-        self.data_infos = load(self.ann_file)['data_list']
-        assert len(results) == len(self.data_infos), \
-            'invalid list length of network outputs'
         # different from kitti, waymo do not need to convert the ann file
         # handle the mv_image_based load_mode
         if self.load_type == 'mv_image_based':
-            new_data_infos = []
-            for info in self.data_infos:
-                height = info['images'][self.default_cam_key]['height']
-                width = info['images'][self.default_cam_key]['width']
-                for (cam_key, img_info) in info['images'].items():
-                    camera_info = dict()
-                    camera_info['images'] = dict()
-                    camera_info['images'][cam_key] = img_info
-                    # TODO remove the check by updating the data info;
-                    if 'height' not in img_info:
-                        img_info['height'] = height
-                        img_info['width'] = width
-                    if 'cam_instances' in info \
-                            and cam_key in info['cam_instances']:
-                        camera_info['instances'] = info['cam_instances'][
-                            cam_key]
-                    else:
-                        camera_info['instances'] = []
-                    camera_info['ego2global'] = info['ego2global']
-                    if 'image_sweeps' in info:
-                        camera_info['image_sweeps'] = info['image_sweeps']
-
-                    # TODO check if need to modify the sample idx
-                    # TODO check when will use it except for evaluation.
-                    camera_info['sample_idx'] = info['sample_idx']
-                    new_data_infos.append(camera_info)
-            self.data_infos = new_data_infos
-
-        if self.pklfile_prefix is None:
+            assert len(results) % 5 == 0, \
+                'The multi-view image-based results must be 5 times as ' \
+                'large as the original frame-based results.'
+            frame_results = [
+                results[i:i + 5] for i in range(0, len(results), 5)
+            ]
+            results = self.merge_multi_view_boxes(frame_results)
+        if self.result_prefix is None:
             eval_tmp_dir = tempfile.TemporaryDirectory()
-            pklfile_prefix = osp.join(eval_tmp_dir.name, 'results')
+            result_prefix = osp.join(eval_tmp_dir.name, 'results')
         else:
             eval_tmp_dir = None
-            pklfile_prefix = self.pklfile_prefix
+            result_prefix = self.result_prefix
 
-        result_dict, tmp_dir = self.format_results(
-            results,
-            pklfile_prefix=pklfile_prefix,
-            submission_prefix=self.submission_prefix,
-            classes=self.classes)
+        self.format_results(results, result_prefix=result_prefix)
 
         metric_dict = {}
 
         if self.format_only:
             logger.info('results are saved in '
-                        f'{osp.dirname(self.pklfile_prefix)}')
+                        f'{osp.dirname(self.result_prefix)}')
             return metric_dict
 
         for metric in self.metrics:
             ap_dict = self.waymo_evaluate(
-                pklfile_prefix, metric=metric, logger=logger)
+                result_prefix, metric=metric, logger=logger)
             metric_dict.update(ap_dict)
         if eval_tmp_dir is not None:
             eval_tmp_dir.cleanup()
 
-        if tmp_dir is not None:
-            tmp_dir.cleanup()
         return metric_dict
 
     def waymo_evaluate(self,
-                       pklfile_prefix: str,
+                       result_prefix: str,
                        metric: Optional[str] = None,
                        logger: Optional[MMLogger] = None) -> Dict[str, float]:
         """Evaluation in Waymo protocol.
 
         Args:
-            pklfile_prefix (str): The location that stored the prediction
+            result_prefix (str): The location that stored the prediction
                 results.
             metric (str, optional): Metric to be evaluated. Defaults to None.
             logger (MMLogger, optional): Logger used for printing related
@@ -224,7 +194,7 @@ def waymo_evaluate(self,
 
         if metric == 'mAP':
             eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
-                f'compute_detection_metrics_main {pklfile_prefix}.bin ' + \
+                f'compute_detection_metrics_main {result_prefix}.bin ' + \
                 f'{self.waymo_bin_file}'
             print(eval_str)
             ret_bytes = subprocess.check_output(eval_str, shell=True)
@@ -275,7 +245,7 @@ def waymo_evaluate(self,
                     ap_dict['Cyclist/L2 mAPH']) / 3
         elif metric == 'LET_mAP':
             eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
-                f'compute_detection_let_metrics_main {pklfile_prefix}.bin ' + \
+                f'compute_detection_let_metrics_main {result_prefix}.bin ' + \
                 f'{self.waymo_bin_file}'
 
             print(eval_str)
@@ -325,76 +295,26 @@ def waymo_evaluate(self,
     def format_results(
         self,
         results: List[dict],
-        pklfile_prefix: Optional[str] = None,
-        submission_prefix: Optional[str] = None,
-        classes: Optional[List[str]] = None
+        result_prefix: Optional[str] = None
     ) -> Tuple[dict, Union[tempfile.TemporaryDirectory, None]]:
         """Format the results to bin file.
 
         Args:
             results (List[dict]): Testing results of the dataset.
-            pklfile_prefix (str, optional): The prefix of pkl files. It
+            result_prefix (str, optional): The prefix of result file. It
                 includes the file path and the prefix of filename, e.g.,
                 "a/b/prefix". If not specified, a temp file will be created.
                 Defaults to None.
-            submission_prefix (str, optional): The prefix of submitted files.
-                It includes the file path and the prefix of filename, e.g.,
-                "a/b/prefix". If not specified, a temp file will be created.
-                Defaults to None.
-            classes (List[str], optional): A list of class name.
-                Defaults to None.
-
-        Returns:
-            tuple: (result_dict, tmp_dir), result_dict is a dict containing the
-            formatted result, tmp_dir is the temporal directory created for
-            saving json files when jsonfile_prefix is not specified.
         """
-        waymo_save_tmp_dir = tempfile.TemporaryDirectory()
-        waymo_results_save_dir = waymo_save_tmp_dir.name
-        waymo_results_final_path = f'{pklfile_prefix}.bin'
-
-        if self.convert_kitti_format:
-            results_kitti_format, tmp_dir = super().format_results(
-                results, pklfile_prefix, submission_prefix, classes)
-            final_results = results_kitti_format['pred_instances_3d']
-        else:
-            final_results = results
-            for i, res in enumerate(final_results):
-                # Actually, `sample_idx` here is the filename without suffix.
-                # It's for identitying the sample in formating.
-                res['sample_idx'] = self.data_infos[i]['sample_idx']
-                res['pred_instances_3d']['bboxes_3d'].limit_yaw(
-                    offset=0.5, period=np.pi * 2)
-
-        waymo_root = self.data_root
-        if self.split == 'training':
-            waymo_tfrecords_dir = osp.join(waymo_root, 'validation')
-            prefix = '1'
-        elif self.split == 'testing':
-            waymo_tfrecords_dir = osp.join(waymo_root, 'testing')
-            prefix = '2'
-        else:
-            raise ValueError('Not supported split value.')
+        waymo_results_final_path = f'{result_prefix}.bin'
 
         from ..functional.waymo_utils.prediction_to_waymo import \
             Prediction2Waymo
-        converter = Prediction2Waymo(
-            final_results,
-            waymo_tfrecords_dir,
-            waymo_results_save_dir,
-            waymo_results_final_path,
-            prefix,
-            classes,
-            backend_args=self.backend_args,
-            from_kitti_format=self.convert_kitti_format,
-            idx2metainfo=self.idx2metainfo)
+        converter = Prediction2Waymo(results, waymo_results_final_path,
+                                     self.classes)
         converter.convert()
-        waymo_save_tmp_dir.cleanup()
-
-        return final_results, waymo_save_tmp_dir
 
-    def merge_multi_view_boxes(self, box_dict_per_frame: List[dict],
-                               cam0_info: dict) -> dict:
+    def merge_multi_view_boxes(self, frame_results: List[list]) -> List[dict]:
         """Merge bounding boxes predicted from multi-view images.
 
         Args:
@@ -403,308 +323,43 @@ def merge_multi_view_boxes(self, box_dict_per_frame: List[dict],
             cam0_info (dict): Store the sample idx for the given frame.
 
         Returns:
-            dict: Merged results.
-        """
-        box_dict = dict()
-        # convert list[dict] to dict[list]
-        for key in box_dict_per_frame[0].keys():
-            box_dict[key] = list()
-            for cam_idx in range(self.num_cams):
-                box_dict[key].append(box_dict_per_frame[cam_idx][key])
-        # merge each elements
-        box_dict['sample_idx'] = cam0_info['image_id']
-        for key in ['bbox', 'box3d_lidar', 'scores', 'label_preds']:
-            box_dict[key] = np.concatenate(box_dict[key])
-
-        # apply nms to box3d_lidar (box3d_camera are in different systems)
-        # TODO: move this global setting into config
-        nms_cfg = dict(
-            use_rotate_nms=True,
-            nms_across_levels=False,
-            nms_pre=500,
-            nms_thr=0.05,
-            score_thr=0.001,
-            min_bbox_size=0,
-            max_per_frame=100)
-        nms_cfg = Config(nms_cfg)
-        lidar_boxes3d = LiDARInstance3DBoxes(
-            torch.from_numpy(box_dict['box3d_lidar']).cuda())
-        scores = torch.from_numpy(box_dict['scores']).cuda()
-        labels = torch.from_numpy(box_dict['label_preds']).long().cuda()
-        nms_scores = scores.new_zeros(scores.shape[0], len(self.classes) + 1)
-        indices = labels.new_tensor(list(range(scores.shape[0])))
-        nms_scores[indices, labels] = scores
-        lidar_boxes3d_for_nms = xywhr2xyxyr(lidar_boxes3d.bev)
-        boxes3d = lidar_boxes3d.tensor
-        # generate attr scores from attr labels
-        boxes3d, scores, labels = box3d_multiclass_nms(
-            boxes3d, lidar_boxes3d_for_nms, nms_scores, nms_cfg.score_thr,
-            nms_cfg.max_per_frame, nms_cfg)
-        lidar_boxes3d = LiDARInstance3DBoxes(boxes3d)
-        det = bbox3d2result(lidar_boxes3d, scores, labels)
-        box_preds_lidar = det['bboxes_3d']
-        scores = det['scores_3d']
-        labels = det['labels_3d']
-        # box_preds_camera is in the cam0 system
-        lidar2cam = cam0_info['images'][self.default_cam_key]['lidar2img']
-        lidar2cam = np.array(lidar2cam).astype(np.float32)
-        box_preds_camera = box_preds_lidar.convert_to(
-            Box3DMode.CAM, lidar2cam, correct_yaw=True)
-        # Note: bbox is meaningless in final evaluation, set to 0
-        merged_box_dict = dict(
-            bbox=np.zeros([box_preds_lidar.tensor.shape[0], 4]),
-            box3d_camera=box_preds_camera.numpy(),
-            box3d_lidar=box_preds_lidar.numpy(),
-            scores=scores.numpy(),
-            label_preds=labels.numpy(),
-            sample_idx=box_dict['sample_idx'],
-        )
-        return merged_box_dict
-
-    def bbox2result_kitti(
-            self,
-            net_outputs: List[dict],
-            sample_idx_list: List[int],
-            class_names: List[str],
-            pklfile_prefix: Optional[str] = None,
-            submission_prefix: Optional[str] = None) -> List[dict]:
-        """Convert 3D detection results to kitti format for evaluation and test
-        submission.
-
-        Args:
-            net_outputs (List[dict]): List of dict storing the inferenced
-                bounding boxes and scores.
-            sample_idx_list (List[int]): List of input sample idx.
-            class_names (List[str]): A list of class names.
-            pklfile_prefix (str, optional): The prefix of pkl file.
-                Defaults to None.
-            submission_prefix (str, optional): The prefix of submission file.
-                Defaults to None.
-
-        Returns:
-            List[dict]: A list of dictionaries with the kitti format.
+            List[dict]: Merged results.
         """
-        if submission_prefix is not None:
-            mmengine.mkdir_or_exist(submission_prefix)
-
-        det_annos = []
-        print('\nConverting prediction to KITTI format')
-        for idx, pred_dicts in enumerate(
-                mmengine.track_iter_progress(net_outputs)):
-            sample_idx = sample_idx_list[idx]
-            info = self.data_infos[sample_idx]
-
-            if self.load_type == 'mv_image_based':
-                if idx % self.num_cams == 0:
-                    box_dict_per_frame = []
-                    cam0_key = list(info['images'].keys())[0]
-                    cam0_info = info
-                    # Here in mono3d, we use the 'CAM_FRONT' "the first
-                    # index in the camera" as the default image shape.
-                    # If you want to another camera, please modify it.
-                    image_shape = (info['images'][cam0_key]['height'],
-                                   info['images'][cam0_key]['width'])
-                box_dict = self.convert_valid_bboxes(pred_dicts, info)
-            else:
-                box_dict = self.convert_valid_bboxes(pred_dicts, info)
-                # Here default used 'CAM_FRONT' to compute metric.
-                # If you want to use another camera, please modify it.
-                image_shape = (info['images'][self.default_cam_key]['height'],
-                               info['images'][self.default_cam_key]['width'])
-            if self.load_type == 'mv_image_based':
-                box_dict_per_frame.append(box_dict)
-                if (idx + 1) % self.num_cams != 0:
-                    continue
-                box_dict = self.merge_multi_view_boxes(box_dict_per_frame,
-                                                       cam0_info)
-
-            anno = {
-                'name': [],
-                'truncated': [],
-                'occluded': [],
-                'alpha': [],
-                'bbox': [],
-                'dimensions': [],
-                'location': [],
-                'rotation_y': [],
-                'score': []
-            }
-            if len(box_dict['bbox']) > 0:
-                box_2d_preds = box_dict['bbox']
-                box_preds = box_dict['box3d_camera']
-                scores = box_dict['scores']
-                box_preds_lidar = box_dict['box3d_lidar']
-                label_preds = box_dict['label_preds']
-
-                for box, box_lidar, bbox, score, label in zip(
-                        box_preds, box_preds_lidar, box_2d_preds, scores,
-                        label_preds):
-                    bbox[2:] = np.minimum(bbox[2:], image_shape[::-1])
-                    bbox[:2] = np.maximum(bbox[:2], [0, 0])
-                    anno['name'].append(class_names[int(label)])
-                    anno['truncated'].append(0.0)
-                    anno['occluded'].append(0)
-                    anno['alpha'].append(
-                        -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6])
-                    anno['bbox'].append(bbox)
-                    anno['dimensions'].append(box[3:6])
-                    anno['location'].append(box[:3])
-                    anno['rotation_y'].append(box[6])
-                    anno['score'].append(score)
-
-                anno = {k: np.stack(v) for k, v in anno.items()}
-            else:
-                anno = {
-                    'name': np.array([]),
-                    'truncated': np.array([]),
-                    'occluded': np.array([]),
-                    'alpha': np.array([]),
-                    'bbox': np.zeros([0, 4]),
-                    'dimensions': np.zeros([0, 3]),
-                    'location': np.zeros([0, 3]),
-                    'rotation_y': np.array([]),
-                    'score': np.array([]),
-                }
-
-            if submission_prefix is not None:
-                curr_file = f'{submission_prefix}/{sample_idx:06d}.txt'
-                with open(curr_file, 'w') as f:
-                    bbox = anno['bbox']
-                    loc = anno['location']
-                    dims = anno['dimensions']  # lhw -> hwl
-
-                    for idx in range(len(bbox)):
-                        print(
-                            '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} '
-                            '{:.4f} {:.4f} {:.4f} '
-                            '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(
-                                anno['name'][idx], anno['alpha'][idx],
-                                bbox[idx][0], bbox[idx][1], bbox[idx][2],
-                                bbox[idx][3], dims[idx][1], dims[idx][2],
-                                dims[idx][0], loc[idx][0], loc[idx][1],
-                                loc[idx][2], anno['rotation_y'][idx],
-                                anno['score'][idx]),
-                            file=f)
-            if self.use_pred_sample_idx:
-                save_sample_idx = sample_idx
-            else:
-                # use the sample idx in the info file
-                # In waymo validation sample_idx in prediction is 000xxx
-                # but in info file it is 1000xxx
-                save_sample_idx = box_dict['sample_idx']
-            anno['sample_idx'] = np.array(
-                [save_sample_idx] * len(anno['score']), dtype=np.int64)
-
-            det_annos.append(anno)
-
-        if pklfile_prefix is not None:
-            if not pklfile_prefix.endswith(('.pkl', '.pickle')):
-                out = f'{pklfile_prefix}.pkl'
-            else:
-                out = pklfile_prefix
-            mmengine.dump(det_annos, out)
-            print(f'Result is saved to {out}.')
-
-        return det_annos
-
-    def convert_valid_bboxes(self, box_dict: dict, info: dict) -> dict:
-        """Convert the predicted boxes into valid ones. Should handle the
-        load_model (frame_based, mv_image_based, fov_image_based), separately.
-
-        Args:
-            box_dict (dict): Box dictionaries to be converted.
-
-                - bboxes_3d (:obj:`BaseInstance3DBoxes`): 3D bounding boxes.
-                - scores_3d (Tensor): Scores of boxes.
-                - labels_3d (Tensor): Class labels of boxes.
-            info (dict): Data info.
-
-        Returns:
-            dict: Valid predicted boxes.
-
-            - bbox (np.ndarray): 2D bounding boxes.
-            - box3d_camera (np.ndarray): 3D bounding boxes in camera
-              coordinate.
-            - box3d_lidar (np.ndarray): 3D bounding boxes in LiDAR coordinate.
-            - scores (np.ndarray): Scores of boxes.
-            - label_preds (np.ndarray): Class label predictions.
-            - sample_idx (int): Sample index.
-        """
-        # TODO: refactor this function
-        box_preds = box_dict['bboxes_3d']
-        scores = box_dict['scores_3d']
-        labels = box_dict['labels_3d']
-        sample_idx = info['sample_idx']
-        box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
-
-        if len(box_preds) == 0:
-            return dict(
-                bbox=np.zeros([0, 4]),
-                box3d_camera=np.zeros([0, 7]),
-                box3d_lidar=np.zeros([0, 7]),
-                scores=np.zeros([0]),
-                label_preds=np.zeros([0, 4]),
-                sample_idx=sample_idx)
-        # Here default used 'CAM_FRONT' to compute metric. If you want to
-        # use another camera, please modify it.
-        if self.load_type in ['frame_based', 'fov_image_based']:
-            cam_key = self.default_cam_key
-        elif self.load_type == 'mv_image_based':
-            cam_key = list(info['images'].keys())[0]
-        else:
-            raise NotImplementedError
-
-        lidar2cam = np.array(info['images'][cam_key]['lidar2cam']).astype(
-            np.float32)
-        P2 = np.array(info['images'][cam_key]['cam2img']).astype(np.float32)
-        img_shape = (info['images'][cam_key]['height'],
-                     info['images'][cam_key]['width'])
-        P2 = box_preds.tensor.new_tensor(P2)
-
-        if isinstance(box_preds, LiDARInstance3DBoxes):
-            box_preds_camera = box_preds.convert_to(Box3DMode.CAM, lidar2cam)
-            box_preds_lidar = box_preds
-        elif isinstance(box_preds, CameraInstance3DBoxes):
-            box_preds_camera = box_preds
-            box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR,
-                                                   np.linalg.inv(lidar2cam))
-
-        box_corners = box_preds_camera.corners
-        box_corners_in_image = points_cam2img(box_corners, P2)
-        # box_corners_in_image: [N, 8, 2]
-        minxy = torch.min(box_corners_in_image, dim=1)[0]
-        maxxy = torch.max(box_corners_in_image, dim=1)[0]
-        box_2d_preds = torch.cat([minxy, maxxy], dim=1)
-        # Post-processing
-        # check box_preds_camera
-        image_shape = box_preds.tensor.new_tensor(img_shape)
-        valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) &
-                          (box_2d_preds[:, 1] < image_shape[0]) &
-                          (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
-        # check box_preds_lidar
-        if self.load_type in ['frame_based']:
-            limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
-            valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) &
-                              (box_preds_lidar.center < limit_range[3:]))
-            valid_inds = valid_pcd_inds.all(-1)
-        elif self.load_type in ['mv_image_based', 'fov_image_based']:
-            valid_inds = valid_cam_inds
-
-        if valid_inds.sum() > 0:
-            return dict(
-                bbox=box_2d_preds[valid_inds, :].numpy(),
-                pred_box_type_3d=type(box_preds),
-                box3d_camera=box_preds_camera[valid_inds].numpy(),
-                box3d_lidar=box_preds_lidar[valid_inds].numpy(),
-                scores=scores[valid_inds].numpy(),
-                label_preds=labels[valid_inds].numpy(),
-                sample_idx=sample_idx)
-        else:
-            return dict(
-                bbox=np.zeros([0, 4]),
-                pred_box_type_3d=type(box_preds),
-                box3d_camera=np.zeros([0, 7]),
-                box3d_lidar=np.zeros([0, 7]),
-                scores=np.zeros([0]),
-                label_preds=np.zeros([0]),
-                sample_idx=sample_idx)
+        merged_results = []
+        for frame_result in frame_results:
+            merged_result = dict()
+            merged_result['sample_idx'] = frame_result[0]['sample_idx'] // 5
+            merged_result['context_name'] = frame_result[0]['context_name']
+            merged_result['timestamp'] = frame_result[0]['timestamp']
+            bboxes_3d, scores_3d, labels_3d = [], [], []
+            for result in frame_result:
+                assert result['timestamp'] == merged_result['timestamp']
+                bboxes_3d.append(result['bboxes_3d'])
+                scores_3d.append(result['scores_3d'])
+                labels_3d.append(result['labels_3d'])
+
+            bboxes_3d = np.concatenate(bboxes_3d)
+            scores_3d = np.concatenate(scores_3d)
+            labels_3d = np.concatenate(labels_3d)
+
+            device = get_device()
+            lidar_boxes3d = LiDARInstance3DBoxes(
+                torch.from_numpy(bboxes_3d).to(device))
+            scores = torch.from_numpy(scores_3d).to(device)
+            labels = torch.from_numpy(labels_3d).long().to(device)
+            nms_scores = scores.new_zeros(scores.shape[0],
+                                          len(self.classes) + 1)
+            indices = labels.new_tensor(list(range(scores.shape[0])))
+            nms_scores[indices, labels] = scores
+            lidar_boxes3d_for_nms = xywhr2xyxyr(lidar_boxes3d.bev)
+            boxes3d = lidar_boxes3d.tensor
+            bboxes_3d, scores_3d, labels_3d = box3d_multiclass_nms(
+                boxes3d, lidar_boxes3d_for_nms, nms_scores,
+                self.nms_cfg.score_thr, self.nms_cfg.max_per_frame,
+                self.nms_cfg)
+
+            merged_result['bboxes_3d'] = bboxes_3d.cpu().numpy()
+            merged_result['scores_3d'] = scores_3d.cpu().numpy()
+            merged_result['labels_3d'] = labels_3d.cpu().numpy()
+            merged_results.append(merged_result)
+        return merged_results
diff --git a/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py b/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
index 14bcbb929..5b207c799 100644
--- a/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
+++ b/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
@@ -179,7 +179,10 @@
             dict(
                 type='PointsRangeFilter', point_cloud_range=point_cloud_range)
         ]),
-    dict(type='Pack3DDetInputs', keys=['points'])
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]
 
 dataset_type = 'WaymoDataset'
@@ -223,13 +226,7 @@
 test_dataloader = val_dataloader
 
 val_evaluator = dict(
-    type='WaymoMetric',
-    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
-    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
-    data_root='./data/waymo/waymo_format',
-    backend_args=backend_args,
-    convert_kitti_format=False,
-    idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl')
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
 test_evaluator = val_evaluator
 
 vis_backends = [dict(type='LocalVisBackend')]
diff --git a/tests/data/waymo/kitti_format/waymo_infos_train.pkl b/tests/data/waymo/kitti_format/waymo_infos_train.pkl
index e89255005..f2f587c6a 100644
Binary files a/tests/data/waymo/kitti_format/waymo_infos_train.pkl and b/tests/data/waymo/kitti_format/waymo_infos_train.pkl differ
diff --git a/tests/data/waymo/kitti_format/waymo_infos_val.pkl b/tests/data/waymo/kitti_format/waymo_infos_val.pkl
index 0ce2230bb..82a6ed470 100644
Binary files a/tests/data/waymo/kitti_format/waymo_infos_val.pkl and b/tests/data/waymo/kitti_format/waymo_infos_val.pkl differ
diff --git a/tests/test_datasets/test_waymo_dataset.py b/tests/test_datasets/test_waymo_dataset.py
new file mode 100644
index 000000000..20ec1fc17
--- /dev/null
+++ b/tests/test_datasets/test_waymo_dataset.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import numpy as np
+import torch
+from mmcv.transforms.base import BaseTransform
+from mmengine.registry import TRANSFORMS
+from mmengine.structures import InstanceData
+
+from mmdet3d.datasets import WaymoDataset
+from mmdet3d.structures import Det3DDataSample, LiDARInstance3DBoxes
+
+
+def _generate_waymo_dataset_config():
+    data_root = 'tests/data/waymo/kitti_format'
+    ann_file = 'waymo_infos_train.pkl'
+    classes = ['Car', 'Pedestrian', 'Cyclist']
+    # wait for pipeline refactor
+
+    if 'Identity' not in TRANSFORMS:
+
+        @TRANSFORMS.register_module()
+        class Identity(BaseTransform):
+
+            def transform(self, info):
+                if 'ann_info' in info:
+                    info['gt_labels_3d'] = info['ann_info']['gt_labels_3d']
+                data_sample = Det3DDataSample()
+                gt_instances_3d = InstanceData()
+                gt_instances_3d.labels_3d = info['gt_labels_3d']
+                data_sample.gt_instances_3d = gt_instances_3d
+                info['data_samples'] = data_sample
+                return info
+
+    pipeline = [
+        dict(type='Identity'),
+    ]
+
+    modality = dict(use_lidar=True, use_camera=True)
+    data_prefix = dict(
+        pts='training/velodyne', CAM_FRONT='training/image_0')
+    return data_root, ann_file, classes, data_prefix, pipeline, modality
+
+
+def test_getitem():
+    data_root, ann_file, classes, data_prefix, \
+        pipeline, modality, = _generate_waymo_dataset_config()
+
+    waymo_dataset = WaymoDataset(
+        data_root,
+        ann_file,
+        data_prefix=data_prefix,
+        pipeline=pipeline,
+        metainfo=dict(classes=classes),
+        modality=modality)
+
+    waymo_dataset.prepare_data(0)
+    input_dict = waymo_dataset.get_data_info(0)
+    waymo_dataset[0]
+    # assert that the path contains data_prefix and data_root
+    assert data_prefix['pts'] in input_dict['lidar_points']['lidar_path']
+    assert data_root in input_dict['lidar_points']['lidar_path']
+    for cam_id, img_info in input_dict['images'].items():
+        if 'img_path' in img_info:
+            assert data_prefix['CAM_FRONT'] in img_info['img_path']
+            assert data_root in img_info['img_path']
+
+    ann_info = waymo_dataset.parse_ann_info(input_dict)
+
+    # only one instance
+    assert 'gt_labels_3d' in ann_info
+    assert ann_info['gt_labels_3d'].dtype == np.int64
+
+    assert 'gt_bboxes_3d' in ann_info
+    assert isinstance(ann_info['gt_bboxes_3d'], LiDARInstance3DBoxes)
+    assert torch.allclose(ann_info['gt_bboxes_3d'].tensor.sum(),
+                          torch.tensor(43.3103))
+    assert 'centers_2d' in ann_info
+    assert ann_info['centers_2d'].dtype == np.float32
+    assert 'depths' in ann_info
+    assert ann_info['depths'].dtype == np.float32
diff --git a/tools/create_data.py b/tools/create_data.py
index 34356c2a8..d8c6495d6 100644
--- a/tools/create_data.py
+++ b/tools/create_data.py
@@ -2,6 +2,8 @@
 import argparse
 from os import path as osp
 
+from mmengine import print_log
+
 from tools.dataset_converters import indoor_converter as indoor
 from tools.dataset_converters import kitti_converter as kitti
 from tools.dataset_converters import lyft_converter as lyft_converter
@@ -171,8 +173,19 @@ def waymo_data_prep(root_path,
                     version,
                     out_dir,
                     workers,
-                    max_sweeps=5):
-    """Prepare the info file for waymo dataset.
+                    max_sweeps=10,
+                    only_gt_database=False,
+                    save_senor_data=False,
+                    skip_cam_instances_infos=False):
+    """Prepare waymo dataset. There are 3 steps as follows:
+
+    Step 1. Extract camera images and lidar point clouds from waymo raw
+        data in '*.tfrecord' and save as kitti format.
+    Step 2. Generate waymo train/val/test infos and save as pickle file.
+    Step 3. Generate waymo ground truth database (point clouds within
+        each 3D bounding box) for data augmentation in training.
+    Steps 1 and 2 will be done in Waymo2KITTI, and step 3 will be done in
+    GTDatabaseCreater.
 
     Args:
         root_path (str): Path of dataset root.
@@ -180,44 +193,55 @@ def waymo_data_prep(root_path,
         out_dir (str): Output directory of the generated info file.
         workers (int): Number of threads to be used.
         max_sweeps (int, optional): Number of input consecutive frames.
-            Default: 5. Here we store pose information of these frames
-            for later use.
+            Default to 10. Here we store ego2global information of these
+            frames for later use.
+        only_gt_database (bool, optional): Whether to only generate ground
+            truth database. Default to False.
+        save_senor_data (bool, optional): Whether to save
+            image and lidar data. Default to False.
+        skip_cam_instances_infos (bool, optional): Whether to skip
+            gathering cam_instances infos in Step 2. Default to False.
     """
     from tools.dataset_converters import waymo_converter as waymo
 
-    splits = [
-        'training', 'validation', 'testing', 'testing_3d_camera_only_detection'
-    ]
-    for i, split in enumerate(splits):
-        load_dir = osp.join(root_path, 'waymo_format', split)
-        if split == 'validation':
-            save_dir = osp.join(out_dir, 'kitti_format', 'training')
-        else:
-            save_dir = osp.join(out_dir, 'kitti_format', split)
-        converter = waymo.Waymo2KITTI(
-            load_dir,
-            save_dir,
-            prefix=str(i),
-            workers=workers,
-            test_mode=(split
-                       in ['testing', 'testing_3d_camera_only_detection']))
-        converter.convert()
-
-    from tools.dataset_converters.waymo_converter import \
-        create_ImageSets_img_ids
-    create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits)
-    # Generate waymo infos
+    if version == 'v1.4':
+        splits = [
+            'training', 'validation', 'testing',
+            'testing_3d_camera_only_detection'
+        ]
+    elif version == 'v1.4-mini':
+        splits = ['training', 'validation']
+    else:
+        raise NotImplementedError(f'Unsupported Waymo version {version}!')
     out_dir = osp.join(out_dir, 'kitti_format')
-    kitti.create_waymo_info_file(
-        out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers)
-    info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl')
-    info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl')
-    info_trainval_path = osp.join(out_dir, f'{info_prefix}_infos_trainval.pkl')
-    info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl')
-    update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_train_path)
-    update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_val_path)
-    update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_trainval_path)
-    update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_test_path)
+
+    if not only_gt_database:
+        for i, split in enumerate(splits):
+            load_dir = osp.join(root_path, 'waymo_format', split)
+            if split == 'validation':
+                save_dir = osp.join(out_dir, 'training')
+            else:
+                save_dir = osp.join(out_dir, split)
+            converter = waymo.Waymo2KITTI(
+                load_dir,
+                save_dir,
+                prefix=str(i),
+                workers=workers,
+                test_mode=(split
+                           in ['testing', 'testing_3d_camera_only_detection']),
+                info_prefix=info_prefix,
+                max_sweeps=max_sweeps,
+                split=split,
+                save_senor_data=save_senor_data,
+                save_cam_instances=not skip_cam_instances_infos)
+            converter.convert()
+            if split == 'validation':
+                converter.merge_trainval_infos()
+
+        from tools.dataset_converters.waymo_converter import \
+            create_ImageSets_img_ids
+        create_ImageSets_img_ids(out_dir, splits)
+
     GTDatabaseCreater(
         'WaymoDataset',
         out_dir,
@@ -227,6 +251,8 @@ def waymo_data_prep(root_path,
         with_mask=False,
         num_worker=workers).create()
 
+    print_log('Successfully preparing Waymo Open Dataset')
+
 
 def semantickitti_data_prep(info_prefix, out_dir):
     """Prepare the info file for SemanticKITTI dataset.
@@ -274,12 +300,23 @@ def semantickitti_data_prep(info_prefix, out_dir):
 parser.add_argument(
     '--only-gt-database',
     action='store_true',
-    help='Whether to only generate ground truth database.')
+    help='''Whether to only generate ground truth database.
+        Only used when dataset is NuScenes or Waymo!''')
+parser.add_argument(
+    '--skip-cam_instances-infos',
+    action='store_true',
+    help='''Whether to skip gathering cam_instances infos.
+        Only used when dataset is Waymo!''')
+parser.add_argument(
+    '--skip-saving-sensor-data',
+    action='store_true',
+    help='''Whether to skip saving image and lidar.
+        Only used when dataset is Waymo!''')
 args = parser.parse_args()
 
 if __name__ == '__main__':
-    from mmdet3d.utils import register_all_modules
-    register_all_modules()
+    from mmengine.registry import init_default_scope
+    init_default_scope('mmdet3d')
 
     if args.dataset == 'kitti':
         if args.only_gt_database:
@@ -334,6 +371,17 @@ def semantickitti_data_prep(info_prefix, out_dir):
                 dataset_name='NuScenesDataset',
                 out_dir=args.out_dir,
                 max_sweeps=args.max_sweeps)
+    elif args.dataset == 'waymo':
+        waymo_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            version=args.version,
+            out_dir=args.out_dir,
+            workers=args.workers,
+            max_sweeps=args.max_sweeps,
+            only_gt_database=args.only_gt_database,
+            save_senor_data=not args.skip_saving_sensor_data,
+            skip_cam_instances_infos=args.skip_cam_instances_infos)
     elif args.dataset == 'lyft':
         train_version = f'{args.version}-train'
         lyft_data_prep(
@@ -347,14 +395,6 @@ def semantickitti_data_prep(info_prefix, out_dir):
             info_prefix=args.extra_tag,
             version=test_version,
             max_sweeps=args.max_sweeps)
-    elif args.dataset == 'waymo':
-        waymo_data_prep(
-            root_path=args.root_path,
-            info_prefix=args.extra_tag,
-            version=args.version,
-            out_dir=args.out_dir,
-            workers=args.workers,
-            max_sweeps=args.max_sweeps)
     elif args.dataset == 'scannet':
         scannet_data_prep(
             root_path=args.root_path,
diff --git a/tools/create_data.sh b/tools/create_data.sh
index 9a57852f7..0a1946585 100755
--- a/tools/create_data.sh
+++ b/tools/create_data.sh
@@ -6,10 +6,11 @@ export PYTHONPATH=`pwd`:$PYTHONPATH
 PARTITION=$1
 JOB_NAME=$2
 DATASET=$3
+WORKERS=$4
 GPUS=${GPUS:-1}
 GPUS_PER_NODE=${GPUS_PER_NODE:-1}
 SRUN_ARGS=${SRUN_ARGS:-""}
-JOB_NAME=create_data
+PY_ARGS=${@:5}
 
 srun -p ${PARTITION} \
     --job-name=${JOB_NAME} \
@@ -21,4 +22,6 @@ srun -p ${PARTITION} \
     python -u tools/create_data.py ${DATASET} \
             --root-path ./data/${DATASET} \
             --out-dir ./data/${DATASET} \
-            --extra-tag ${DATASET}
+            --workers ${WORKERS} \
+            --extra-tag ${DATASET} \
+            ${PY_ARGS}
diff --git a/tools/dataset_converters/create_gt_database.py b/tools/dataset_converters/create_gt_database.py
index ae452eb54..fb84256fd 100644
--- a/tools/dataset_converters/create_gt_database.py
+++ b/tools/dataset_converters/create_gt_database.py
@@ -7,7 +7,7 @@
 import numpy as np
 from mmcv.ops import roi_align
 from mmdet.evaluation import bbox_overlaps
-from mmengine import track_iter_progress
+from mmengine import print_log, track_iter_progress
 from pycocotools import mask as maskUtils
 from pycocotools.coco import COCO
 
@@ -504,7 +504,9 @@ def create_single(self, input_dict):
         return single_db_infos
 
     def create(self):
-        print(f'Create GT Database of {self.dataset_class_name}')
+        print_log(
+            f'Create GT Database of {self.dataset_class_name}',
+            logger='current')
         dataset_cfg = dict(
             type=self.dataset_class_name,
             data_root=self.data_path,
@@ -610,12 +612,19 @@ def loop_dataset(i):
             input_dict['box_mode_3d'] = self.dataset.box_mode_3d
             return input_dict
 
-        multi_db_infos = mmengine.track_parallel_progress(
-            self.create_single,
-            ((loop_dataset(i)
-              for i in range(len(self.dataset))), len(self.dataset)),
-            self.num_worker)
-        print('Make global unique group id')
+        if self.num_worker == 0:
+            multi_db_infos = mmengine.track_progress(
+                self.create_single,
+                ((loop_dataset(i)
+                  for i in range(len(self.dataset))), len(self.dataset)))
+        else:
+            multi_db_infos = mmengine.track_parallel_progress(
+                self.create_single,
+                ((loop_dataset(i)
+                  for i in range(len(self.dataset))), len(self.dataset)),
+                self.num_worker,
+                chunksize=1000)
+        print_log('Make global unique group id', logger='current')
         group_counter_offset = 0
         all_db_infos = dict()
         for single_db_infos in track_iter_progress(multi_db_infos):
@@ -630,7 +639,8 @@ def loop_dataset(i):
             group_counter_offset += (group_id + 1)
 
         for k, v in all_db_infos.items():
-            print(f'load {len(v)} {k} database infos')
+            print_log(f'load {len(v)} {k} database infos', logger='current')
 
+        print_log(f'Saving GT database infos into {self.db_info_save_path}')
         with open(self.db_info_save_path, 'wb') as f:
             pickle.dump(all_db_infos, f)
diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py
index 87f9c54b5..e383c238d 100644
--- a/tools/dataset_converters/waymo_converter.py
+++ b/tools/dataset_converters/waymo_converter.py
@@ -9,23 +9,33 @@
     raise ImportError('Please run "pip install waymo-open-dataset-tf-2-6-0" '
                       '>1.4.5 to install the official devkit first.')
 
+import copy
 import os
+import os.path as osp
 from glob import glob
+from io import BytesIO
 from os.path import exists, join
 
 import mmengine
 import numpy as np
 import tensorflow as tf
+from mmengine import print_log
+from nuscenes.utils.geometry_utils import view_points
+from PIL import Image
 from waymo_open_dataset.utils import range_image_utils, transform_utils
 from waymo_open_dataset.utils.frame_utils import \
     parse_range_image_and_camera_projection
 
+from mmdet3d.datasets.convert_utils import post_process_coords
+from mmdet3d.structures import Box3DMode, LiDARInstance3DBoxes, points_cam2img
+
 
 class Waymo2KITTI(object):
-    """Waymo to KITTI converter.
+    """Waymo to KITTI converter. There are 2 steps as follows:
 
-    This class serves as the converter to change the waymo raw data to KITTI
-    format.
+    Step 1. Extract camera images and lidar point clouds from waymo raw data in
+        '*.tfrecord' and save as kitti format.
+    Step 2. Generate waymo train/val/test infos and save as pickle file.
 
     Args:
         load_dir (str): Directory to load waymo raw data.
@@ -36,8 +46,16 @@ class Waymo2KITTI(object):
             Defaults to 64.
         test_mode (bool, optional): Whether in the test_mode.
             Defaults to False.
-        save_cam_sync_labels (bool, optional): Whether to save cam sync labels.
-            Defaults to True.
+        save_senor_data (bool, optional): Whether to save image and lidar
+            data. Defaults to True.
+        save_cam_sync_instances (bool, optional): Whether to save cam sync
+            instances. Defaults to True.
+        save_cam_instances (bool, optional): Whether to save cam instances.
+            Defaults to True.
+        info_prefix (str, optional): Prefix of info filename.
+            Defaults to 'waymo'.
+        max_sweeps (int, optional): Max length of sweeps. Defaults to 10.
+        split (str, optional): Split of the data. Defaults to 'training'.
     """
 
     def __init__(self,
@@ -46,18 +64,12 @@ def __init__(self,
                  prefix,
                  workers=64,
                  test_mode=False,
-                 save_cam_sync_labels=True):
-        self.filter_empty_3dboxes = True
-        self.filter_no_label_zone_points = True
-
-        self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST']
-
-        # Only data collected in specific locations will be converted
-        # If set None, this filter is disabled
-        # Available options: location_sf (main dataset)
-        self.selected_waymo_locations = None
-        self.save_track_id = False
-
+                 save_senor_data=True,
+                 save_cam_sync_instances=True,
+                 save_cam_instances=True,
+                 info_prefix='waymo',
+                 max_sweeps=10,
+                 split='training'):
         # turn on eager execution for older tensorflow versions
         if int(tf.__version__.split('.')[0]) < 2:
             tf.enable_eager_execution()
@@ -74,12 +86,21 @@ def __init__(self,
         self.type_list = [
             'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST'
         ]
-        self.waymo_to_kitti_class_map = {
-            'UNKNOWN': 'DontCare',
-            'PEDESTRIAN': 'Pedestrian',
-            'VEHICLE': 'Car',
-            'CYCLIST': 'Cyclist',
-            'SIGN': 'Sign'  # not in kitti
+
+        # MMDetection3D unified camera keys & class names
+        self.camera_types = [
+            'CAM_FRONT',
+            'CAM_FRONT_LEFT',
+            'CAM_FRONT_RIGHT',
+            'CAM_SIDE_LEFT',
+            'CAM_SIDE_RIGHT',
+        ]
+        self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST']
+        self.info_map = {
+            'training': '_infos_train.pkl',
+            'validation': '_infos_val.pkl',
+            'testing': '_infos_test.pkl',
+            'testing_3d_camera_only_detection': '_infos_test_cam_only.pkl'
         }
 
         self.load_dir = load_dir
@@ -87,61 +108,87 @@ def __init__(self,
         self.prefix = prefix
         self.workers = int(workers)
         self.test_mode = test_mode
-        self.save_cam_sync_labels = save_cam_sync_labels
+        self.save_senor_data = save_senor_data
+        self.save_cam_sync_instances = save_cam_sync_instances
+        self.save_cam_instances = save_cam_instances
+        self.info_prefix = info_prefix
+        self.max_sweeps = max_sweeps
+        self.split = split
+
+        # TODO: Discuss filter_empty_3dboxes and filter_no_label_zone_points
+        self.filter_empty_3dboxes = True
+        self.filter_no_label_zone_points = True
+        self.save_track_id = False
 
         self.tfrecord_pathnames = sorted(
             glob(join(self.load_dir, '*.tfrecord')))
 
-        self.label_save_dir = f'{self.save_dir}/label_'
-        self.label_all_save_dir = f'{self.save_dir}/label_all'
         self.image_save_dir = f'{self.save_dir}/image_'
-        self.calib_save_dir = f'{self.save_dir}/calib'
         self.point_cloud_save_dir = f'{self.save_dir}/velodyne'
-        self.pose_save_dir = f'{self.save_dir}/pose'
-        self.timestamp_save_dir = f'{self.save_dir}/timestamp'
-        if self.save_cam_sync_labels:
-            self.cam_sync_label_save_dir = f'{self.save_dir}/cam_sync_label_'
-            self.cam_sync_label_all_save_dir = \
-                f'{self.save_dir}/cam_sync_label_all'
 
-        self.create_folder()
+        # Create folder for saving KITTI format camera images and
+        # lidar point clouds.
+        if 'testing_3d_camera_only_detection' not in self.load_dir:
+            mmengine.mkdir_or_exist(self.point_cloud_save_dir)
+        for i in range(5):
+            mmengine.mkdir_or_exist(f'{self.image_save_dir}{str(i)}')
 
     def convert(self):
         """Convert action."""
-        print('Start converting ...')
-        mmengine.track_parallel_progress(self.convert_one, range(len(self)),
-                                         self.workers)
-        print('\nFinished ...')
+        print_log(f'Start converting {self.split} dataset', logger='current')
+        if self.workers == 0:
+            data_infos = mmengine.track_progress(self.convert_one,
+                                                 range(len(self)))
+        else:
+            data_infos = mmengine.track_parallel_progress(
+                self.convert_one, range(len(self)), self.workers)
+        data_list = []
+        for data_info in data_infos:
+            data_list.extend(data_info)
+        metainfo = dict()
+        metainfo['dataset'] = 'waymo'
+        metainfo['version'] = 'waymo_v1.4'
+        metainfo['info_version'] = 'mmdet3d_v1.4'
+        waymo_infos = dict(data_list=data_list, metainfo=metainfo)
+        filenames = osp.join(
+            osp.dirname(self.save_dir),
+            f'{self.info_prefix + self.info_map[self.split]}')
+        print_log(f'Saving {self.split} dataset infos into {filenames}')
+        mmengine.dump(waymo_infos, filenames)
 
     def convert_one(self, file_idx):
-        """Convert action for single file.
+        """Convert one '*.tfrecord' file to kitti format. Each file stores all
+        the frames (about 200 frames) in current scene. We treat each frame as
+        a sample, save their images and point clouds in kitti format, and then
+        create info for all frames.
 
         Args:
             file_idx (int): Index of the file to be converted.
+
+        Returns:
+            List[dict]: Waymo infos for all frames in current file.
         """
         pathname = self.tfrecord_pathnames[file_idx]
         dataset = tf.data.TFRecordDataset(pathname, compression_type='')
 
+        # NOTE: file_infos is not shared between processes, only stores frame
+        # infos within the current file.
+        file_infos = []
         for frame_idx, data in enumerate(dataset):
 
             frame = dataset_pb2.Frame()
             frame.ParseFromString(bytearray(data.numpy()))
-            if (self.selected_waymo_locations is not None
-                    and frame.context.stats.location
-                    not in self.selected_waymo_locations):
-                continue
 
-            self.save_image(frame, file_idx, frame_idx)
-            self.save_calib(frame, file_idx, frame_idx)
-            self.save_lidar(frame, file_idx, frame_idx)
-            self.save_pose(frame, file_idx, frame_idx)
-            self.save_timestamp(frame, file_idx, frame_idx)
+            # Step 1. Extract camera images and lidar point clouds from waymo
+            # raw data in '*.tfrecord' and save as kitti format.
+            if self.save_senor_data:
+                self.save_image(frame, file_idx, frame_idx)
+                self.save_lidar(frame, file_idx, frame_idx)
 
-            if not self.test_mode:
-                # TODO save the depth image for waymo challenge solution.
-                self.save_label(frame, file_idx, frame_idx)
-                if self.save_cam_sync_labels:
-                    self.save_label(frame, file_idx, frame_idx, cam_sync=True)
+            # Step 2. Generate waymo train/val/test infos and save as pkl file.
+            # TODO save the depth image for waymo challenge solution.
+            self.create_waymo_info_file(frame, file_idx, frame_idx, file_infos)
+        return file_infos
 
     def __len__(self):
         """Length of the filename list."""
@@ -162,62 +209,6 @@ def save_image(self, frame, file_idx, frame_idx):
             with open(img_path, 'wb') as fp:
                 fp.write(img.image)
 
-    def save_calib(self, frame, file_idx, frame_idx):
-        """Parse and save the calibration data.
-
-        Args:
-            frame (:obj:`Frame`): Open dataset frame proto.
-            file_idx (int): Current file index.
-            frame_idx (int): Current frame index.
-        """
-        # waymo front camera to kitti reference camera
-        T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0],
-                                       [1.0, 0.0, 0.0]])
-        camera_calibs = []
-        R0_rect = [f'{i:e}' for i in np.eye(3).flatten()]
-        Tr_velo_to_cams = []
-        calib_context = ''
-
-        for camera in frame.context.camera_calibrations:
-            # extrinsic parameters
-            T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape(
-                4, 4)
-            T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle)
-            Tr_velo_to_cam = \
-                self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam
-            if camera.name == 1:  # FRONT = 1, see dataset.proto for details
-                self.T_velo_to_front_cam = Tr_velo_to_cam.copy()
-            Tr_velo_to_cam = Tr_velo_to_cam[:3, :].reshape((12, ))
-            Tr_velo_to_cams.append([f'{i:e}' for i in Tr_velo_to_cam])
-
-            # intrinsic parameters
-            camera_calib = np.zeros((3, 4))
-            camera_calib[0, 0] = camera.intrinsic[0]
-            camera_calib[1, 1] = camera.intrinsic[1]
-            camera_calib[0, 2] = camera.intrinsic[2]
-            camera_calib[1, 2] = camera.intrinsic[3]
-            camera_calib[2, 2] = 1
-            camera_calib = list(camera_calib.reshape(12))
-            camera_calib = [f'{i:e}' for i in camera_calib]
-            camera_calibs.append(camera_calib)
-
-        # all camera ids are saved as id-1 in the result because
-        # camera 0 is unknown in the proto
-        for i in range(5):
-            calib_context += 'P' + str(i) + ': ' + \
-                ' '.join(camera_calibs[i]) + '\n'
-        calib_context += 'R0_rect' + ': ' + ' '.join(R0_rect) + '\n'
-        for i in range(5):
-            calib_context += 'Tr_velo_to_cam_' + str(i) + ': ' + \
-                ' '.join(Tr_velo_to_cams[i]) + '\n'
-
-        with open(
-                f'{self.calib_save_dir}/{self.prefix}' +
-                f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt',
-                'w+') as fp_calib:
-            fp_calib.write(calib_context)
-            fp_calib.close()
-
     def save_lidar(self, frame, file_idx, frame_idx):
         """Parse and save the lidar data in psd format.
 
@@ -275,194 +266,6 @@ def save_lidar(self, frame, file_idx, frame_idx):
             f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin'
         point_cloud.astype(np.float32).tofile(pc_path)
 
-    def save_label(self, frame, file_idx, frame_idx, cam_sync=False):
-        """Parse and save the label data in txt format.
-        The relation between waymo and kitti coordinates is noteworthy:
-        1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti)
-        2. x-y-z: front-left-up (waymo) -> right-down-front(kitti)
-        3. bbox origin at volumetric center (waymo) -> bottom center (kitti)
-        4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo)
-
-        Args:
-            frame (:obj:`Frame`): Open dataset frame proto.
-            file_idx (int): Current file index.
-            frame_idx (int): Current frame index.
-            cam_sync (bool, optional): Whether to save the cam sync labels.
-                Defaults to False.
-        """
-        label_all_path = f'{self.label_all_save_dir}/{self.prefix}' + \
-            f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'
-        if cam_sync:
-            label_all_path = label_all_path.replace('label_',
-                                                    'cam_sync_label_')
-        fp_label_all = open(label_all_path, 'w+')
-        id_to_bbox = dict()
-        id_to_name = dict()
-        for labels in frame.projected_lidar_labels:
-            name = labels.name
-            for label in labels.labels:
-                # TODO: need a workaround as bbox may not belong to front cam
-                bbox = [
-                    label.box.center_x - label.box.length / 2,
-                    label.box.center_y - label.box.width / 2,
-                    label.box.center_x + label.box.length / 2,
-                    label.box.center_y + label.box.width / 2
-                ]
-                id_to_bbox[label.id] = bbox
-                id_to_name[label.id] = name - 1
-
-        for obj in frame.laser_labels:
-            bounding_box = None
-            name = None
-            id = obj.id
-            for proj_cam in self.cam_list:
-                if id + proj_cam in id_to_bbox:
-                    bounding_box = id_to_bbox.get(id + proj_cam)
-                    name = str(id_to_name.get(id + proj_cam))
-                    break
-
-            # NOTE: the 2D labels do not have strict correspondence with
-            # the projected 2D lidar labels
-            # e.g.: the projected 2D labels can be in camera 2
-            # while the most_visible_camera can have id 4
-            if cam_sync:
-                if obj.most_visible_camera_name:
-                    name = str(
-                        self.cam_list.index(
-                            f'_{obj.most_visible_camera_name}'))
-                    box3d = obj.camera_synced_box
-                else:
-                    continue
-            else:
-                box3d = obj.box
-
-            if bounding_box is None or name is None:
-                name = '0'
-                bounding_box = (0, 0, 0, 0)
-
-            my_type = self.type_list[obj.type]
-
-            if my_type not in self.selected_waymo_classes:
-                continue
-
-            if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1:
-                continue
-
-            my_type = self.waymo_to_kitti_class_map[my_type]
-
-            height = box3d.height
-            width = box3d.width
-            length = box3d.length
-
-            x = box3d.center_x
-            y = box3d.center_y
-            z = box3d.center_z - height / 2
-
-            # project bounding box to the virtual reference frame
-            pt_ref = self.T_velo_to_front_cam @ \
-                np.array([x, y, z, 1]).reshape((4, 1))
-            x, y, z, _ = pt_ref.flatten().tolist()
-
-            rotation_y = -box3d.heading - np.pi / 2
-            track_id = obj.id
-
-            # not available
-            truncated = 0
-            occluded = 0
-            alpha = -10
-
-            line = my_type + \
-                ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format(
-                    round(truncated, 2), occluded, round(alpha, 2),
-                    round(bounding_box[0], 2), round(bounding_box[1], 2),
-                    round(bounding_box[2], 2), round(bounding_box[3], 2),
-                    round(height, 2), round(width, 2), round(length, 2),
-                    round(x, 2), round(y, 2), round(z, 2),
-                    round(rotation_y, 2))
-
-            if self.save_track_id:
-                line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n'
-            else:
-                line_all = line[:-1] + ' ' + name + '\n'
-
-            label_path = f'{self.label_save_dir}{name}/{self.prefix}' + \
-                f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'
-            if cam_sync:
-                label_path = label_path.replace('label_', 'cam_sync_label_')
-            fp_label = open(label_path, 'a')
-            fp_label.write(line)
-            fp_label.close()
-
-            fp_label_all.write(line_all)
-
-        fp_label_all.close()
-
-    def save_pose(self, frame, file_idx, frame_idx):
-        """Parse and save the pose data.
-
-        Note that SDC's own pose is not included in the regular training
-        of KITTI dataset. KITTI raw dataset contains ego motion files
-        but are not often used. Pose is important for algorithms that
-        take advantage of the temporal information.
-
-        Args:
-            frame (:obj:`Frame`): Open dataset frame proto.
-            file_idx (int): Current file index.
-            frame_idx (int): Current frame index.
-        """
-        pose = np.array(frame.pose.transform).reshape(4, 4)
-        np.savetxt(
-            join(f'{self.pose_save_dir}/{self.prefix}' +
-                 f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'),
-            pose)
-
-    def save_timestamp(self, frame, file_idx, frame_idx):
-        """Save the timestamp data in a separate file instead of the
-        pointcloud.
-
-        Note that SDC's own pose is not included in the regular training
-        of KITTI dataset. KITTI raw dataset contains ego motion files
-        but are not often used. Pose is important for algorithms that
-        take advantage of the temporal information.
-
-        Args:
-            frame (:obj:`Frame`): Open dataset frame proto.
-            file_idx (int): Current file index.
-            frame_idx (int): Current frame index.
-        """
-        with open(
-                join(f'{self.timestamp_save_dir}/{self.prefix}' +
-                     f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'),
-                'w') as f:
-            f.write(str(frame.timestamp_micros))
-
-    def create_folder(self):
-        """Create folder for data preprocessing."""
-        if not self.test_mode:
-            dir_list1 = [
-                self.label_all_save_dir,
-                self.calib_save_dir,
-                self.pose_save_dir,
-                self.timestamp_save_dir,
-            ]
-            dir_list2 = [self.label_save_dir, self.image_save_dir]
-            if self.save_cam_sync_labels:
-                dir_list1.append(self.cam_sync_label_all_save_dir)
-                dir_list2.append(self.cam_sync_label_save_dir)
-        else:
-            dir_list1 = [
-                self.calib_save_dir, self.pose_save_dir,
-                self.timestamp_save_dir
-            ]
-            dir_list2 = [self.image_save_dir]
-        if 'testing_3d_camera_only_detection' not in self.load_dir:
-            dir_list1.append(self.point_cloud_save_dir)
-        for d in dir_list1:
-            mmengine.mkdir_or_exist(d)
-        for d in dir_list2:
-            for i in range(5):
-                mmengine.mkdir_or_exist(f'{d}{str(i)}')
-
     def convert_range_image_to_point_cloud(self,
                                            frame,
                                            range_images,
@@ -604,29 +407,317 @@ def cart_to_homo(self, mat):
             raise ValueError(mat.shape)
         return ret
 
+    def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos):
+        r"""Generate waymo train/val/test infos.
+
+        For more details about infos, please refer to:
+        https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html
+        """  # noqa: E501
+        frame_infos = dict()
+
+        # Gather frame infos
+        sample_idx = \
+            f'{self.prefix}{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}'
+        frame_infos['sample_idx'] = int(sample_idx)
+        frame_infos['timestamp'] = frame.timestamp_micros
+        frame_infos['ego2global'] = np.array(frame.pose.transform).reshape(
+            4, 4).astype(np.float32).tolist()
+        frame_infos['context_name'] = frame.context.name
+
+        # Gather camera infos
+        frame_infos['images'] = dict()
+        # waymo front camera to kitti reference camera
+        T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0],
+                                       [1.0, 0.0, 0.0]])
+        camera_calibs = []
+        Tr_velo_to_cams = []
+        for camera in frame.context.camera_calibrations:
+            # extrinsic parameters
+            T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape(
+                4, 4)
+            T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle)
+            Tr_velo_to_cam = \
+                self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam
+            Tr_velo_to_cams.append(Tr_velo_to_cam)
+
+            # intrinsic parameters
+            camera_calib = np.zeros((3, 4))
+            camera_calib[0, 0] = camera.intrinsic[0]
+            camera_calib[1, 1] = camera.intrinsic[1]
+            camera_calib[0, 2] = camera.intrinsic[2]
+            camera_calib[1, 2] = camera.intrinsic[3]
+            camera_calib[2, 2] = 1
+            camera_calibs.append(camera_calib)
+
+        for i, (cam_key, camera_calib, Tr_velo_to_cam) in enumerate(
+                zip(self.camera_types, camera_calibs, Tr_velo_to_cams)):
+            cam_infos = dict()
+            cam_infos['img_path'] = str(sample_idx) + '.jpg'
+            # NOTE: frame.images may be in a different order, so match by name
+            for img in frame.images:
+                if img.name == i + 1:
+                    width, height = Image.open(BytesIO(img.image)).size
+            cam_infos['height'] = height
+            cam_infos['width'] = width
+            cam_infos['lidar2cam'] = Tr_velo_to_cam.astype(np.float32).tolist()
+            cam_infos['cam2img'] = camera_calib.astype(np.float32).tolist()
+            cam_infos['lidar2img'] = (camera_calib @ Tr_velo_to_cam).astype(
+                np.float32).tolist()
+            frame_infos['images'][cam_key] = cam_infos
+
+        # Gather lidar infos
+        lidar_infos = dict()
+        lidar_infos['lidar_path'] = str(sample_idx) + '.bin'
+        lidar_infos['num_pts_feats'] = 6
+        frame_infos['lidar_points'] = lidar_infos
+
+        # Gather lidar sweeps and camera sweeps infos
+        # TODO: Add lidar2img in image sweeps infos when we need it.
+        # TODO: Consider merging lidar sweeps infos and image sweeps infos.
+        lidar_sweeps_infos, image_sweeps_infos = [], []
+        for prev_offset in range(-1, -self.max_sweeps - 1, -1):
+            prev_lidar_infos = dict()
+            prev_image_infos = dict()
+            if frame_idx + prev_offset >= 0:
+                prev_frame_infos = file_infos[prev_offset]
+                prev_lidar_infos['timestamp'] = prev_frame_infos['timestamp']
+                prev_lidar_infos['ego2global'] = prev_frame_infos['ego2global']
+                prev_lidar_infos['lidar_points'] = dict()
+                lidar_path = prev_frame_infos['lidar_points']['lidar_path']
+                prev_lidar_infos['lidar_points']['lidar_path'] = lidar_path
+                lidar_sweeps_infos.append(prev_lidar_infos)
+
+                prev_image_infos['timestamp'] = prev_frame_infos['timestamp']
+                prev_image_infos['ego2global'] = prev_frame_infos['ego2global']
+                prev_image_infos['images'] = dict()
+                for cam_key in self.camera_types:
+                    prev_image_infos['images'][cam_key] = dict()
+                    img_path = prev_frame_infos['images'][cam_key]['img_path']
+                    prev_image_infos['images'][cam_key]['img_path'] = img_path
+                image_sweeps_infos.append(prev_image_infos)
+        if lidar_sweeps_infos:
+            frame_infos['lidar_sweeps'] = lidar_sweeps_infos
+        if image_sweeps_infos:
+            frame_infos['image_sweeps'] = image_sweeps_infos
+
+        if not self.test_mode:
+            # Gather instance infos used for lidar-based 3D detection
+            frame_infos['instances'] = self.gather_instance_info(frame)
+            # Gather cam_sync_instances infos, which are used for image-based
+            # (multi-view) 3D detection.
+            if self.save_cam_sync_instances:
+                frame_infos['cam_sync_instances'] = self.gather_instance_info(
+                    frame, cam_sync=True)
+            # Gather cam_instances infos, which are used for image-based
+            # (monocular) 3D detection (optional).
+            # TODO: Should we use cam_sync_instances to generate cam_instances?
+            if self.save_cam_instances:
+                frame_infos['cam_instances'] = self.gather_cam_instance_info(
+                    copy.deepcopy(frame_infos['instances']),
+                    frame_infos['images'])
+        file_infos.append(frame_infos)
+
+    def gather_instance_info(self, frame, cam_sync=False):
+        """Generate instances and cam_sync_instances infos.
+
+        For more details about infos, please refer to:
+        https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html
+        """  # noqa: E501
+        id_to_bbox = dict()
+        id_to_name = dict()
+        for labels in frame.projected_lidar_labels:
+            name = labels.name
+            for label in labels.labels:
+                # TODO: need a workaround as bbox may not belong to front cam
+                bbox = [
+                    label.box.center_x - label.box.length / 2,
+                    label.box.center_y - label.box.width / 2,
+                    label.box.center_x + label.box.length / 2,
+                    label.box.center_y + label.box.width / 2
+                ]
+                id_to_bbox[label.id] = bbox
+                id_to_name[label.id] = name - 1
+
+        group_id = 0
+        instance_infos = []
+        for obj in frame.laser_labels:
+            instance_info = dict()
+            bounding_box = None
+            name = None
+            id = obj.id
+            for proj_cam in self.cam_list:
+                if id + proj_cam in id_to_bbox:
+                    bounding_box = id_to_bbox.get(id + proj_cam)
+                    name = id_to_name.get(id + proj_cam)
+                    break
+
+            # NOTE: the 2D labels do not have strict correspondence with
+            # the projected 2D lidar labels
+            # e.g.: the projected 2D labels can be in camera 2
+            # while the most_visible_camera can have id 4
+            if cam_sync:
+                if obj.most_visible_camera_name:
+                    name = self.cam_list.index(
+                        f'_{obj.most_visible_camera_name}')
+                    box3d = obj.camera_synced_box
+                else:
+                    continue
+            else:
+                box3d = obj.box
+
+            if bounding_box is None or name is None:
+                name = 0
+                bounding_box = [0.0, 0.0, 0.0, 0.0]
+
+            my_type = self.type_list[obj.type]
+
+            if my_type not in self.selected_waymo_classes:
+                continue
+            else:
+                label = self.selected_waymo_classes.index(my_type)
+
+            if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1:
+                continue
+
+            group_id += 1
+            instance_info['group_id'] = group_id
+            instance_info['camera_id'] = name
+            instance_info['bbox'] = bounding_box
+            instance_info['bbox_label'] = label
+
+            height = box3d.height
+            width = box3d.width
+            length = box3d.length
+
+            # NOTE: We save the bottom center of 3D bboxes.
+            x = box3d.center_x
+            y = box3d.center_y
+            z = box3d.center_z - height / 2
+
+            rotation_y = box3d.heading
+
+            instance_info['bbox_3d'] = np.array(
+                [x, y, z, length, width, height,
+                 rotation_y]).astype(np.float32).tolist()
+            instance_info['bbox_label_3d'] = label
+            instance_info['num_lidar_pts'] = obj.num_lidar_points_in_box
+
+            if self.save_track_id:
+                instance_info['track_id'] = obj.id
+            instance_infos.append(instance_info)
+        return instance_infos
+
+    def gather_cam_instance_info(self, instances: dict, images: dict):
+        """Generate cam_instances infos.
+
+        For more details about infos, please refer to:
+        https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html
+        """  # noqa: E501
+        cam_instances = dict()
+        for cam_type in self.camera_types:
+            lidar2cam = np.array(images[cam_type]['lidar2cam'])
+            cam2img = np.array(images[cam_type]['cam2img'])
+            cam_instances[cam_type] = []
+            for instance in instances:
+                cam_instance = dict()
+                gt_bboxes_3d = np.array(instance['bbox_3d'])
+                # Convert lidar coordinates to camera coordinates
+                gt_bboxes_3d = LiDARInstance3DBoxes(
+                    gt_bboxes_3d[None, :]).convert_to(
+                        Box3DMode.CAM, lidar2cam, correct_yaw=True)
+                corners_3d = gt_bboxes_3d.corners.numpy()
+                corners_3d = corners_3d[0].T  # (1, 8, 3) -> (3, 8)
+                in_camera = np.argwhere(corners_3d[2, :] > 0).flatten()
+                corners_3d = corners_3d[:, in_camera]
+                # Project 3d box to 2d.
+                corner_coords = view_points(corners_3d, cam2img,
+                                            True).T[:, :2].tolist()
+
+                # Keep only corners that fall within the image.
+                # TODO: imsize should be determined by the current image size
+                # CAM_FRONT: (1920, 1280)
+                # CAM_FRONT_LEFT: (1920, 1280)
+                # CAM_SIDE_LEFT: (1920, 886)
+                final_coords = post_process_coords(
+                    corner_coords,
+                    imsize=(images['CAM_FRONT']['width'],
+                            images['CAM_FRONT']['height']))
+
+                # Skip if the convex hull of the re-projected corners
+                # does not intersect the image canvas.
+                if final_coords is None:
+                    continue
+                else:
+                    min_x, min_y, max_x, max_y = final_coords
+
+                cam_instance['bbox'] = [min_x, min_y, max_x, max_y]
+                cam_instance['bbox_label'] = instance['bbox_label']
+                cam_instance['bbox_3d'] = gt_bboxes_3d.numpy().squeeze(
+                ).astype(np.float32).tolist()
+                cam_instance['bbox_label_3d'] = instance['bbox_label_3d']
+
+                center_3d = gt_bboxes_3d.gravity_center.numpy()
+                center_2d_with_depth = points_cam2img(
+                    center_3d, cam2img, with_depth=True)
+                center_2d_with_depth = center_2d_with_depth.squeeze().tolist()
+
+                # normalized center2D + depth
+                # samples whose center depth is <= 0 are skipped
+                if center_2d_with_depth[2] <= 0:
+                    continue
+                cam_instance['center_2d'] = center_2d_with_depth[:2]
+                cam_instance['depth'] = center_2d_with_depth[2]
+
+                # TODO: Discuss whether following info is necessary
+                cam_instance['bbox_3d_isvalid'] = True
+                cam_instance['velocity'] = -1
+                cam_instances[cam_type].append(cam_instance)
+
+        return cam_instances
+
+    def merge_trainval_infos(self):
+        """Merge training and validation infos into a single file."""
+        train_infos_path = osp.join(
+            osp.dirname(self.save_dir), f'{self.info_prefix}_infos_train.pkl')
+        val_infos_path = osp.join(
+            osp.dirname(self.save_dir), f'{self.info_prefix}_infos_val.pkl')
+        train_infos = mmengine.load(train_infos_path)
+        val_infos = mmengine.load(val_infos_path)
+        trainval_infos = dict(
+            metainfo=train_infos['metainfo'],
+            data_list=train_infos['data_list'] + val_infos['data_list'])
+        mmengine.dump(
+            trainval_infos,
+            osp.join(
+                osp.dirname(self.save_dir),
+                f'{self.info_prefix}_infos_trainval.pkl'))
+
 
 def create_ImageSets_img_ids(root_dir, splits):
+    """Create txt files indicating what to collect in each split."""
     save_dir = join(root_dir, 'ImageSets/')
     if not exists(save_dir):
         os.mkdir(save_dir)
 
-    idx_all = [[] for i in splits]
+    idx_all = [[] for _ in splits]
     for i, split in enumerate(splits):
-        path = join(root_dir, splits[i], 'calib')
+        path = join(root_dir, split, 'image_0')
         if not exists(path):
             RawNames = []
         else:
             RawNames = os.listdir(path)
 
         for name in RawNames:
-            if name.endswith('.txt'):
-                idx = name.replace('.txt', '\n')
+            if name.endswith('.jpg'):
+                idx = name.replace('.jpg', '\n')
                 idx_all[int(idx[0])].append(idx)
         idx_all[i].sort()
 
     open(save_dir + 'train.txt', 'w').writelines(idx_all[0])
     open(save_dir + 'val.txt', 'w').writelines(idx_all[1])
     open(save_dir + 'trainval.txt', 'w').writelines(idx_all[0] + idx_all[1])
-    open(save_dir + 'test.txt', 'w').writelines(idx_all[2])
-    # open(save_dir+'test_cam_only.txt','w').writelines(idx_all[3])
+    if len(idx_all) >= 3:
+        open(save_dir + 'test.txt', 'w').writelines(idx_all[2])
+    if len(idx_all) >= 4:
+        open(save_dir + 'test_cam_only.txt', 'w').writelines(idx_all[3])
     print('created txt files indicating what to collect in ', splits)