diff --git a/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw-x_8xb320-270e_cocktail13-384x288.py b/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw-x_8xb320-270e_cocktail13-384x288.py
new file mode 100644
index 0000000000..d525df746b
--- /dev/null
+++ b/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw-x_8xb320-270e_cocktail13-384x288.py
@@ -0,0 +1,607 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import read_base
+
+with read_base():
+ from mmpose.configs._base_.default_runtime import * # noqa
+
+from albumentations.augmentations import Blur, CoarseDropout, MedianBlur
+from mmdet.engine.hooks import PipelineSwitchHook
+from mmengine.dataset import DefaultSampler
+from mmengine.hooks import EMAHook
+from mmengine.model import PretrainedInit
+from mmengine.optim import CosineAnnealingLR, LinearLR, OptimWrapper
+from torch.nn import SiLU, SyncBatchNorm
+from torch.optim import AdamW
+
+from mmpose.codecs import SimCCLabel
+from mmpose.datasets import (AicDataset, CocoWholeBodyDataset, COFWDataset,
+ CombinedDataset, CrowdPoseDataset,
+ Face300WDataset, GenerateTarget,
+ GetBBoxCenterScale, HalpeDataset,
+ HumanArt21Dataset, JhmdbDataset,
+ KeypointConverter, LapaDataset, LoadImage,
+ MpiiDataset, PackPoseInputs, PoseTrack18Dataset,
+ RandomFlip, RandomHalfBody, TopdownAffine,
+ UBody2dDataset, WFLWDataset)
+from mmpose.datasets.transforms.common_transforms import (
+ Albumentation, PhotometricDistortion, RandomBBoxTransform)
+from mmpose.engine.hooks import ExpMomentumEMA
+from mmpose.evaluation import CocoWholeBodyMetric
+from mmpose.models import (CSPNeXt, CSPNeXtPAFPN, KLDiscretLoss,
+ PoseDataPreprocessor, RTMWHead,
+ TopdownPoseEstimator)
+
+# common setting
+num_keypoints = 133
+input_size = (288, 384)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 10
+base_lr = 5e-4
+train_batch_size = 320
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type=OptimWrapper,
+ optimizer=dict(type=AdamW, lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type=LinearLR, start_factor=1.0e-5, by_epoch=False, begin=0, end=1000),
+ dict(
+ type=CosineAnnealingLR,
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=5632)
+
+# codec settings
+codec = dict(
+ type=SimCCLabel,
+ input_size=input_size,
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type=TopdownPoseEstimator,
+ data_preprocessor=dict(
+ type=PoseDataPreprocessor,
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type=CSPNeXt,
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ channel_attention=True,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type=SiLU),
+ init_cfg=dict(
+ type=PretrainedInit,
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa
+ )),
+ neck=dict(
+ type=CSPNeXtPAFPN,
+ in_channels=[320, 640, 1280],
+ out_channels=None,
+ out_indices=(
+ 1,
+ 2,
+ ),
+ num_csp_blocks=2,
+ expand_ratio=0.5,
+ norm_cfg=dict(type=SyncBatchNorm),
+ act_cfg=dict(type=SiLU, inplace=True)),
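+    # the neck returns two pyramid levels (strides 16 and 32); RTMWHead fuses both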
+ head=dict(
+ type=RTMWHead,
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn=SiLU,
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type=KLDiscretLoss,
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = CocoWholeBodyDataset
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type=LoadImage, backend_args=backend_args),
+ dict(type=GetBBoxCenterScale),
+ dict(type=RandomFlip, direction='horizontal'),
+ dict(type=RandomHalfBody),
+ dict(type=RandomBBoxTransform, scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type=TopdownAffine, input_size=codec['input_size']),
+ dict(type=PhotometricDistortion),
+ dict(
+ type=Albumentation,
+ transforms=[
+ dict(type=Blur, p=0.1),
+ dict(type=MedianBlur, p=0.1),
+ dict(
+ type=CoarseDropout,
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type=GenerateTarget, encoder=codec, use_dataset_keypoint_weights=True),
+ dict(type=PackPoseInputs)
+]
+val_pipeline = [
+ dict(type=LoadImage, backend_args=backend_args),
+ dict(type=GetBBoxCenterScale),
+ dict(type=TopdownAffine, input_size=codec['input_size']),
+ dict(type=PackPoseInputs)
+]
+
+train_pipeline_stage2 = [
+ dict(type=LoadImage, backend_args=backend_args),
+ dict(type=GetBBoxCenterScale),
+ dict(type=RandomFlip, direction='horizontal'),
+ dict(type=RandomHalfBody),
+ dict(
+ type=RandomBBoxTransform,
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type=TopdownAffine, input_size=codec['input_size']),
+ dict(
+ type=Albumentation,
+ transforms=[
+ dict(type=Blur, p=0.1),
+ dict(type=MedianBlur, p=0.1),
+ dict(
+ type=CoarseDropout,
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type=GenerateTarget, encoder=codec, use_dataset_keypoint_weights=True),
+ dict(type=PackPoseInputs)
+]
+
+# mapping
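+# Each pair below maps a source-dataset keypoint index to the corresponding
+# COCO-WholeBody (133 keypoints) index used by KeypointConverter.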
+
+aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
+
+crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)]
+
+mpii_coco133 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco133 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco133 = [(i, i)
+ for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21),
+ (24, 19),
+ (25, 22)] + [(i, i - 3)
+ for i in range(26, 136)]
+
+posetrack_coco133 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120),
+ (19, 17), (20, 20)]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type=AicDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+            type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=aic_coco133)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type=CrowdPoseDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_coco133)
+ ],
+)
+
+dataset_mpii = dict(
+ type=MpiiDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=mpii_coco133)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type=JhmdbDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_coco133)
+ ],
+)
+
+dataset_halpe = dict(
+ type=HalpeDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=halpe_coco133)
+ ],
+)
+
+dataset_posetrack = dict(
+ type=PoseTrack18Dataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=posetrack_coco133)
+ ],
+)
+
+dataset_humanart = dict(
+ type=HumanArt21Dataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart.json',
+ filter_cfg=dict(scenes=['real_human']),
+ data_prefix=dict(img='pose/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=humanart_coco133)
+ ])
+
+ubody_scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+ubody_datasets = []
+for scene in ubody_scenes:
+ each = dict(
+ type=UBody2dDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'Ubody/annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='pose/UBody/images/'),
+ pipeline=[],
+ sample_interval=10)
+ ubody_datasets.append(each)
+
+dataset_ubody = dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'),
+ datasets=ubody_datasets,
+ pipeline=[],
+ test_mode=False,
+)
+
+face_pipeline = [
+ dict(type=LoadImage, backend_args=backend_args),
+ dict(type=GetBBoxCenterScale),
+ dict(
+ type=RandomBBoxTransform,
+ shift_factor=0.,
+ scale_factor=[0.3, 0.5],
+ rotate_factor=0),
+ dict(type=TopdownAffine, input_size=codec['input_size']),
+]
+
+wflw_coco133 = [(i * 2, 20 + i)
+ for i in range(17)] + [(33 + i, 41 + i) for i in range(5)] + [
+ (42 + i, 46 + i) for i in range(5)
+ ] + [(51 + i, 50 + i)
+ for i in range(9)] + [(60, 59), (61, 60), (63, 61),
+ (64, 62), (65, 63), (67, 64),
+ (68, 65), (69, 66), (71, 67),
+ (72, 68), (73, 69),
+ (75, 70)] + [(76 + i, 71 + i)
+ for i in range(20)]
+dataset_wflw = dict(
+ type=WFLWDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=wflw_coco133), *face_pipeline
+ ],
+)
+
+mapping_300w_coco133 = [(i, 20 + i) for i in range(68)]
+dataset_300w = dict(
+ type=Face300WDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=mapping_300w_coco133), *face_pipeline
+ ],
+)
+
+cofw_coco133 = [(0, 41), (2, 45), (4, 43), (1, 47), (3, 49), (6, 45), (8, 59),
+ (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53),
+ (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89),
+ (27, 80), (28, 28)]
+dataset_cofw = dict(
+ type=COFWDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=cofw_coco133), *face_pipeline
+ ],
+)
+
+lapa_coco133 = [(i * 2, 20 + i) for i in range(17)] + [
+ (33 + i, 41 + i) for i in range(5)
+] + [(42 + i, 46 + i) for i in range(5)] + [
+ (51 + i, 50 + i) for i in range(4)
+] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61),
+ (70, 62), (71, 63), (73, 64),
+ (75, 65), (76, 66), (78, 67),
+ (79, 68), (80, 69),
+ (82, 70)] + [(84 + i, 71 + i)
+ for i in range(20)]
+dataset_lapa = dict(
+ type=LapaDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=lapa_coco133), *face_pipeline
+ ],
+)
+
+dataset_wb = dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_coco, dataset_halpe, dataset_ubody],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_body = dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_posetrack,
+ dataset_humanart,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_face = dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_wflw,
+ dataset_300w,
+ dataset_cofw,
+ dataset_lapa,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+train_datasets = [
+ dataset_wb,
+ dataset_body,
+ dataset_face,
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=True),
+ dataset=dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type=DefaultSampler, shuffle=False, round_up=False),
+ dataset=dict(
+ type=CocoWholeBodyDataset,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/detection/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+custom_hooks = [
+ dict(
+ type=EMAHook,
+ ema_type=ExpMomentumEMA,
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type=PipelineSwitchHook,
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type=CocoWholeBodyMetric,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw-x_8xb704-270e_cocktail13-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw-x_8xb704-270e_cocktail13-256x192.py
new file mode 100644
index 0000000000..02cdfd4ee3
--- /dev/null
+++ b/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw-x_8xb704-270e_cocktail13-256x192.py
@@ -0,0 +1,607 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import read_base
+
+with read_base():
+ from mmpose.configs._base_.default_runtime import * # noqa
+
+from albumentations.augmentations import Blur, CoarseDropout, MedianBlur
+from mmdet.engine.hooks import PipelineSwitchHook
+from mmengine.dataset import DefaultSampler
+from mmengine.hooks import EMAHook
+from mmengine.model import PretrainedInit
+from mmengine.optim import CosineAnnealingLR, LinearLR, OptimWrapper
+from torch.nn import SiLU, SyncBatchNorm
+from torch.optim import AdamW
+
+from mmpose.codecs import SimCCLabel
+from mmpose.datasets import (AicDataset, CocoWholeBodyDataset, COFWDataset,
+ CombinedDataset, CrowdPoseDataset,
+ Face300WDataset, GenerateTarget,
+ GetBBoxCenterScale, HalpeDataset,
+ HumanArt21Dataset, JhmdbDataset,
+ KeypointConverter, LapaDataset, LoadImage,
+ MpiiDataset, PackPoseInputs, PoseTrack18Dataset,
+ RandomFlip, RandomHalfBody, TopdownAffine,
+ UBody2dDataset, WFLWDataset)
+from mmpose.datasets.transforms.common_transforms import (
+ Albumentation, PhotometricDistortion, RandomBBoxTransform)
+from mmpose.engine.hooks import ExpMomentumEMA
+from mmpose.evaluation import CocoWholeBodyMetric
+from mmpose.models import (CSPNeXt, CSPNeXtPAFPN, KLDiscretLoss,
+ PoseDataPreprocessor, RTMWHead,
+ TopdownPoseEstimator)
+
+# common setting
+num_keypoints = 133
+input_size = (192, 256)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 10
+base_lr = 5e-4
+train_batch_size = 704
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type=OptimWrapper,
+ optimizer=dict(type=AdamW, lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type=LinearLR, start_factor=1.0e-5, by_epoch=False, begin=0, end=1000),
+ dict(
+ type=CosineAnnealingLR,
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=5632)
+
+# codec settings
+codec = dict(
+ type=SimCCLabel,
+ input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type=TopdownPoseEstimator,
+ data_preprocessor=dict(
+ type=PoseDataPreprocessor,
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type=CSPNeXt,
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ channel_attention=True,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type=SiLU),
+ init_cfg=dict(
+ type=PretrainedInit,
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-256x192-05f5bcb7_20230822.pth' # noqa
+ )),
+ neck=dict(
+ type=CSPNeXtPAFPN,
+ in_channels=[320, 640, 1280],
+ out_channels=None,
+ out_indices=(
+ 1,
+ 2,
+ ),
+ num_csp_blocks=2,
+ expand_ratio=0.5,
+ norm_cfg=dict(type=SyncBatchNorm),
+ act_cfg=dict(type=SiLU, inplace=True)),
+ head=dict(
+ type=RTMWHead,
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn=SiLU,
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type=KLDiscretLoss,
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = CocoWholeBodyDataset
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type=LoadImage, backend_args=backend_args),
+ dict(type=GetBBoxCenterScale),
+ dict(type=RandomFlip, direction='horizontal'),
+ dict(type=RandomHalfBody),
+ dict(type=RandomBBoxTransform, scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type=TopdownAffine, input_size=codec['input_size']),
+ dict(type=PhotometricDistortion),
+ dict(
+ type=Albumentation,
+ transforms=[
+ dict(type=Blur, p=0.1),
+ dict(type=MedianBlur, p=0.1),
+ dict(
+ type=CoarseDropout,
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type=GenerateTarget, encoder=codec, use_dataset_keypoint_weights=True),
+ dict(type=PackPoseInputs)
+]
+val_pipeline = [
+ dict(type=LoadImage, backend_args=backend_args),
+ dict(type=GetBBoxCenterScale),
+ dict(type=TopdownAffine, input_size=codec['input_size']),
+ dict(type=PackPoseInputs)
+]
+
+train_pipeline_stage2 = [
+ dict(type=LoadImage, backend_args=backend_args),
+ dict(type=GetBBoxCenterScale),
+ dict(type=RandomFlip, direction='horizontal'),
+ dict(type=RandomHalfBody),
+ dict(
+ type=RandomBBoxTransform,
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type=TopdownAffine, input_size=codec['input_size']),
+ dict(
+ type=Albumentation,
+ transforms=[
+ dict(type=Blur, p=0.1),
+ dict(type=MedianBlur, p=0.1),
+ dict(
+ type=CoarseDropout,
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type=GenerateTarget, encoder=codec, use_dataset_keypoint_weights=True),
+ dict(type=PackPoseInputs)
+]
+
+# mapping
+
+aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
+
+crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)]
+
+mpii_coco133 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco133 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco133 = [(i, i)
+ for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21),
+ (24, 19),
+ (25, 22)] + [(i, i - 3)
+ for i in range(26, 136)]
+
+posetrack_coco133 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120),
+ (19, 17), (20, 20)]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type=AicDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+            type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=aic_coco133)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type=CrowdPoseDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_coco133)
+ ],
+)
+
+dataset_mpii = dict(
+ type=MpiiDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=mpii_coco133)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type=JhmdbDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_coco133)
+ ],
+)
+
+dataset_halpe = dict(
+ type=HalpeDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=halpe_coco133)
+ ],
+)
+
+dataset_posetrack = dict(
+ type=PoseTrack18Dataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=posetrack_coco133)
+ ],
+)
+
+dataset_humanart = dict(
+ type=HumanArt21Dataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart.json',
+ filter_cfg=dict(scenes=['real_human']),
+ data_prefix=dict(img='pose/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=humanart_coco133)
+ ])
+
+ubody_scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+ubody_datasets = []
+for scene in ubody_scenes:
+ each = dict(
+ type=UBody2dDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'Ubody/annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='pose/UBody/images/'),
+ pipeline=[],
+ sample_interval=10)
+ ubody_datasets.append(each)
+
+dataset_ubody = dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'),
+ datasets=ubody_datasets,
+ pipeline=[],
+ test_mode=False,
+)
+
+face_pipeline = [
+ dict(type=LoadImage, backend_args=backend_args),
+ dict(type=GetBBoxCenterScale),
+ dict(
+ type=RandomBBoxTransform,
+ shift_factor=0.,
+ scale_factor=[0.3, 0.5],
+ rotate_factor=0),
+ dict(type=TopdownAffine, input_size=codec['input_size']),
+]
+
+wflw_coco133 = [(i * 2, 20 + i)
+ for i in range(17)] + [(33 + i, 41 + i) for i in range(5)] + [
+ (42 + i, 46 + i) for i in range(5)
+ ] + [(51 + i, 50 + i)
+ for i in range(9)] + [(60, 59), (61, 60), (63, 61),
+ (64, 62), (65, 63), (67, 64),
+ (68, 65), (69, 66), (71, 67),
+ (72, 68), (73, 69),
+ (75, 70)] + [(76 + i, 71 + i)
+ for i in range(20)]
+dataset_wflw = dict(
+ type=WFLWDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=wflw_coco133), *face_pipeline
+ ],
+)
+
+mapping_300w_coco133 = [(i, 20 + i) for i in range(68)]
+dataset_300w = dict(
+ type=Face300WDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=mapping_300w_coco133), *face_pipeline
+ ],
+)
+
+cofw_coco133 = [(0, 41), (2, 45), (4, 43), (1, 47), (3, 49), (6, 45), (8, 59),
+ (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53),
+ (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89),
+ (27, 80), (28, 28)]
+dataset_cofw = dict(
+ type=COFWDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=cofw_coco133), *face_pipeline
+ ],
+)
+
+lapa_coco133 = [(i * 2, 20 + i) for i in range(17)] + [
+ (33 + i, 41 + i) for i in range(5)
+] + [(42 + i, 46 + i) for i in range(5)] + [
+ (51 + i, 50 + i) for i in range(4)
+] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61),
+ (70, 62), (71, 63), (73, 64),
+ (75, 65), (76, 66), (78, 67),
+ (79, 68), (80, 69),
+ (82, 70)] + [(84 + i, 71 + i)
+ for i in range(20)]
+dataset_lapa = dict(
+ type=LapaDataset,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[
+ dict(
+ type=KeypointConverter,
+ num_keypoints=num_keypoints,
+ mapping=lapa_coco133), *face_pipeline
+ ],
+)
+
+dataset_wb = dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_coco, dataset_halpe, dataset_ubody],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_body = dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_posetrack,
+ dataset_humanart,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_face = dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_wflw,
+ dataset_300w,
+ dataset_cofw,
+ dataset_lapa,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+train_datasets = [
+ dataset_wb,
+ dataset_body,
+ dataset_face,
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=True),
+ dataset=dict(
+ type=CombinedDataset,
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type=DefaultSampler, shuffle=False, round_up=False),
+ dataset=dict(
+ type=CocoWholeBodyDataset,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/detection/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+custom_hooks = [
+ dict(
+ type=EMAHook,
+ ema_type=ExpMomentumEMA,
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type=PipelineSwitchHook,
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type=CocoWholeBodyMetric,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw_cocktail13.md b/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw_cocktail13.md
new file mode 100644
index 0000000000..54e75383ba
--- /dev/null
+++ b/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw_cocktail13.md
@@ -0,0 +1,76 @@
+
+
+
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+
+```
+
+
+
+
+
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+- `Cocktail13` denotes that the model is trained on 13 public datasets:
+ - [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic)
+ - [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose)
+ - [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii)
+ - [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset)
+ - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe)
+ - [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18)
+ - [COCO-Wholebody](https://github.com/jin-s13/COCO-WholeBody/)
+ - [UBody](https://github.com/IDEA-Research/OSX)
+ - [Human-Art](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#human-art-dataset)
+ - [WFLW](https://wywu.github.io/projects/LAB/WFLW.html)
+ - [300W](https://ibug.doc.ic.ac.uk/resources/300-W/)
+ - [COFW](http://www.vision.caltech.edu/xpburgos/ICCV13/)
+ - [LaPa](https://github.com/JDAI-CV/lapa-dataset)
+
+Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: |
+| [rtmw-x](/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw-x_8xb704-270e_cocktail13-256x192.py) | 256x192 | 0.753 | 0.815 | 0.773 | 0.869 | 0.843 | 0.894 | 0.602 | 0.703 | 0.672 | 0.754 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail13_pt-ucoco_270e-256x192-fbef0d61_20230925.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail13_pt-ucoco_270e-256x192-fbef0d61_20230925.json) |
+| [rtmw-x](/configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw-x_8xb320-270e_cocktail13-384x288.py) | 384x288 | 0.764 | 0.825 | 0.791 | 0.883 | 0.882 | 0.922 | 0.654 | 0.744 | 0.702 | 0.779 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail13_pt-ucoco_270e-384x288-0949e3a9_20230925.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail13_pt-ucoco_270e-384x288-0949e3a9_20230925.json) |
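+
+The released checkpoints can be used with MMPose's high-level inference APIs.
+The snippet below is a minimal sketch: `demo.jpg` and the local checkpoint path
+are placeholders for your own image and a downloaded copy of the checkpoint
+linked above.
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+# config from this folder; the checkpoint is a locally downloaded copy
+cfg = 'configs/wholebody_2d_keypoint/rtmpose/cocktail13/rtmw-x_8xb704-270e_cocktail13-256x192.py'
+ckpt = 'rtmw-x_simcc-cocktail13_pt-ucoco_270e-256x192-fbef0d61_20230925.pth'
+
+model = init_model(cfg, ckpt, device='cpu')
+# with no explicit bboxes, the whole image is treated as a single person bbox
+results = inference_topdown(model, 'demo.jpg')
+keypoints = results[0].pred_instances.keypoints  # (num_instances, 133, 2)
+```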
diff --git a/mmpose/datasets/transforms/__init__.py b/mmpose/datasets/transforms/__init__.py
index b07a18885c..54ad7f3159 100644
--- a/mmpose/datasets/transforms/__init__.py
+++ b/mmpose/datasets/transforms/__init__.py
@@ -6,7 +6,7 @@
GenerateTarget, GetBBoxCenterScale,
PhotometricDistortion, RandomBBoxTransform,
RandomFlip, RandomHalfBody, YOLOXHSVRandomAug)
-from .converting import KeypointConverter
+from .converting import KeypointConverter, SingleHandConverter
from .formatting import PackPoseInputs
from .hand_transforms import HandRandomFlip
from .loading import LoadImage
@@ -21,5 +21,6 @@
'BottomupGetHeatmapMask', 'BottomupRandomAffine', 'BottomupResize',
'GenerateTarget', 'KeypointConverter', 'RandomFlipAroundRoot',
'FilterAnnotations', 'YOLOXHSVRandomAug', 'YOLOXMixUp', 'Mosaic',
- 'BottomupRandomCrop', 'BottomupRandomChoiceResize', 'HandRandomFlip'
+ 'BottomupRandomCrop', 'BottomupRandomChoiceResize', 'HandRandomFlip',
+ 'SingleHandConverter'
]
diff --git a/mmpose/datasets/transforms/common_transforms.py b/mmpose/datasets/transforms/common_transforms.py
index 98aed11683..33f9c560c0 100644
--- a/mmpose/datasets/transforms/common_transforms.py
+++ b/mmpose/datasets/transforms/common_transforms.py
@@ -503,8 +503,13 @@ def _get_transform_params(self, num_bboxes: int) -> Tuple:
- scale (np.ndarray): Scaling factor of each bbox in shape (n, 1)
- rotate (np.ndarray): Rotation degree of each bbox in shape (n,)
"""
+ random_v = self._truncnorm(size=(num_bboxes, 4))
+ offset_v = random_v[:, :2]
+ scale_v = random_v[:, 2:3]
+ rotate_v = random_v[:, 3]
+
# Get shift parameters
- offset = self._truncnorm(size=(num_bboxes, 2)) * self.shift_factor
+ offset = offset_v * self.shift_factor
offset = np.where(
np.random.rand(num_bboxes, 1) < self.shift_prob, offset, 0.)
@@ -512,12 +517,12 @@ def _get_transform_params(self, num_bboxes: int) -> Tuple:
scale_min, scale_max = self.scale_factor
mu = (scale_max + scale_min) * 0.5
sigma = (scale_max - scale_min) * 0.5
- scale = self._truncnorm(size=(num_bboxes, 1)) * sigma + mu
+ scale = scale_v * sigma + mu
scale = np.where(
np.random.rand(num_bboxes, 1) < self.scale_prob, scale, 1.)
# Get rotation parameters
- rotate = self._truncnorm(size=(num_bboxes, )) * self.rotate_factor
+ rotate = rotate_v * self.rotate_factor
rotate = np.where(
np.random.rand(num_bboxes) < self.rotate_prob, rotate, 0.)
diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py
index 1906f16972..fc8b335e26 100644
--- a/mmpose/datasets/transforms/converting.py
+++ b/mmpose/datasets/transforms/converting.py
@@ -62,7 +62,10 @@ def __init__(self, num_keypoints: int,
int]]]):
self.num_keypoints = num_keypoints
self.mapping = mapping
- source_index, target_index = zip(*mapping)
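+        # an empty mapping is allowed; the converted keypoints are then all zeros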
+ if len(mapping):
+ source_index, target_index = zip(*mapping)
+ else:
+ source_index, target_index = [], []
src1, src2 = [], []
interpolation = False
@@ -89,6 +92,9 @@ def __init__(self, num_keypoints: int,
def transform(self, results: dict) -> dict:
"""Transforms the keypoint results to match the target keypoints."""
num_instances = results['keypoints'].shape[0]
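+        # keypoints_visible may carry an extra trailing dimension; keep only
+        # the per-keypoint visibility flag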
+        keypoints_visible = results['keypoints_visible']
+        if keypoints_visible.ndim > 2:
+            results['keypoints_visible'] = keypoints_visible[:, :, 0]
# Initialize output arrays
keypoints = np.zeros((num_instances, self.num_keypoints, 3))
@@ -186,3 +192,83 @@ def __repr__(self) -> str:
repr_str += f'(num_keypoints={self.num_keypoints}, '\
f'mapping={self.mapping})'
return repr_str
+
+
+@TRANSFORMS.register_module()
+class SingleHandConverter(BaseTransform):
+ """Mapping a single hand keypoints into double hands according to the given
+ mapping and hand type.
+
+ Required Keys:
+
+ - keypoints
+ - keypoints_visible
+ - hand_type
+
+ Modified Keys:
+
+ - keypoints
+ - keypoints_visible
+
+ Args:
+ num_keypoints (int): The number of keypoints in target dataset.
+ left_hand_mapping (list): A list containing mapping indexes. Each
+ element has format (source_index, target_index)
+ right_hand_mapping (list): A list containing mapping indexes. Each
+ element has format (source_index, target_index)
+
+ Example:
+ >>> import numpy as np
+ >>> self = SingleHandConverter(
+ >>> num_keypoints=42,
+ >>> left_hand_mapping=[
+ >>> (0, 0), (1, 1), (2, 2), (3, 3)
+ >>> ],
+ >>> right_hand_mapping=[
+ >>> (0, 21), (1, 22), (2, 23), (3, 24)
+ >>> ])
+ >>> results = dict(
+ >>> keypoints=np.arange(84).reshape(2, 21, 2),
+ >>> keypoints_visible=np.arange(84).reshape(2, 21, 2) % 2,
+ >>> hand_type=np.array([[0, 1], [1, 0]]))
+ >>> results = self(results)
+ """
+
+ def __init__(self, num_keypoints: int,
+ left_hand_mapping: Union[List[Tuple[int, int]],
+ List[Tuple[Tuple, int]]],
+ right_hand_mapping: Union[List[Tuple[int, int]],
+ List[Tuple[Tuple, int]]]):
+ self.num_keypoints = num_keypoints
+ self.left_hand_converter = KeypointConverter(num_keypoints,
+ left_hand_mapping)
+ self.right_hand_converter = KeypointConverter(num_keypoints,
+ right_hand_mapping)
+
+ def transform(self, results: dict) -> dict:
+ """Transforms the keypoint results to match the target keypoints."""
+ assert 'hand_type' in results, (
+ 'hand_type should be provided in results')
+ hand_type = results['hand_type']
+
+        if np.sum(np.abs(hand_type - [[0, 1]])) <= 1e-6:
+            # left hand
+            results = self.left_hand_converter(results)
+        elif np.sum(np.abs(hand_type - [[1, 0]])) <= 1e-6:
+            # right hand
+            results = self.right_hand_converter(results)
+ else:
+ raise ValueError('hand_type should be left or right')
+
+ return results
+
+ def __repr__(self) -> str:
+ """print the basic information of the transform.
+
+ Returns:
+ str: Formatted string.
+ """
+ repr_str = self.__class__.__name__
+ repr_str += f'(num_keypoints={self.num_keypoints}, '\
+ f'left_hand_converter={self.left_hand_converter}, '\
+ f'right_hand_converter={self.right_hand_converter})'
+ return repr_str
diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py
index b1258f53cc..e4b499ad2b 100644
--- a/mmpose/models/heads/__init__.py
+++ b/mmpose/models/heads/__init__.py
@@ -1,6 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .base_head import BaseHead
-from .coord_cls_heads import RTMCCHead, SimCCHead
+from .coord_cls_heads import RTMCCHead, RTMWHead, SimCCHead
from .heatmap_heads import (AssociativeEmbeddingHead, CIDHead, CPMHead,
HeatmapHead, InternetHead, MSPNHead, ViPNASHead)
from .hybrid_heads import DEKRHead, VisPredictHead
@@ -16,5 +16,5 @@
'DSNTHead', 'AssociativeEmbeddingHead', 'DEKRHead', 'VisPredictHead',
'CIDHead', 'RTMCCHead', 'TemporalRegressionHead',
'TrajectoryRegressionHead', 'MotionRegressionHead', 'EDPoseHead',
- 'InternetHead'
+ 'InternetHead', 'RTMWHead'
]
diff --git a/mmpose/models/heads/coord_cls_heads/__init__.py b/mmpose/models/heads/coord_cls_heads/__init__.py
index 104ff91308..6a4e51c4d7 100644
--- a/mmpose/models/heads/coord_cls_heads/__init__.py
+++ b/mmpose/models/heads/coord_cls_heads/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .rtmcc_head import RTMCCHead
+from .rtmw_head import RTMWHead
from .simcc_head import SimCCHead
-__all__ = ['SimCCHead', 'RTMCCHead']
+__all__ = ['SimCCHead', 'RTMCCHead', 'RTMWHead']
diff --git a/mmpose/models/heads/coord_cls_heads/rtmw_head.py b/mmpose/models/heads/coord_cls_heads/rtmw_head.py
new file mode 100644
index 0000000000..7111f90446
--- /dev/null
+++ b/mmpose/models/heads/coord_cls_heads/rtmw_head.py
@@ -0,0 +1,337 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Optional, Sequence, Tuple, Union
+
+import torch
+from mmcv.cnn import ConvModule
+from mmengine.dist import get_dist_info
+from mmengine.structures import PixelData
+from torch import Tensor, nn
+
+from mmpose.codecs.utils import get_simcc_normalized
+from mmpose.evaluation.functional import simcc_pck_accuracy
+from mmpose.models.utils.rtmcc_block import RTMCCBlock, ScaleNorm
+from mmpose.models.utils.tta import flip_vectors
+from mmpose.registry import KEYPOINT_CODECS, MODELS
+from mmpose.utils.tensor_utils import to_numpy
+from mmpose.utils.typing import (ConfigType, InstanceList, OptConfigType,
+ OptSampleList)
+from ..base_head import BaseHead
+
+OptIntSeq = Optional[Sequence[int]]
+
+
+@MODELS.register_module()
+class RTMWHead(BaseHead):
+ """Top-down head introduced in RTMPose-Wholebody (2023).
+
+ Args:
+ in_channels (int | sequence[int]): Number of channels in the input
+ feature map.
+ out_channels (int): Number of channels in the output heatmap.
+ input_size (tuple): Size of input image in shape [w, h].
+ in_featuremap_size (int | sequence[int]): Size of input feature map.
+ simcc_split_ratio (float): Split ratio of pixels.
+ Default: 2.0.
+ final_layer_kernel_size (int): Kernel size of the convolutional layer.
+ Default: 1.
+ gau_cfg (Config): Config dict for the Gated Attention Unit.
+ Default: dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='ReLU',
+ use_rel_bias=False,
+ pos_enc=False).
+ loss (Config): Config of the keypoint loss. Defaults to use
+ :class:`KLDiscretLoss`
+ decoder (Config, optional): The decoder config that controls decoding
+ keypoint coordinates from the network output. Defaults to ``None``
+ init_cfg (Config, optional): Config to control the initialization. See
+ :attr:`default_init_cfg` for default settings
+ """
+
+ def __init__(
+ self,
+ in_channels: Union[int, Sequence[int]],
+ out_channels: int,
+ input_size: Tuple[int, int],
+ in_featuremap_size: Tuple[int, int],
+ simcc_split_ratio: float = 2.0,
+ final_layer_kernel_size: int = 1,
+ gau_cfg: ConfigType = dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='ReLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss: ConfigType = dict(type='KLDiscretLoss', use_target_weight=True),
+ decoder: OptConfigType = None,
+ init_cfg: OptConfigType = None,
+ ):
+
+ if init_cfg is None:
+ init_cfg = self.default_init_cfg
+
+ super().__init__(init_cfg)
+
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.input_size = input_size
+ self.in_featuremap_size = in_featuremap_size
+ self.simcc_split_ratio = simcc_split_ratio
+
+ self.loss_module = MODELS.build(loss)
+ if decoder is not None:
+ self.decoder = KEYPOINT_CODECS.build(decoder)
+ else:
+ self.decoder = None
+
+ if isinstance(in_channels, (tuple, list)):
+ raise ValueError(
+ f'{self.__class__.__name__} does not support selecting '
+ 'multiple input features.')
+
+ # Define SimCC layers
+ flatten_dims = self.in_featuremap_size[0] * self.in_featuremap_size[1]
+
+ ps = 2
+ self.ps = nn.PixelShuffle(ps)
+ self.conv_dec = ConvModule(
+ in_channels // ps**2,
+ in_channels // 4,
+ kernel_size=final_layer_kernel_size,
+ stride=1,
+ padding=final_layer_kernel_size // 2,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ act_cfg=dict(type='ReLU'))
+
+ self.final_layer = ConvModule(
+ in_channels,
+ out_channels,
+ kernel_size=final_layer_kernel_size,
+ stride=1,
+ padding=final_layer_kernel_size // 2,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ act_cfg=dict(type='ReLU'))
+ self.final_layer2 = ConvModule(
+ in_channels // ps + in_channels // 4,
+ out_channels,
+ kernel_size=final_layer_kernel_size,
+ stride=1,
+ padding=final_layer_kernel_size // 2,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ act_cfg=dict(type='ReLU'))
+
+ self.mlp = nn.Sequential(
+ ScaleNorm(flatten_dims),
+ nn.Linear(flatten_dims, gau_cfg['hidden_dims'] // 2, bias=False))
+
+ self.mlp2 = nn.Sequential(
+ ScaleNorm(flatten_dims * ps**2),
+ nn.Linear(
+ flatten_dims * ps**2, gau_cfg['hidden_dims'] // 2, bias=False))
+
+ W = int(self.input_size[0] * self.simcc_split_ratio)
+ H = int(self.input_size[1] * self.simcc_split_ratio)
+
+ self.gau = RTMCCBlock(
+ self.out_channels,
+ gau_cfg['hidden_dims'],
+ gau_cfg['hidden_dims'],
+ s=gau_cfg['s'],
+ expansion_factor=gau_cfg['expansion_factor'],
+ dropout_rate=gau_cfg['dropout_rate'],
+ drop_path=gau_cfg['drop_path'],
+ attn_type='self-attn',
+ act_fn=gau_cfg['act_fn'],
+ use_rel_bias=gau_cfg['use_rel_bias'],
+ pos_enc=gau_cfg['pos_enc'])
+
+ self.cls_x = nn.Linear(gau_cfg['hidden_dims'], W, bias=False)
+ self.cls_y = nn.Linear(gau_cfg['hidden_dims'], H, bias=False)
+
+ def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor, Tensor]:
+ """Forward the network.
+
+ The input is the featuremap extracted by backbone and the
+ output is the simcc representation.
+
+ Args:
+ feats (Tuple[Tensor]): Multi scale feature maps.
+
+ Returns:
+ pred_x (Tensor): 1d representation of x.
+ pred_y (Tensor): 1d representation of y.
+ """
+        # enc_b: lower pyramid level, shape (B, in_channels // 2, 2h, 2w)
+        # enc_t: top pyramid level, shape (B, in_channels, h, w)
+ enc_b, enc_t = feats
+
+ feats_t = self.final_layer(enc_t)
+ feats_t = torch.flatten(feats_t, 2)
+ feats_t = self.mlp(feats_t)
+
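+        # upsample the top-level feature with PixelShuffle and fuse it with
+        # the lower-level branch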
+ dec_t = self.ps(enc_t)
+ dec_t = self.conv_dec(dec_t)
+ enc_b = torch.cat([dec_t, enc_b], dim=1)
+
+ feats_b = self.final_layer2(enc_b)
+ feats_b = torch.flatten(feats_b, 2)
+ feats_b = self.mlp2(feats_b)
+
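+        # concatenate the two branches along the channel axis to form
+        # (B, K, hidden_dims) tokens for the GAU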
+ feats = torch.cat([feats_t, feats_b], dim=2)
+
+ feats = self.gau(feats)
+
+ pred_x = self.cls_x(feats)
+ pred_y = self.cls_y(feats)
+
+ return pred_x, pred_y
+
+ def predict(
+ self,
+ feats: Tuple[Tensor],
+ batch_data_samples: OptSampleList,
+ test_cfg: OptConfigType = {},
+ ) -> InstanceList:
+ """Predict results from features.
+
+ Args:
+ feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage
+ features (or multiple multi-stage features in TTA)
+ batch_data_samples (List[:obj:`PoseDataSample`]): The batch
+ data samples
+ test_cfg (dict): The runtime config for testing process. Defaults
+ to {}
+
+ Returns:
+ List[InstanceData]: The pose predictions, each contains
+ the following fields:
+ - keypoints (np.ndarray): predicted keypoint coordinates in
+ shape (num_instances, K, D) where K is the keypoint number
+ and D is the keypoint dimension
+ - keypoint_scores (np.ndarray): predicted keypoint scores in
+ shape (num_instances, K)
+ - keypoint_x_labels (np.ndarray, optional): The predicted 1-D
+ intensity distribution in the x direction
+ - keypoint_y_labels (np.ndarray, optional): The predicted 1-D
+ intensity distribution in the y direction
+ """
+
+ if test_cfg.get('flip_test', False):
+ # TTA: flip test -> feats = [orig, flipped]
+ assert isinstance(feats, list) and len(feats) == 2
+ flip_indices = batch_data_samples[0].metainfo['flip_indices']
+ _feats, _feats_flip = feats
+
+ _batch_pred_x, _batch_pred_y = self.forward(_feats)
+
+ _batch_pred_x_flip, _batch_pred_y_flip = self.forward(_feats_flip)
+ _batch_pred_x_flip, _batch_pred_y_flip = flip_vectors(
+ _batch_pred_x_flip,
+ _batch_pred_y_flip,
+ flip_indices=flip_indices)
+
+ batch_pred_x = (_batch_pred_x + _batch_pred_x_flip) * 0.5
+ batch_pred_y = (_batch_pred_y + _batch_pred_y_flip) * 0.5
+ else:
+ batch_pred_x, batch_pred_y = self.forward(feats)
+
+ preds = self.decode((batch_pred_x, batch_pred_y))
+
+ if test_cfg.get('output_heatmaps', False):
+ rank, _ = get_dist_info()
+ if rank == 0:
+ warnings.warn('The predicted simcc values are normalized for '
+ 'visualization. This may cause discrepancy '
+ 'between the keypoint scores and the 1D heatmaps'
+ '.')
+
+ # normalize the predicted 1d distribution
+ batch_pred_x = get_simcc_normalized(batch_pred_x)
+ batch_pred_y = get_simcc_normalized(batch_pred_y)
+
+ B, K, _ = batch_pred_x.shape
+            # B, K, Wx -> B, K, 1, Wx
+            x = batch_pred_x.reshape(B, K, 1, -1)
+            # B, K, Wy -> B, K, Wy, 1
+            y = batch_pred_y.reshape(B, K, -1, 1)
+            # B, K, Wy, Wx
+            batch_heatmaps = torch.matmul(y, x)
+ pred_fields = [
+ PixelData(heatmaps=hm) for hm in batch_heatmaps.detach()
+ ]
+
+ for pred_instances, pred_x, pred_y in zip(preds,
+ to_numpy(batch_pred_x),
+ to_numpy(batch_pred_y)):
+
+ pred_instances.keypoint_x_labels = pred_x[None]
+ pred_instances.keypoint_y_labels = pred_y[None]
+
+ return preds, pred_fields
+ else:
+ return preds
+
+ def loss(
+ self,
+ feats: Tuple[Tensor],
+ batch_data_samples: OptSampleList,
+ train_cfg: OptConfigType = {},
+ ) -> dict:
+ """Calculate losses from a batch of inputs and data samples."""
+
+ pred_x, pred_y = self.forward(feats)
+
+ gt_x = torch.cat([
+ d.gt_instance_labels.keypoint_x_labels for d in batch_data_samples
+ ],
+ dim=0)
+ gt_y = torch.cat([
+ d.gt_instance_labels.keypoint_y_labels for d in batch_data_samples
+ ],
+ dim=0)
+ keypoint_weights = torch.cat(
+ [
+ d.gt_instance_labels.keypoint_weights
+ for d in batch_data_samples
+ ],
+ dim=0,
+ )
+
+ pred_simcc = (pred_x, pred_y)
+ gt_simcc = (gt_x, gt_y)
+
+ # calculate losses
+ losses = dict()
+ loss = self.loss_module(pred_simcc, gt_simcc, keypoint_weights)
+
+ losses.update(loss_kpt=loss)
+
+ # calculate accuracy
+ _, avg_acc, _ = simcc_pck_accuracy(
+ output=to_numpy(pred_simcc),
+ target=to_numpy(gt_simcc),
+ simcc_split_ratio=self.simcc_split_ratio,
+ mask=to_numpy(keypoint_weights) > 0,
+ )
+
+ acc_pose = torch.tensor(avg_acc, device=gt_x.device)
+ losses.update(acc_pose=acc_pose)
+
+ return losses
+
+ @property
+ def default_init_cfg(self):
+ init_cfg = [
+ dict(type='Normal', layer=['Conv2d'], std=0.001),
+ dict(type='Constant', layer='BatchNorm2d', val=1),
+ dict(type='Normal', layer=['Linear'], std=0.01, bias=0),
+ ]
+ return init_cfg
diff --git a/mmpose/models/necks/__init__.py b/mmpose/models/necks/__init__.py
index c9d14cefc8..d4b4f51308 100644
--- a/mmpose/models/necks/__init__.py
+++ b/mmpose/models/necks/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .channel_mapper import ChannelMapper
+from .cspnext_pafpn import CSPNeXtPAFPN
from .fmap_proc_neck import FeatureMapProcessor
from .fpn import FPN
from .gap_neck import GlobalAveragePooling
@@ -8,5 +9,5 @@
__all__ = [
'GlobalAveragePooling', 'PoseWarperNeck', 'FPN', 'FeatureMapProcessor',
- 'ChannelMapper', 'YOLOXPAFPN'
+ 'ChannelMapper', 'YOLOXPAFPN', 'CSPNeXtPAFPN'
]
diff --git a/mmpose/models/necks/cspnext_pafpn.py b/mmpose/models/necks/cspnext_pafpn.py
new file mode 100644
index 0000000000..35f4dc2f10
--- /dev/null
+++ b/mmpose/models/necks/cspnext_pafpn.py
@@ -0,0 +1,187 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmpose.registry import MODELS
+from mmpose.utils.typing import ConfigType, OptConfigType, OptMultiConfig
+from ..utils import CSPLayer
+
+
+@MODELS.register_module()
+class CSPNeXtPAFPN(BaseModule):
+ """Path Aggregation Network with CSPNeXt blocks. Modified from RTMDet.
+
+ Args:
+ in_channels (Sequence[int]): Number of input channels per scale.
+ out_channels (int): Number of output channels (used at each scale)
+ out_indices (Sequence[int]): Output from which stages.
+ num_csp_blocks (int): Number of bottlenecks in CSPLayer.
+ Defaults to 3.
+ use_depthwise (bool): Whether to use depthwise separable convolution in
+ blocks. Defaults to False.
+ expand_ratio (float): Ratio to adjust the number of channels of the
+ hidden layer. Default: 0.5
+ upsample_cfg (dict): Config dict for interpolate layer.
+ Default: `dict(scale_factor=2, mode='nearest')`
+ conv_cfg (dict, optional): Config dict for convolution layer.
+ Default: None, which means using conv2d.
+ norm_cfg (dict): Config dict for normalization layer.
+ Default: dict(type='BN')
+ act_cfg (dict): Config dict for activation layer.
+ Default: dict(type='Swish')
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None.
+ """
+
+ def __init__(
+ self,
+ in_channels: Sequence[int],
+ out_channels: int,
+ out_indices=(
+ 0,
+ 1,
+ 2,
+ ),
+ num_csp_blocks: int = 3,
+ use_depthwise: bool = False,
+ expand_ratio: float = 0.5,
+ upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'),
+        conv_cfg: OptConfigType = None,
+ norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg: ConfigType = dict(type='Swish'),
+ init_cfg: OptMultiConfig = dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=math.sqrt(5),
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu')
+ ) -> None:
+ super().__init__(init_cfg)
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.out_indices = out_indices
+
+ conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+
+ # build top-down blocks
+ self.upsample = nn.Upsample(**upsample_cfg)
+ self.reduce_layers = nn.ModuleList()
+ self.top_down_blocks = nn.ModuleList()
+ for idx in range(len(in_channels) - 1, 0, -1):
+ self.reduce_layers.append(
+ ConvModule(
+ in_channels[idx],
+ in_channels[idx - 1],
+ 1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+ self.top_down_blocks.append(
+ CSPLayer(
+ in_channels[idx - 1] * 2,
+ in_channels[idx - 1],
+ num_blocks=num_csp_blocks,
+ add_identity=False,
+ use_depthwise=use_depthwise,
+ use_cspnext_block=True,
+ expand_ratio=expand_ratio,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+
+ # build bottom-up blocks
+ self.downsamples = nn.ModuleList()
+ self.bottom_up_blocks = nn.ModuleList()
+ for idx in range(len(in_channels) - 1):
+ self.downsamples.append(
+ conv(
+ in_channels[idx],
+ in_channels[idx],
+ 3,
+ stride=2,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+ self.bottom_up_blocks.append(
+ CSPLayer(
+ in_channels[idx] * 2,
+ in_channels[idx + 1],
+ num_blocks=num_csp_blocks,
+ add_identity=False,
+ use_depthwise=use_depthwise,
+ use_cspnext_block=True,
+ expand_ratio=expand_ratio,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+
+ if self.out_channels is not None:
+ self.out_convs = nn.ModuleList()
+ for i in range(len(in_channels)):
+ self.out_convs.append(
+ conv(
+ in_channels[i],
+ out_channels,
+ 3,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+
+ def forward(self, inputs: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]:
+ """
+ Args:
+ inputs (tuple[Tensor]): input features.
+
+ Returns:
+            tuple[Tensor]: CSPNeXtPAFPN features.
+ """
+ assert len(inputs) == len(self.in_channels)
+
+ # top-down path
+ inner_outs = [inputs[-1]]
+ for idx in range(len(self.in_channels) - 1, 0, -1):
+ feat_high = inner_outs[0]
+ feat_low = inputs[idx - 1]
+ feat_high = self.reduce_layers[len(self.in_channels) - 1 - idx](
+ feat_high)
+ inner_outs[0] = feat_high
+
+ upsample_feat = self.upsample(feat_high)
+
+ inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
+ torch.cat([upsample_feat, feat_low], 1))
+ inner_outs.insert(0, inner_out)
+
+ # bottom-up path
+ outs = [inner_outs[0]]
+ for idx in range(len(self.in_channels) - 1):
+ feat_low = outs[-1]
+ feat_high = inner_outs[idx + 1]
+ downsample_feat = self.downsamples[idx](feat_low)
+ out = self.bottom_up_blocks[idx](
+ torch.cat([downsample_feat, feat_high], 1))
+ outs.append(out)
+
+ if self.out_channels is not None:
+ # out convs
+ for idx, conv in enumerate(self.out_convs):
+ outs[idx] = conv(outs[idx])
+
+ return tuple([outs[i] for i in self.out_indices])
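
As a quick sanity check of the data flow above (top-down fusion, then bottom-up fusion, then optional output convs), here is a minimal sketch, assuming an environment where mmpose and its `CSPNeXtPAFPN` neck are importable as in the config imports earlier, that pushes dummy CSPNeXt P3/P4/P5 features for a 384x288 crop through the neck with the RTMW-x settings. With `out_channels=None` the output convs are skipped and `out_indices=(1, 2)` selects the stride-16 and stride-32 levels of the bottom-up path.

import torch
from mmpose.models import CSPNeXtPAFPN

# RTMW-x neck settings from the configs in this PR; out_channels=None skips
# the output convs, so each returned level keeps its own channel count.
neck = CSPNeXtPAFPN(
    in_channels=[320, 640, 1280],
    out_channels=None,
    out_indices=(1, 2),
    num_csp_blocks=2,
    act_cfg=dict(type='SiLU'))
neck.eval()

# dummy P3/P4/P5 features of a 384 (h) x 288 (w) crop at strides 8/16/32
feats = (
    torch.randn(1, 320, 48, 36),
    torch.randn(1, 640, 24, 18),
    torch.randn(1, 1280, 12, 9),
)
with torch.no_grad():
    outs = neck(feats)
print([tuple(o.shape) for o in outs])
# -> [(1, 640, 24, 18), (1, 1280, 12, 9)]
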
diff --git a/projects/rtmpose/README.md b/projects/rtmpose/README.md
index 27a8a90144..e2412c0014 100644
--- a/projects/rtmpose/README.md
+++ b/projects/rtmpose/README.md
@@ -44,6 +44,9 @@ ______________________________________________________________________
## 🥳 🚀 What's New [🔝](#-table-of-contents)
+- Sep. 2023:
+ - Add RTMW models trained on combined datasets. The alpha version of the RTMW-x model achieves 70.2 mAP on the COCO-Wholebody val set. The technical report will be released soon.
+ - Add YOLOX and RTMDet models trained on the HumanArt dataset.
- Aug. 2023:
- Support distilled 133-keypoint WholeBody models powered by [DWPose](https://github.com/IDEA-Research/DWPose/tree/main).
- You can try DWPose/RTMPose with [sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) now! Just update your sd-webui-controlnet >= v1.1237, then choose `dw_openpose_full` as preprocessor.
@@ -219,15 +222,6 @@ Feel free to join our community group for more help:
- RTMPose for Human-Centric Artificial Scenes is supported by [Human-Art](https://github.com/IDEA-Research/HumanArt)
-
-Pose Estimators:
-
-| Config | Input Size | AP<br>(Human-Art GT) | Params<br>(M) | FLOPS<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | ncnn-FP16-Latency<br>(ms)<br>(Snapdragon 865) | Download |
-| :-----------------------------------------------------------------------------: | :--------: | :-----------------------: | :----------------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| [RTMPose-t\*](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 65.5 | 3.34 | 0.36 | 3.20 | 1.06 | 9.02 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.zip) |
-| [RTMPose-s\*](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 69.8 | 5.47 | 0.68 | 4.48 | 1.39 | 13.89 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.zip) |
-| [RTMPose-m\*](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 72.8 | 13.59 | 1.93 | 11.06 | 2.29 | 26.44 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.zip) |
-| [RTMPose-l\*](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 75.3 | 27.66 | 4.16 | 18.85 | 3.46 | 45.37 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.zip) |
-
Detectors:
| Detection Config | Input Size | Model AP<br>(OneHand10K) | Flops<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | Download |
@@ -241,6 +235,15 @@ Detetors:
| [YOLOX-l](./yolox/humanart/yolox_l_8xb8-300e_humanart.py) | 640x640 | 60.2 | - | - | - | [Det Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/yolox_l_8xb8-300e_humanart-ce1d7a62.pth) |
| [YOLOX-x](./yolox/humanart/yolox_x_8xb8-300e_humanart.py) | 640x640 | 61.3 | - | - | - | [Det Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/yolox_x_8xb8-300e_humanart-a39d44ed.pth) |
+Pose Estimators:
+
+| Config | Input Size | AP<br>(Human-Art GT) | Params<br>(M) | FLOPS<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | ncnn-FP16-Latency<br>(ms)<br>(Snapdragon 865) | Download |
+| :-----------------------------------------------------------------------------: | :--------: | :-----------------------: | :----------------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [RTMPose-t\*](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 65.5 | 3.34 | 0.36 | 3.20 | 1.06 | 9.02 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.zip) |
+| [RTMPose-s\*](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 69.8 | 5.47 | 0.68 | 4.48 | 1.39 | 13.89 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.zip) |
+| [RTMPose-m\*](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 72.8 | 13.59 | 1.93 | 11.06 | 2.29 | 26.44 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.zip) |
+| [RTMPose-l\*](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 75.3 | 27.66 | 4.16 | 18.85 | 3.46 | 45.37 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.zip) |
+
#### 26 Keypoints
@@ -276,7 +279,7 @@ For more details, please refer to [GroupFisher Pruning for RTMPose](./rtmpose/pr
- Keypoints are defined as [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/). For details please refer to the [meta info](/configs/_base_/datasets/coco_wholebody.py).
-
-
+
COCO-WholeBody
| Config | Input Size | Whole AP | Whole AR | FLOPS<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | Download |
@@ -289,7 +292,32 @@ For more details, please refer to [GroupFisher Pruning for RTMPose](./rtmpose/pr
-DWPose
+Cocktail13
+
+- `Cocktail13` denotes that the model is trained on 13 public datasets:
+ - [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic)
+ - [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose)
+ - [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii)
+ - [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset)
+ - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe)
+ - [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18)
+ - [COCO-Wholebody](https://github.com/jin-s13/COCO-WholeBody/)
+ - [UBody](https://github.com/IDEA-Research/OSX)
+ - [Human-Art](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#human-art-dataset)
+ - [WFLW](https://wywu.github.io/projects/LAB/WFLW.html)
+ - [300W](https://ibug.doc.ic.ac.uk/resources/300-W/)
+ - [COFW](http://www.vision.caltech.edu/xpburgos/ICCV13/)
+ - [LaPa](https://github.com/JDAI-CV/lapa-dataset)
+
+| Config | Input Size | Whole AP | Whole AR | FLOPS<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | Download |
+| :------------------------------ | :--------: | :------: | :------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-------------------------------: |
+| [RTMW-x<br>(alpha version)](./rtmpose/wholebody_2d_keypoint/rtmw-x_8xb704-270e_cocktail13-256x192.py) | 256x192 | 67.2 | 75.4 | 13.1 | - | - | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail13_pt-ucoco_270e-256x192-fbef0d61_20230925.pth) |
+| [RTMW-x<br>(alpha version)](./rtmpose/wholebody_2d_keypoint/rtmw-x_8xb320-270e_cocktail13-384x288.py) | 384x288 | 70.2 | 77.9 | 29.3 | - | - | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail13_pt-ucoco_270e-384x288-0949e3a9_20230925.pth) |
+
+
+
+
+COCO+UBody
- DWPose Models are supported by [DWPose](https://github.com/IDEA-Research/DWPose)
- Models are trained and distilled on:
diff --git a/projects/rtmpose/README_CN.md b/projects/rtmpose/README_CN.md
index a08c74283f..7930d34797 100644
--- a/projects/rtmpose/README_CN.md
+++ b/projects/rtmpose/README_CN.md
@@ -40,6 +40,9 @@ ______________________________________________________________________
## 🥳 最新进展 [🔝](#-table-of-contents)
+- 2023 年 9 月:
+ - 发布混合数据集上训练的 RTMW 模型。Alpha 版本的 RTMW-x 在 COCO-Wholebody 验证集上取得了 70.2 mAP。技术报告正在撰写中。
+ - 增加 HumanArt 上训练的 YOLOX 和 RTMDet 模型。
- 2023 年 8 月:
- 支持基于 RTMPose 模型蒸馏的 133 点 WholeBody 模型(由 [DWPose](https://github.com/IDEA-Research/DWPose/tree/main) 提供)。
- 你可以在 [sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) 中使用 DWPose/RTMPose 作为姿态估计后端进行人物图像生成。升级 sd-webui-controlnet >= v1.1237 并选择 `dw_openpose_full` 即可使用。
@@ -210,15 +213,6 @@ RTMPose 是一个长期优化迭代的项目,致力于业务场景下的高性
- 面向艺术图片的人体姿态估计 RTMPose 模型由 [Human-Art](https://github.com/IDEA-Research/HumanArt) 提供。
-
-人体姿态估计模型:
-
-| Config | Input Size | AP<br>(Human-Art GT) | Params<br>(M) | FLOPS<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | ncnn-FP16-Latency<br>(ms)<br>(Snapdragon 865) | Download |
-| :-----------------------------------------------------------------------------: | :--------: | :-----------------------: | :----------------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| [RTMPose-t\*](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 65.5 | 3.34 | 0.36 | 3.20 | 1.06 | 9.02 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.zip) |
-| [RTMPose-s\*](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 69.8 | 5.47 | 0.68 | 4.48 | 1.39 | 13.89 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.zip) |
-| [RTMPose-m\*](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 72.8 | 13.59 | 1.93 | 11.06 | 2.29 | 26.44 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.zip) |
-| [RTMPose-l\*](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 75.3 | 27.66 | 4.16 | 18.85 | 3.46 | 45.37 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.zip) |
-
人体检测模型:
| Detection Config | Input Size | Model AP<br>(OneHand10K) | Flops<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | Download |
@@ -232,6 +226,15 @@ RTMPose 是一个长期优化迭代的项目,致力于业务场景下的高性
| [YOLOX-l](./yolox/humanart/yolox_l_8xb8-300e_humanart.py) | 640x640 | 60.2 | - | - | - | [Det Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/yolox_l_8xb8-300e_humanart-ce1d7a62.pth) |
| [YOLOX-x](./yolox/humanart/yolox_x_8xb8-300e_humanart.py) | 640x640 | 61.3 | - | - | - | [Det Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/yolox_x_8xb8-300e_humanart-a39d44ed.pth) |
+人体姿态估计模型:
+
+| Config | Input Size | AP<br>(Human-Art GT) | Params<br>(M) | FLOPS<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | ncnn-FP16-Latency<br>(ms)<br>(Snapdragon 865) | Download |
+| :-----------------------------------------------------------------------------: | :--------: | :-----------------------: | :----------------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [RTMPose-t\*](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 65.5 | 3.34 | 0.36 | 3.20 | 1.06 | 9.02 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.zip) |
+| [RTMPose-s\*](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 69.8 | 5.47 | 0.68 | 4.48 | 1.39 | 13.89 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.zip) |
+| [RTMPose-m\*](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 72.8 | 13.59 | 1.93 | 11.06 | 2.29 | 26.44 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.zip) |
+| [RTMPose-l\*](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 75.3 | 27.66 | 4.16 | 18.85 | 3.46 | 45.37 | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth)<br>[onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/onnx_sdk/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.zip) |
+
#### 26 Keypoints
@@ -267,7 +270,7 @@ RTMPose 是一个长期优化迭代的项目,致力于业务场景下的高性
- 关键点骨架定义遵循 [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/),详情见 [meta info](/configs/_base_/datasets/coco_wholebody.py)。
-
-
+
COCO-WholeBody
| Config | Input Size | Whole AP | Whole AR | FLOPS<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | Download |
@@ -280,7 +283,32 @@ RTMPose 是一个长期优化迭代的项目,致力于业务场景下的高性
-DWPose
+Cocktail13
+
+- `Cocktail13` 代表模型在 13 个开源数据集上训练得到:
+ - [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic)
+ - [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose)
+ - [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii)
+ - [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset)
+ - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe)
+ - [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18)
+ - [COCO-Wholebody](https://github.com/jin-s13/COCO-WholeBody/)
+ - [UBody](https://github.com/IDEA-Research/OSX)
+ - [Human-Art](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#human-art-dataset)
+ - [WFLW](https://wywu.github.io/projects/LAB/WFLW.html)
+ - [300W](https://ibug.doc.ic.ac.uk/resources/300-W/)
+ - [COFW](http://www.vision.caltech.edu/xpburgos/ICCV13/)
+ - [LaPa](https://github.com/JDAI-CV/lapa-dataset)
+
+| Config | Input Size | Whole AP | Whole AR | FLOPS<br>(G) | ORT-Latency<br>(ms)<br>(i7-11700) | TRT-FP16-Latency<br>(ms)<br>(GTX 1660Ti) | Download |
+| :------------------------------ | :--------: | :------: | :------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-------------------------------: |
+| [RTMW-x<br>(alpha version)](./rtmpose/wholebody_2d_keypoint/rtmw-x_8xb704-270e_cocktail13-256x192.py) | 256x192 | 67.2 | 75.4 | 13.1 | - | - | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail13_pt-ucoco_270e-256x192-fbef0d61_20230925.pth) |
+| [RTMW-x<br>(alpha version)](./rtmpose/wholebody_2d_keypoint/rtmw-x_8xb320-270e_cocktail13-384x288.py) | 384x288 | 70.2 | 77.9 | 29.3 | - | - | [pth](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail13_pt-ucoco_270e-384x288-0949e3a9_20230925.pth) |
+
+
+
+
+COCO+UBody
- DWPose 模型由 [DWPose](https://github.com/IDEA-Research/DWPose) 项目提供
- 模型在以下数据集上训练并蒸馏:
diff --git a/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmw-x_8xb320-270e_cocktail13-384x288.py b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmw-x_8xb320-270e_cocktail13-384x288.py
new file mode 100644
index 0000000000..03f5fd3a04
--- /dev/null
+++ b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmw-x_8xb320-270e_cocktail13-384x288.py
@@ -0,0 +1,586 @@
+_base_ = ['mmpose::_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 133
+input_size = (288, 384)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 10
+base_lr = 5e-4
+train_batch_size = 320
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=2560)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ channel_attention=True,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa
+ )),
+ neck=dict(
+ type='CSPNeXtPAFPN',
+ in_channels=[320, 640, 1280],
+ out_channels=None,
+ out_indices=(
+ 1,
+ 2,
+ ),
+ num_csp_blocks=2,
+ expand_ratio=0.5,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU', inplace=True)),
+ head=dict(
+ type='RTMWHead',
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+
+aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
+
+crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)]
+
+mpii_coco133 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco133 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco133 = [(i, i)
+ for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21),
+ (24, 19),
+ (25, 22)] + [(i, i - 3)
+ for i in range(26, 136)]
+
+posetrack_coco133 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120),
+ (19, 17), (20, 20)]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_coco133)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_coco133)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_coco133)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_coco133)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_coco133)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_coco133)
+ ],
+)
+
+dataset_humanart = dict(
+ type='HumanArt21Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart.json',
+ filter_cfg=dict(scenes=['real_human']),
+ data_prefix=dict(img='pose/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=humanart_coco133)
+ ])
+
+ubody_scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+ubody_datasets = []
+for scene in ubody_scenes:
+ each = dict(
+ type='UBody2dDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'Ubody/annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='pose/UBody/images/'),
+ pipeline=[],
+ sample_interval=10)
+ ubody_datasets.append(each)
+
+dataset_ubody = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'),
+ datasets=ubody_datasets,
+ pipeline=[],
+ test_mode=False,
+)
+
+face_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.3, 0.5],
+ rotate_factor=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+]
+
+wflw_coco133 = [(i * 2, 20 + i)
+ for i in range(17)] + [(33 + i, 41 + i) for i in range(5)] + [
+ (42 + i, 46 + i) for i in range(5)
+ ] + [(51 + i, 50 + i)
+ for i in range(9)] + [(60, 59), (61, 60), (63, 61),
+ (64, 62), (65, 63), (67, 64),
+ (68, 65), (69, 66), (71, 67),
+ (72, 68), (73, 69),
+ (75, 70)] + [(76 + i, 71 + i)
+ for i in range(20)]
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=wflw_coco133), *face_pipeline
+ ],
+)
+
+mapping_300w_coco133 = [(i, 20 + i) for i in range(68)]
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mapping_300w_coco133), *face_pipeline
+ ],
+)
+
+cofw_coco133 = [(0, 41), (2, 45), (4, 43), (1, 47), (3, 49), (6, 45), (8, 59),
+ (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53),
+ (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89),
+ (27, 80), (28, 28)]
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=cofw_coco133), *face_pipeline
+ ],
+)
+
+lapa_coco133 = [(i * 2, 20 + i) for i in range(17)] + [
+ (33 + i, 41 + i) for i in range(5)
+] + [(42 + i, 46 + i) for i in range(5)] + [
+ (51 + i, 50 + i) for i in range(4)
+] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61),
+ (70, 62), (71, 63), (73, 64),
+ (75, 65), (76, 66), (78, 67),
+ (79, 68), (80, 69),
+ (82, 70)] + [(84 + i, 71 + i)
+ for i in range(20)]
+dataset_lapa = dict(
+ type='LapaDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=lapa_coco133), *face_pipeline
+ ],
+)
+
+dataset_wb = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_coco, dataset_halpe, dataset_ubody],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_body = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_posetrack,
+ dataset_humanart,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_face = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_wflw,
+ dataset_300w,
+ dataset_cofw,
+ dataset_lapa,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+train_datasets = [
+ dataset_wb,
+ dataset_body,
+ dataset_face,
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/detection/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
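
The `*_coco133` lists defined in the config above (and repeated in the 256x192 config that follows) are `(source_index, target_index)` pairs consumed by `KeypointConverter`: keypoints from each source dataset are scattered into the 133-keypoint COCO-WholeBody layout, and target joints with no source counterpart stay invisible. The snippet below only illustrates that bookkeeping; it is not the `KeypointConverter` implementation, and `remap_keypoints` is a hypothetical helper name.

import numpy as np

def remap_keypoints(src_kpts, src_vis, mapping, num_keypoints=133):
    """Scatter (K_src, 2) source keypoints into a (num_keypoints, 2) layout."""
    dst_kpts = np.zeros((num_keypoints, 2), dtype=np.float32)
    dst_vis = np.zeros(num_keypoints, dtype=np.float32)
    for src_idx, dst_idx in mapping:
        dst_kpts[dst_idx] = src_kpts[src_idx]
        dst_vis[dst_idx] = src_vis[src_idx]
    return dst_kpts, dst_vis

# AI Challenger annotates 14 joints; only the 12 limb joints have a
# COCO-WholeBody counterpart, so the remaining two are simply dropped.
aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
               (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
kpts, vis = remap_keypoints(np.random.rand(14, 2), np.ones(14), aic_coco133)
print(kpts.shape, int(vis.sum()))  # (133, 2) 12
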
diff --git a/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmw-x_8xb704-270e_cocktail13-256x192.py b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmw-x_8xb704-270e_cocktail13-256x192.py
new file mode 100644
index 0000000000..369b85b1ce
--- /dev/null
+++ b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmw-x_8xb704-270e_cocktail13-256x192.py
@@ -0,0 +1,586 @@
+_base_ = ['mmpose::_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 133
+input_size = (192, 256)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 10
+base_lr = 5e-4
+train_batch_size = 704
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=5632)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ channel_attention=True,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-256x192-05f5bcb7_20230822.pth' # noqa
+ )),
+ neck=dict(
+ type='CSPNeXtPAFPN',
+ in_channels=[320, 640, 1280],
+ out_channels=None,
+ out_indices=(
+ 1,
+ 2,
+ ),
+ num_csp_blocks=2,
+ expand_ratio=0.5,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU', inplace=True)),
+ head=dict(
+ type='RTMWHead',
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+
+aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
+
+crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)]
+
+mpii_coco133 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco133 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco133 = [(i, i)
+ for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21),
+ (24, 19),
+ (25, 22)] + [(i, i - 3)
+ for i in range(26, 136)]
+
+posetrack_coco133 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120),
+ (19, 17), (20, 20)]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_coco133)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_coco133)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_coco133)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_coco133)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_coco133)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_coco133)
+ ],
+)
+
+dataset_humanart = dict(
+ type='HumanArt21Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart.json',
+ filter_cfg=dict(scenes=['real_human']),
+ data_prefix=dict(img='pose/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=humanart_coco133)
+ ])
+
+ubody_scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+ubody_datasets = []
+for scene in ubody_scenes:
+ each = dict(
+ type='UBody2dDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'Ubody/annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='pose/UBody/images/'),
+ pipeline=[],
+ sample_interval=10)
+ ubody_datasets.append(each)
+
+dataset_ubody = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'),
+ datasets=ubody_datasets,
+ pipeline=[],
+ test_mode=False,
+)
+
+face_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.3, 0.5],
+ rotate_factor=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+]
+
+wflw_coco133 = [(i * 2, 20 + i)
+ for i in range(17)] + [(33 + i, 41 + i) for i in range(5)] + [
+ (42 + i, 46 + i) for i in range(5)
+ ] + [(51 + i, 50 + i)
+ for i in range(9)] + [(60, 59), (61, 60), (63, 61),
+ (64, 62), (65, 63), (67, 64),
+ (68, 65), (69, 66), (71, 67),
+ (72, 68), (73, 69),
+ (75, 70)] + [(76 + i, 71 + i)
+ for i in range(20)]
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=wflw_coco133), *face_pipeline
+ ],
+)
+
+mapping_300w_coco133 = [(i, 20 + i) for i in range(68)]
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mapping_300w_coco133), *face_pipeline
+ ],
+)
+
+cofw_coco133 = [(0, 41), (2, 45), (4, 43), (1, 47), (3, 49), (6, 45), (8, 59),
+ (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53),
+ (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89),
+ (27, 80), (28, 28)]
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=cofw_coco133), *face_pipeline
+ ],
+)
+
+lapa_coco133 = [(i * 2, 20 + i) for i in range(17)] + [
+ (33 + i, 41 + i) for i in range(5)
+] + [(42 + i, 46 + i) for i in range(5)] + [
+ (51 + i, 50 + i) for i in range(4)
+] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61),
+ (70, 62), (71, 63), (73, 64),
+ (75, 65), (76, 66), (78, 67),
+ (79, 68), (80, 69),
+ (82, 70)] + [(84 + i, 71 + i)
+ for i in range(20)]
+dataset_lapa = dict(
+ type='LapaDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=lapa_coco133), *face_pipeline
+ ],
+)
+
+dataset_wb = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_coco, dataset_halpe, dataset_ubody],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_body = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_posetrack,
+ dataset_humanart,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_face = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_wflw,
+ dataset_300w,
+ dataset_cofw,
+ dataset_lapa,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+train_datasets = [
+ dataset_wb,
+ dataset_body,
+ dataset_face,
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/detection/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
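
Several numbers in these two configs are derived from one another rather than independent. Roughly: SimCC discretizes each axis of the input crop into `input_size * simcc_split_ratio` classification bins, `in_featuremap_size` is the stride-32 resolution of the crop, and `auto_scale_lr` rescales the learning rate linearly when the actual total batch size differs from `base_batch_size` (8 GPUs x 704 samples per GPU = 5632 here) and LR auto-scaling is enabled at launch. A small, purely illustrative sketch of that arithmetic for the 256x192 config:

# illustrative arithmetic only; values taken from the 256x192 config above
input_size = (192, 256)            # (w, h) of the cropped person
simcc_split_ratio = 2.0
base_lr = 5e-4
base_batch_size = 5632             # 8 GPUs x train_batch_size (704)

simcc_bins = tuple(int(s * simcc_split_ratio) for s in input_size)
in_featuremap_size = tuple(s // 32 for s in input_size)
print(simcc_bins)                  # (384, 512) bins along x and y
print(in_featuremap_size)          # (6, 8) backbone feature map fed to the head

# linear scaling rule applied by auto_scale_lr when it is enabled
actual_batch = 4 * 704             # e.g. the same config launched on 4 GPUs
print(base_lr * actual_batch / base_batch_size)   # 0.00025
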
diff --git a/tools/misc/pth_transfer.py b/tools/misc/pth_transfer.py
index ee08fee748..7433c6771e 100644
--- a/tools/misc/pth_transfer.py
+++ b/tools/misc/pth_transfer.py
@@ -6,7 +6,7 @@
def change_model(args):
- dis_model = torch.load(args.dis_path)
+ dis_model = torch.load(args.dis_path, map_location='cpu')
all_name = []
if args.two_dis:
for name, v in dis_model['state_dict'].items():
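
The only functional change in `pth_transfer.py` is the added `map_location='cpu'`, which lets the conversion script run on machines that do not have the GPU the distilled checkpoint was saved from. A minimal sketch of the difference (the checkpoint path below is hypothetical):

import torch

ckpt_path = 'work_dirs/dwpose_distill/epoch_120.pth'  # hypothetical path

# Without map_location, torch.load restores each tensor onto the device it was
# saved from (e.g. 'cuda:0'), which fails on a CPU-only machine or one with
# fewer GPUs than the training node.
# state = torch.load(ckpt_path)

# Remapping everything to CPU first makes the script hardware-independent; the
# extracted weights can still be moved to GPU when the model is later loaded.
state = torch.load(ckpt_path, map_location='cpu')
print(list(state['state_dict'])[:5])
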