diff --git a/mmtrack/datasets/__init__.py b/mmtrack/datasets/__init__.py index dbb34dece..0a1678542 100644 --- a/mmtrack/datasets/__init__.py +++ b/mmtrack/datasets/__init__.py @@ -14,10 +14,12 @@ from .sot_train_dataset import SOTTrainDataset from .trackingnet_dataset import TrackingNetTestDataset from .uav123_dataset import UAV123Dataset +from .youtube_vis_dataset import YouTubeVISDataset __all__ = [ 'DATASETS', 'PIPELINES', 'build_dataloader', 'build_dataset', 'CocoVID', 'CocoVideoDataset', 'ImagenetVIDDataset', 'MOTChallengeDataset', 'ReIDDataset', 'SOTTrainDataset', 'SOTTestDataset', 'LaSOTDataset', - 'UAV123Dataset', 'TrackingNetTestDataset', 'OTB100Dataset' + 'UAV123Dataset', 'TrackingNetTestDataset', 'OTB100Dataset', + 'YouTubeVISDataset' ] diff --git a/mmtrack/datasets/youtube_vis_dataset.py b/mmtrack/datasets/youtube_vis_dataset.py new file mode 100644 index 000000000..45ca169ae --- /dev/null +++ b/mmtrack/datasets/youtube_vis_dataset.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
from mmdet.datasets import DATASETS

from .coco_video_dataset import CocoVideoDataset


@DATASETS.register_module()
class YouTubeVISDataset(CocoVideoDataset):
    """YouTube VIS dataset for video instance segmentation.

    The category tuple exposed as ``CLASSES`` depends on the dataset release
    and is selected from ``dataset_version`` before the parent initializer
    is called.

    Args:
        dataset_version (str): Release of YouTube-VIS to use. Options are
            '2019' and '2021'.
        *args: Positional arguments forwarded to ``CocoVideoDataset``.
        **kwargs: Keyword arguments forwarded to ``CocoVideoDataset``.
    """

    # Category names of the 2019 release.
    CLASSES_2019_version = ('person', 'giant_panda', 'lizard', 'parrot',
                            'skateboard', 'sedan', 'ape', 'dog', 'snake',
                            'monkey', 'hand', 'rabbit', 'duck', 'cat', 'cow',
                            'fish', 'train', 'horse', 'turtle', 'bear',
                            'motorbike', 'giraffe', 'leopard', 'fox', 'deer',
                            'owl', 'surfboard', 'airplane', 'truck', 'zebra',
                            'tiger', 'elephant', 'snowboard', 'boat', 'shark',
                            'mouse', 'frog', 'eagle', 'earless_seal',
                            'tennis_racket')

    # Category names of the 2021 release.
    CLASSES_2021_version = ('airplane', 'bear', 'bird', 'boat', 'car', 'cat',
                            'cow', 'deer', 'dog', 'duck', 'earless_seal',
                            'elephant', 'fish', 'flying_disc', 'fox', 'frog',
                            'giant_panda', 'giraffe', 'horse', 'leopard',
                            'lizard', 'monkey', 'motorbike', 'mouse', 'parrot',
                            'person', 'rabbit', 'shark', 'skateboard', 'snake',
                            'snowboard', 'squirrel', 'surfboard',
                            'tennis_racket', 'tiger', 'train', 'truck',
                            'turtle', 'whale', 'zebra')

    def __init__(self, dataset_version, *args, **kwargs):
        # Select CLASSES first; presumably the parent initializer reads it
        # (TODO confirm against CocoVideoDataset).
        self.set_dataset_classes(dataset_version)
        super().__init__(*args, **kwargs)

    @classmethod
    def set_dataset_classes(cls, dataset_version):
        """Bind ``cls.CLASSES`` to the category tuple of the given release.

        NOTE(review): this rebinds a class attribute, so every instance of
        this class sees the most recently selected version.

        Args:
            dataset_version (str): '2019' or '2021'.

        Raises:
            NotImplementedError: If ``dataset_version`` is not supported.
        """
        if dataset_version == '2019':
            cls.CLASSES = cls.CLASSES_2019_version
        elif dataset_version == '2021':
            cls.CLASSES = cls.CLASSES_2021_version
        else:
            # The trailing space keeps the two literals from fusing into
            # "datasetversion" in the rendered message.
            raise NotImplementedError('Not supported YouTubeVIS dataset '
                                      f'version: {dataset_version}')


# diff --git a/tools/convert_datasets/youtubevis/youtubevis2coco.py b/tools/convert_datasets/youtubevis/youtubevis2coco.py
# new file mode 100644
# index 000000000..9312ba272
# --- /dev/null
# +++ b/tools/convert_datasets/youtubevis/youtubevis2coco.py
# @@ -0,0 +1,152 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy
import os
import os.path as osp
from collections import defaultdict

import mmcv
from tqdm import tqdm


def parse_args():
    """Parse the command line arguments of the converter.

    All three options are mandatory: the conversion cannot run without an
    annotation root, an output directory and a dataset version.

    Returns:
        argparse.Namespace: Namespace with ``input``, ``output`` and
            ``version`` attributes.
    """
    parser = argparse.ArgumentParser(
        description='YouTube-VIS to COCO Video format')
    parser.add_argument(
        '-i',
        '--input',
        required=True,
        help='root directory of YouTube-VIS annotations',
    )
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        help='directory to save coco formatted label file',
    )
    parser.add_argument(
        '--version',
        required=True,
        choices=['2019', '2021'],
        help='The version of YouTube-VIS Dataset',
    )
    return parser.parse_args()


def convert_vis(ann_dir, save_dir, dataset_version, mode='train'):
    """Convert YouTube-VIS dataset in COCO style.

    The official per-video annotations are flattened into per-image and
    per-object records, written to
    ``youtube_vis_{dataset_version}_{mode}.json`` inside ``save_dir``, and a
    short summary is printed.

    Args:
        ann_dir (str): The path of YouTube-VIS dataset.
        save_dir (str): The path to save `VIS`.
        dataset_version (str): The version of dataset. Options are '2019',
            '2021'.
        mode (str): Convert train dataset or validation dataset or test
            dataset. Options are 'train', 'valid', 'test'. Default: 'train'.
    """
    assert dataset_version in ['2019', '2021']
    assert mode in ['train', 'valid', 'test']
    VIS = defaultdict(list)
    records = dict(vid_id=1, img_id=1, ann_id=1, global_instance_id=1)
    # Objects per category id; categories without objects read as 0.
    obj_num_classes = defaultdict(int)

    # The two releases lay out their annotation files differently.
    if dataset_version == '2019':
        ann_file = osp.join(ann_dir, f'{mode}.json')
    else:
        ann_file = osp.join(ann_dir, mode, 'instances.json')
    official_anns = mmcv.load(ann_file)
    VIS['categories'] = copy.deepcopy(official_anns['categories'])

    # Only the train split is treated as carrying annotations.
    has_annotations = mode == 'train'
    if has_annotations:
        vid_to_anns = defaultdict(list)
        for ann_info in official_anns['annotations']:
            vid_to_anns[ann_info['video_id']].append(ann_info)

    video_infos = official_anns['videos']
    for video_info in tqdm(video_infos):
        # The video name is the directory part of the first frame path.
        video_name = video_info['file_names'][0].split('/')[0]
        video = dict(id=video_info['id'], name=video_name)
        VIS['videos'].append(video)

        width = video_info['width']
        height = video_info['height']
        if has_annotations:
            ann_infos_in_video = vid_to_anns[video_info['id']]
            # Maps the official track id to the global instance id.
            instance_id_maps = dict()

        for frame_id, file_name in enumerate(video_info['file_names']):
            image = dict(
                file_name=file_name,
                height=height,
                width=width,
                id=records['img_id'],
                frame_id=frame_id,
                video_id=video_info['id'])
            VIS['images'].append(image)

            if has_annotations:
                for ann_info in ann_infos_in_video:
                    bbox = ann_info['bboxes'][frame_id]
                    # No box means the object is absent from this frame.
                    if bbox is None:
                        continue

                    category_id = ann_info['category_id']
                    track_id = ann_info['id']
                    segmentation = ann_info['segmentations'][frame_id]
                    area = ann_info['areas'][frame_id]
                    assert isinstance(category_id, int)
                    assert isinstance(track_id, int)
                    assert segmentation is not None
                    assert area is not None

                    if track_id in instance_id_maps:
                        instance_id = instance_id_maps[track_id]
                    else:
                        instance_id = records['global_instance_id']
                        records['global_instance_id'] += 1
                        instance_id_maps[track_id] = instance_id

                    ann = dict(
                        id=records['ann_id'],
                        video_id=video_info['id'],
                        image_id=records['img_id'],
                        category_id=category_id,
                        instance_id=instance_id,
                        bbox=bbox,
                        segmentation=segmentation,
                        area=area,
                        iscrowd=ann_info['iscrowd'])

                    obj_num_classes[category_id] += 1

                    VIS['annotations'].append(ann)
                    records['ann_id'] += 1
            records['img_id'] += 1
        records['vid_id'] += 1

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(save_dir, exist_ok=True)
    mmcv.dump(VIS,
              osp.join(save_dir, f'youtube_vis_{dataset_version}_{mode}.json'))
    print(f'-----YouTube VIS {dataset_version} {mode}------')
    print(f'{records["vid_id"]- 1} videos')
    print(f'{records["img_id"]- 1} images')
    if has_annotations:
        print(f'{records["ann_id"] - 1} objects')
        print(f'{records["global_instance_id"] - 1} instances')
    print('-----------------------')
    if has_annotations:
        # Look counts up by the category's own id, not its list position, so
        # empty categories and non-contiguous ids do not raise KeyError.
        for category in VIS['categories']:
            class_id = category['id']
            class_name = category['name']
            print(f'Class {class_id} {class_name} has '
                  f'{obj_num_classes[class_id]} objects.')


def main():
    """Convert the train, valid and test splits in turn."""
    args = parse_args()
    for sub_set in ['train', 'valid', 'test']:
        convert_vis(args.input, args.output, args.version, sub_set)


if __name__ == '__main__':
    main()