diff --git a/docs/reference/audiovisual-emotion-recognition-learner.md b/docs/reference/audiovisual-emotion-recognition-learner.md
new file mode 100644
index 0000000000..6391a1c21a
--- /dev/null
+++ b/docs/reference/audiovisual-emotion-recognition-learner.md
@@ -0,0 +1,329 @@
+## audiovisual_emotion_learner module
+
+The *audiovisual_emotion_learner* module contains the *AudiovisualEmotionLearner* class, which inherits from the abstract class *Learner*.
+
+### Class AudiovisualEmotionLearner
+Bases: `opendr.engine.learners.Learner`
+
+The *AudiovisualEmotionLearner* class provides an implementation of an audiovisual emotion recognition method that uses audio and video (frontal face) inputs.
+The implementation follows the method described in ['Self-attention fusion for audiovisual emotion recognition with incomplete data'](https://arxiv.org/abs/2201.11095).
+Three fusion methods are provided.
+
+AudiovisualEmotionLearner relies on the EfficientFace model implementation [1].
+Parts of the training pipeline are modified from [2].
+
+The [AudiovisualEmotionLearner](/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/avlearner.py) class has the following public methods:
+
+#### `AudiovisualEmotionLearner` constructor
+```python
+AudiovisualEmotionLearner(self, num_class, seq_length, fusion, mod_drop, pretr_ef, lr, lr_steps, momentum, dampening, weight_decay, iters, batch_size, n_workers, checkpoint_after_iter, checkpoint_load_iter, temp_path, device)
+```
+
+**Parameters**:
+
+- **num_class**: *int, default=8*\
+  Specifies the number of classes.
+
+- **seq_length**: *int, default=15*\
+  Length of the frame sequence representing a video.
+
+- **fusion**: *str, default='ia'*\
+  Modality fusion method.
+  Options are 'ia', 'it', and 'lt', referring to 'intermediate attention', 'intermediate transformer', and 'late transformer' fusion, respectively.
+  Refer to [here](https://arxiv.org/abs/2201.11095) for details.
+
+- **mod_drop**: *str, default='zerodrop', {'nodrop', 'noisedrop', 'zerodrop'}*\
+  Modality dropout method.
+  Refer to [here](https://arxiv.org/abs/2201.11095) for details.
+
+- **pretr_ef**: *str, default=None*\
+  Checkpoint of the pre-trained EfficientFace model that is used to initialize the weights of the vision backbone.
+  The default is None, i.e., no initialization.
+  It is recommended to initialize from a pre-trained model, which can be obtained e.g. from [here](https://github.com/zengqunzhao/EfficientFace) (`EfficientFace_Trained_on_AffectNet7.pth.tar` is recommended).
+
+- **lr**: *float, default=0.04*\
+  Specifies the learning rate of the optimizer.
+
+- **lr_steps**: *list of ints, default=[40, 55, 65, 70, 200, 250]*\
+  Specifies the epochs at which the learning rate is reduced by a factor of 10.
+
+- **momentum**: *float, default=0.9*\
+  Specifies the momentum of the SGD optimizer.
+
+- **dampening**: *float, default=0.9*\
+  Specifies the dampening coefficient.
+
+- **weight_decay**: *float, default=1e-3*\
+  Specifies the weight decay coefficient.
+
+- **iters**: *int, default=100*\
+  Specifies the number of epochs used to train the classifier.
+
+- **batch_size**: *int, default=8*\
+  Specifies the minibatch size.
+
+- **n_workers**: *int, default=4*\
+  Specifies the number of workers used for data loading.
+
+- **device**: *str, default='cpu', {'cuda', 'cpu'}*\
+  Specifies the computation device.
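+
+A minimal construction example is shown below; the parameter values are illustrative and correspond to the documented defaults:
+
+```python
+from opendr.perception.multimodal_human_centric import AudiovisualEmotionLearner
+
+learner = AudiovisualEmotionLearner(num_class=8, seq_length=15, fusion='ia',
+                                    mod_drop='zerodrop', device='cpu')
+```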
+
+#### `AudiovisualEmotionLearner.fit`
+```python
+AudiovisualEmotionLearner.fit(self, dataset, val_dataset, logging_path, silent, verbose, eval_mode, restore_best)
+```
+
+Method to train the audiovisual emotion recognition model.
+After calling this method the model is trained for the specified number of epochs and the last checkpoint is saved.
+If a validation set is provided, the checkpoint that performs best on the validation set is saved in addition.
+
+Returns a dictionary containing lists of cross entropy measures (dict keys: `"train_loss"`, `"val_loss"`) and lists of accuracies (dict keys: `"train_acc"`, `"val_acc"`) recorded during the entire optimization process.
+
+**Parameters**:
+
+- **dataset**: *engine.datasets.DatasetIterator*\
+  OpenDR dataset object that holds the training set.
+  Its `__getitem__` should return audio data of shape (C x N) and video data of shape (C x N x H x W).
+
+- **val_dataset**: *engine.datasets.DatasetIterator, default=None*\
+  OpenDR dataset object that holds the validation set.
+  Its `__getitem__` should return audio data of shape (C x N) and video data of shape (C x N x H x W).
+  If `val_dataset` is not `None`, it is used to select and save the model weights that produce the best validation accuracy.
+
+- **logging_path**: *str, default='logs/'*\
+  Path for saving Tensorboard logs and model checkpoints.
+
+- **silent**: *bool, default=False*\
+  If set to True, disables all printing; otherwise, the performance statistics are printed to STDOUT after every 10th epoch.
+
+- **verbose**: *bool, default=True*\
+  If set to True, the performance statistics are printed after every epoch.
+
+- **eval_mode**: *str, default='audiovisual', {'audiovisual', 'noisyaudio', 'noisyvideo', 'onlyaudio', 'onlyvideo'}*\
+  Evaluation mode of the model.
+  Check [here](https://arxiv.org/abs/2201.11095) for details.
+
+- **restore_best**: *bool, default=False*\
+  If set to True, the weights that perform best on the validation set are restored at the end of training (if a validation set is provided); otherwise, the best weights on the training set are restored.
+
+**Returns**:
+
+- **performance**: *dict*\
+  A dictionary that holds the lists of performance curves with the following keys: `"train_acc"`, `"train_loss"`, `"val_acc"`, `"val_loss"`.
+
+
+#### `AudiovisualEmotionLearner.eval`
+```python
+AudiovisualEmotionLearner.eval(self, dataset, silent, verbose, mode)
+```
+
+This method is used to evaluate the current audiovisual emotion recognition model on a given dataset.
+
+**Parameters**:
+
+- **dataset**: *engine.datasets.DatasetIterator*\
+  OpenDR dataset object that holds the evaluation set.
+  Its `__getitem__` should return audio data of shape (C x N) and video data of shape (C x N x H x W).
+
+- **silent**: *bool, default=False*\
+  If set to True, disables all printing.
+
+- **verbose**: *bool, default=True*\
+  If set to True, prints the accuracy and performance of the model.
+
+- **mode**: *str, default='audiovisual', {'audiovisual', 'noisyaudio', 'noisyvideo', 'onlyaudio', 'onlyvideo'}*\
+  Evaluation mode of the model.
+  Check [here](https://arxiv.org/abs/2201.11095) for details.
+
+**Returns**:
+
+- **performance**: *dict*\
+  Dictionary that contains `"cross_entropy"` and `"acc"` as keys.
+
+
+#### `AudiovisualEmotionLearner.infer`
+```python
+AudiovisualEmotionLearner.infer(self, audio, video)
+```
+
+This method is used to generate an emotion prediction from a given audio and video input.
+Returns an instance of `engine.target.Category` representing the prediction.
+
+**Parameters**:
+
+- **audio**: *engine.data.Timeseries*\
+  Object of type `engine.data.Timeseries` that holds the input audio data.
+
+- **video**: *engine.data.Video*\
+  Object of type `engine.data.Video` that holds the input video data.
+
+**Returns**:
+
+- **prediction**: *engine.target.Category*\
+  Object of type `engine.target.Category` that contains the prediction.
+
+
+#### `AudiovisualEmotionLearner.save`
+```python
+AudiovisualEmotionLearner.save(self, path, verbose)
+```
+
+This method is used to save the current model instance under a given path.
+The saved model can be loaded later by calling `AudiovisualEmotionLearner.load(path)`.
+Two files are saved under the given directory path, namely `"path/metadata.json"` and `"path/model_weights.pt"`.
+The former keeps the metadata and the latter keeps the model weights.
+
+**Parameters**:
+
+- **path**: *str*\
+  Directory path to save the model.
+
+- **verbose**: *bool, default=True*\
+  If set to True, prints an acknowledgement message when saving is successful.
+
+#### `AudiovisualEmotionLearner.download`
+```python
+AudiovisualEmotionLearner.download(self, path)
+```
+
+This method is used to download a pretrained model to a given directory.
+A pretrained model is provided for the 'ia' fusion with 'zerodrop' modality dropout.
+The pretrained model is trained on the [RAVDESS](https://zenodo.org/record/1188976#.YlkXNyjP1PY) dataset, which is under the CC BY-NC-SA 4.0 license.
+
+**Parameters**:
+
+- **path**: *str*\
+  Directory path to download the model to.
+  Under this path, `"metadata.json"` and `"model_weights.pt"` will be downloaded.
+  The weights of the downloaded pretrained model can be loaded by calling the `AudiovisualEmotionLearner.load` method.
+
+#### `AudiovisualEmotionLearner.load`
+```python
+AudiovisualEmotionLearner.load(self, path, verbose)
+```
+
+This method is used to load a model previously saved by calling `AudiovisualEmotionLearner.save` from a given directory.
+Note that `"metadata.json"` and `"model_weights.pt"` must exist under the given directory path.
+
+**Parameters**:
+
+- **path**: *str*\
+  Directory path of the model to be loaded.
+
+- **verbose**: *bool, default=True*\
+  If set to True, prints an acknowledgement message when model loading is successful.
+
+#### `opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm.data.get_audiovisual_emotion_dataset`
+```python
+opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm.data.get_audiovisual_emotion_dataset(path, sr, n_mfcc, preprocess, target_time, input_fps, save_frames, target_im_size, device)
+```
+
+Constructs and returns the training, validation, and test dataset objects for the RAVDESS dataset, optionally preprocessing the raw data first.
+
+**Parameters**:
+
+- **path**: *str*\
+  Specifies the directory path where the dataset resides.
+  For training on the [RAVDESS dataset](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0196391), please download the data from [here](https://zenodo.org/record/1188976#.YlCAwijP2bi).
+
+- **sr**: *int, default=22050*\
+  Specifies the sampling rate used for audio processing.
+
+- **n_mfcc**: *int, default=10*\
+  Specifies the number of MFCC features to extract from the audio.
+
+- **preprocess**: *bool, default=False*\
+  If set to True, preprocesses the downloaded RAVDESS dataset for training and saves the processed data files.
+
+- **target_time**: *float, default=3.6*\
+  Target time in seconds to which the videos and audio clips are cropped/padded.
+
+- **input_fps**: *int, default=30*\
+  Frames per second of the input video files.
+
+- **save_frames**: *int, default=15*\
+  Number of frames from each video to keep for training.
+
+- **target_im_size**: *int, default=224*\
+  Target height and width to which the video frames are resized.
+
+- **device**: *str, default='cpu', {'cpu', 'cuda'}*\
+  Device on which to run the preprocessing (has no effect if `preprocess=False`).
+
+**Returns**:
+
+- **train_set**: *opendr.engine.datasets.DatasetIterator*\
+  The training set object that can be used with the `fit()` method to train the emotion recognition model.
+
+- **val_set**: *opendr.engine.datasets.DatasetIterator*\
+  The validation set object that can be used with the `fit()` method for model selection during training.
+
+- **test_set**: *opendr.engine.datasets.DatasetIterator*\
+  The test set object that can be used with the `eval()` method to evaluate the emotion recognition model.
+
+
+### Examples
+
+* **Training an audiovisual emotion recognition model**.
+  In this example, we will train an audiovisual emotion recognition model on the [RAVDESS](https://zenodo.org/record/1188976#.YlCAwijP2bi) dataset.
+  First, the dataset needs to be downloaded (the files Video_Speech_Actor_[01-24].zip and Audio_Speech_Actors_01-24.zip).
+  The directory should be organized as follows:
+  ```
+  RAVDESS
+  └───ACTOR01
+  │   │   01-01-01-01-01-01-01.mp4
+  │   │   01-01-01-01-01-02-01.mp4
+  │   │   ...
+  │   │   03-01-01-01-01-01-01.wav
+  │   │   03-01-01-01-01-02-01.wav
+  │   │   ...
+  └───ACTOR02
+  └───...
+  └───ACTOR24
+  ```
+  The training, validation, and test data objects can be constructed easily by using our method `get_audiovisual_emotion_dataset()` as follows:
+
+  ```python
+  from opendr.perception.multimodal_human_centric import get_audiovisual_emotion_dataset
+  from opendr.perception.multimodal_human_centric import AudiovisualEmotionLearner
+
+  train_set, val_set, test_set = get_audiovisual_emotion_dataset('RAVDESS/', preprocess=True)
+  ```
+  When the data is downloaded and processed for the first time, it has to be preprocessed by setting the argument `preprocess=True` in `get_audiovisual_emotion_dataset()`.
+  This preprocesses the audio and video files, saves the preprocessed files as numpy arrays, and creates a random train-val-test split.
+  The preprocessing should be run once, and `preprocess=False` should be used in subsequent runs on the same dataset.
+
+  Then, we can construct the emotion recognition model and train the learner for 100 epochs as follows:
+
+  ```python
+  learner = AudiovisualEmotionLearner(iters=100, device='cuda')
+  performance = learner.fit(train_set, val_set, logging_path='logs', restore_best=True)
+  learner.save('model')
+  ```
+  Here, we restore the weights that perform best on the validation set and save the model.
+  The training logs are saved under `logs/`.
+
+  Additionally, a pretrained EfficientFace model can be obtained from [here](https://github.com/zengqunzhao/EfficientFace) and used to initialize the weights of the vision backbone by setting `pretr_ef='EfficientFace_Trained_on_AffectNet7.pth.tar'`.
+
+* **Using a pretrained audiovisual emotion recognition model**
+
+  In this example, we will demonstrate how a pretrained audiovisual emotion recognition model can be used.
+  First, we download the pre-trained model.
+  The model is trained on the RAVDESS [3] dataset, which is under the CC BY-NC-SA 4.0 license.
+
+  ```python
+  from opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.avlearner import AudiovisualEmotionLearner
+
+  learner = AudiovisualEmotionLearner()
+  learner.download('model')
+  learner.load('model')
+  ```
+
+  Given an input video and audio file, we can preprocess the data and run inference with the pretrained model as follows (here `input_audio.wav` and `input_video.mp4` are placeholder file names):
+
+  ```python
+  audio, video = learner.load_inference_data('input_audio.wav', 'input_video.mp4')
+  prediction = learner.infer(audio, video)
+  ```
+
+#### References
+[1] https://github.com/zengqunzhao/EfficientFace
+[2] https://github.com/okankop/Efficient-3DCNNs
+[3] https://zenodo.org/record/1188976#.YlCAwijP2bi
diff --git a/docs/reference/index.md b/docs/reference/index.md
index 1514a10acb..728f90a959 100644
--- a/docs/reference/index.md
+++ b/docs/reference/index.md
@@ -52,6 +52,7 @@ Neither the copyright holder nor any applicable licensor will be liable for any
     - [ab3dmot Module](object-tracking-3d-ab3dmot.md)
   - multimodal human centric:
     - [rgbd_hand_gesture_learner Module](rgbd-hand-gesture-learner.md)
+    - [audiovisual_emotion_recognition_learner Module](audiovisual-emotion-recognition-learner.md)
   - compressive learning:
     - [multilinear_compressive_learning Module](multilinear-compressive-learning.md)
   - semantic segmentation:
@@ -103,7 +104,8 @@ Neither the copyright holder nor any applicable licensor will be liable for any
   - pose estimation:
     - [lightweight_open_pose Demo](/projects/perception/lightweight_open_pose)
   - multimodal human centric:
-    - [rgbd_hand_gesture_learner Demo](/projects/perception/multimodal_human_centric)
+    - [rgbd_hand_gesture_learner Demo](/projects/perception/multimodal_human_centric/rgbd_hand_gesture_recognition)
+    - [audiovisual_emotion_recognition Demo](/projects/perception/multimodal_human_centric/audiovisual_emotion_recognition)
   - object detection 2d:
     - [detr Demo](/projects/perception/object_detection_2d/detr)
     - [gem Demo](/projects/perception/object_detection_2d/gem)
diff --git a/projects/opendr_ws/src/perception/scripts/audiovisual_emotion_recognition.py b/projects/opendr_ws/src/perception/scripts/audiovisual_emotion_recognition.py
new file mode 100644
index 0000000000..c4fe3e126a
--- /dev/null
+++ b/projects/opendr_ws/src/perception/scripts/audiovisual_emotion_recognition.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020-2022 OpenDR European Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
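+
+# Example usage, assuming a ROS master is running, a camera node publishes
+# 224x224 face crops on /usb_cam/image_raw and an audio capture node publishes
+# on /audio/audio (the package name 'perception' is inferred from this script's
+# location in the workspace and may differ in your setup):
+#   rosrun perception audiovisual_emotion_recognition.py \
+#       --video_topic /usb_cam/image_raw --audio_topic /audio/audio --buffer_size 3.6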
+ +import os +import argparse +import numpy as np +import torch +import librosa + +import rospy +import message_filters +from sensor_msgs.msg import Image as ROS_Image +from audio_common_msgs.msg import AudioData +from vision_msgs.msg import Classification2D + +from opendr_bridge import ROSBridge +from opendr.perception.multimodal_human_centric import AudiovisualEmotionLearner +from opendr.perception.multimodal_human_centric import spatial_transforms as transforms +from opendr.engine.data import Video, Timeseries + + +class AudiovisualEmotionNode: + + def __init__(self, input_video_topic="/usb_cam/image_raw", input_audio_topic="/audio/audio", + annotations_topic="/opendr/audiovisual_emotion", buffer_size=3.6, device="cuda"): + """ + Creates a ROS Node for audiovisual emotion recognition + :param input_video_topic: Topic from which we are reading the input video. Expects detected face of size 224x224 + :type input_video_topic: str + :param input_audio_topic: Topic from which we are reading the input audio + :type input_audio_topic: str + :param annotations_topic: Topic to which we are publishing the predicted class + :type annotations_topic: str + :param buffer_size: length of audio and video in sec + :type buffer_size: float + :param device: device on which we are running inference ('cpu' or 'cuda') + :type device: str + """ + + self.publisher = rospy.Publisher(annotations_topic, Classification2D, queue_size=10) + + video_sub = message_filters.Subscriber(input_video_topic, ROS_Image) + audio_sub = message_filters.Subscriber(input_audio_topic, AudioData) + # synchronize video and audio data topics + ts = message_filters.ApproximateTimeSynchronizer([video_sub, audio_sub], 10, 0.1, allow_headerless=True) + ts.registerCallback(self.callback) + + self.bridge = ROSBridge() + + # Initialize the gesture recognition + self.avlearner = AudiovisualEmotionLearner(device=device, fusion='ia', mod_drop='zerodrop') + if not os.path.exists('model'): + self.avlearner.download('model') + self.avlearner.load('model') + + self.buffer_size = buffer_size + self.data_buffer = np.zeros((1)) + self.video_buffer = np.zeros((1, 224, 224, 3)) + + self.video_transform = transforms.Compose([ + transforms.ToTensor(255)]) + + def listen(self): + """ + Start the node and begin processing input data + """ + rospy.init_node('opendr_audiovisualemotion_recognition', anonymous=True) + rospy.loginfo("Audiovisual emotion recognition node started!") + rospy.spin() + + def callback(self, image_data, audio_data): + """ + Callback that process the input data and publishes to the corresponding topics + :param image_data: input image message, face image of size 224x224 + :type image_data: sensor_msgs.msg.Image + :param audio_data: input audio message, speech + :type audio_data: audio_common_msgs.msg.AudioData + """ + audio_data = np.reshape(np.frombuffer(audio_data.data, dtype=np.int16)/32768.0, (1, -1)) + self.data_buffer = np.append(self.data_buffer, audio_data) + image_data = self.bridge.from_ros_image(image_data, encoding='bgr8').convert(format='channels_last') + self.video_buffer = np.append(self.video_buffer, np.expand_dims(image_data.data, 0), axis=0) + + if self.data_buffer.shape[0] > 16000*self.buffer_size: + audio = librosa.feature.mfcc(self.data_buffer[1:], sr=16000, n_mfcc=10) + audio = Timeseries(audio) + + to_select = select_distributed(15, len(self.video_buffer)-1) + video = self.video_buffer[1:][to_select] + + video = [self.video_transform(img) for img in video] + video = Video(torch.stack(video, 0).permute(1, 0, 2, 3)) 
+
+            class_pred = self.avlearner.infer(audio, video)
+
+            # Publish output
+            ros_class = self.bridge.from_category_to_rosclass(class_pred)
+            self.publisher.publish(ros_class)
+
+            self.data_buffer = np.zeros((1))
+            self.video_buffer = np.zeros((1, 224, 224, 3))
+
+
+def select_distributed(m, n): return [i*n//m + n//(2*m) for i in range(m)]
+
+
+if __name__ == '__main__':
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--video_topic', type=str, default='/usb_cam/image_raw',
+                        help='listen to video input data on this topic')
+    parser.add_argument('--audio_topic', type=str, default='/audio/audio',
+                        help='listen to audio input data on this topic')
+    parser.add_argument('--buffer_size', type=float, default=3.6, help='size of the audio buffer in seconds')
+    args = parser.parse_args()
+
+    avnode = AudiovisualEmotionNode(input_video_topic=args.video_topic, input_audio_topic=args.audio_topic,
+                                    annotations_topic="/opendr/audiovisual_emotion",
+                                    buffer_size=args.buffer_size, device=device)
+    avnode.listen()
diff --git a/projects/perception/multimodal_human_centric/audiovisual_emotion_recognition/README.MD b/projects/perception/multimodal_human_centric/audiovisual_emotion_recognition/README.MD
new file mode 100644
index 0000000000..842bd53d04
--- /dev/null
+++ b/projects/perception/multimodal_human_centric/audiovisual_emotion_recognition/README.MD
@@ -0,0 +1,10 @@
+# Audiovisual Emotion Recognition
+
+This folder contains a sample application for audiovisual emotion recognition from a provided audio and video file.
+The demo can be run as follows:
+
+```sh
+python3 audiovisual_emotion_recognition_demo.py -input_video input_video.mp4 -input_audio input_audio.wav
+```
+
+The tool expects a speech recording as an audio file and a time-matched frontal-face video as a video file. Example audio and video files can be obtained from the [RAVDESS dataset](https://zenodo.org/record/1188976#.Yl2yNijP1Pa): suitable audio-video pairs are found in 'Audio_Speech_Actors_01-24.zip' and 'Video_Speech_Actor_XX.zip', which are also the data used to train the model. The RAVDESS dataset is under the CC BY-NC-SA 4.0 license.
diff --git a/projects/perception/multimodal_human_centric/audiovisual_emotion_recognition/audiovisual_emotion_recognition_demo.py b/projects/perception/multimodal_human_centric/audiovisual_emotion_recognition/audiovisual_emotion_recognition_demo.py
new file mode 100644
index 0000000000..64367c7a29
--- /dev/null
+++ b/projects/perception/multimodal_human_centric/audiovisual_emotion_recognition/audiovisual_emotion_recognition_demo.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020-2022 OpenDR European Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
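+
+# Example usage (see the README in this folder; 'input_video.mp4' and
+# 'input_audio.wav' are a matching frontal-face video and speech recording):
+#   python3 audiovisual_emotion_recognition_demo.py -input_video input_video.mp4 -input_audio input_audio.wav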
+ +import argparse +from opendr.perception.multimodal_human_centric import AudiovisualEmotionLearner +import os + +parser = argparse.ArgumentParser() + +parser.add_argument('-input_video', type=str, help='path to video file', required=True) +parser.add_argument('-input_audio', type=str, help='path to audio file', required=True) + +args = parser.parse_args() + +assert os.path.exists(args.input_video), 'Provided input video file does not exist' +assert os.path.exists(args.input_audio), 'Provided input audio file does not exist' + +avlearner = AudiovisualEmotionLearner(device='cuda', fusion='ia', mod_drop='zerodrop') + +avlearner.download('model') +avlearner.load('model') + +audio, video = avlearner.load_inference_data(args.input_audio, args.input_video) +prediction = avlearner.infer(audio, video) +print(avlearner.pred_to_label(prediction)) diff --git a/projects/perception/multimodal_human_centric/README.MD b/projects/perception/multimodal_human_centric/rgbd_hand_gesture_recognition/README.MD similarity index 100% rename from projects/perception/multimodal_human_centric/README.MD rename to projects/perception/multimodal_human_centric/rgbd_hand_gesture_recognition/README.MD diff --git a/projects/perception/multimodal_human_centric/gesture_recognition_demo.py b/projects/perception/multimodal_human_centric/rgbd_hand_gesture_recognition/gesture_recognition_demo.py similarity index 100% rename from projects/perception/multimodal_human_centric/gesture_recognition_demo.py rename to projects/perception/multimodal_human_centric/rgbd_hand_gesture_recognition/gesture_recognition_demo.py diff --git a/projects/perception/multimodal_human_centric/input_depth.png b/projects/perception/multimodal_human_centric/rgbd_hand_gesture_recognition/input_depth.png similarity index 100% rename from projects/perception/multimodal_human_centric/input_depth.png rename to projects/perception/multimodal_human_centric/rgbd_hand_gesture_recognition/input_depth.png diff --git a/projects/perception/multimodal_human_centric/input_rgb.png b/projects/perception/multimodal_human_centric/rgbd_hand_gesture_recognition/input_rgb.png similarity index 100% rename from projects/perception/multimodal_human_centric/input_rgb.png rename to projects/perception/multimodal_human_centric/rgbd_hand_gesture_recognition/input_rgb.png diff --git a/src/opendr/perception/multimodal_human_centric/__init__.py b/src/opendr/perception/multimodal_human_centric/__init__.py index e39d7d761b..4c1c7a15b8 100644 --- a/src/opendr/perception/multimodal_human_centric/__init__.py +++ b/src/opendr/perception/multimodal_human_centric/__init__.py @@ -2,5 +2,11 @@ RgbdHandGestureLearner, get_builtin_architectures, ) +from opendr.perception.multimodal_human_centric.\ + audiovisual_emotion_learner.avlearner import AudiovisualEmotionLearner +from opendr.perception.multimodal_human_centric.\ + audiovisual_emotion_learner.algorithm.data import get_audiovisual_emotion_dataset +from opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm import spatial_transforms -__all__ = ['RgbdHandGestureLearner', 'get_builtin_architectures'] +__all__ = ['RgbdHandGestureLearner', 'get_builtin_architectures', 'AudiovisualEmotionLearner', + 'get_audiovisual_emotion_dataset', 'spatial_transforms'] diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/README.MD b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/README.MD new file mode 100644 index 0000000000..5635074312 --- /dev/null +++ 
b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/README.MD @@ -0,0 +1,3 @@ +# OpenDR - Audiovisual Emotion Recognition Model + +This module provides the implementation of the audiovisual emotion recognition method using speech audio and frontal face video data following the approach described in ['Self-attention fusion for audiovisual emotion recognition with incomplete data'](https://arxiv.org/abs/2201.11095). A pretrained model on [RAVDESS](https://zenodo.org/record/1188976#.YlkXNyjP1PY) dataset is provided. RAVDESS dataset is under CC BY-NC-SA 4.0 license. diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/__init__.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/__init__.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/data.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/data.py new file mode 100644 index 0000000000..8fa644d99e --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/data.py @@ -0,0 +1,158 @@ +# Copyright 2020-2022 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
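+
+# Note: the annotations.txt file written by preprocess_ravdess() and read back by
+# parse_annotations() stores one sample per line in the format:
+#   <video .npy file>;<audio .npy file>;<emotion label (1-8)>;<training|validation|testing>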
+ +import torch +import numpy as np +import os +from opendr.engine.datasets import DatasetIterator +from opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm import spatial_transforms as transforms +from opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm.data_utils import ( + preprocess_video, + preprocess_audio + ) +import librosa +from PIL import Image +import random +from tqdm import tqdm + + +class RavdessDataset(DatasetIterator): + def __init__(self, annotation, video_transform, sr=22050, n_mfcc=10): + + self.annotation = annotation + self.video_transform = video_transform + self.sr = sr + self.n_mfcc = n_mfcc + + def __len__(self,): + return len(self.annotation) + + def __getitem__(self, i): + + target = self.annotation[i]['label'] + video = np.load(self.annotation[i]['video_path']) + video = [Image.fromarray(video[i, :, :, :]) + for i in range(np.shape(video)[0])] + self.video_transform.randomize_parameters() + video = [self.video_transform(img) for img in video] + video = torch.stack(video, 0).permute(1, 0, 2, 3) + + audio = np.load(self.annotation[i]['audio_path']).astype(np.float32) + audio = librosa.feature.mfcc(y=audio, sr=self.sr, n_mfcc=self.n_mfcc) + + return audio, video, target + + +class DataWrapper: + def __init__(self, opendr_dataset): + self.dataset = opendr_dataset + + def __len__(self,): + return len(self.dataset) + + def __getitem__(self, i): + x, y, z = self.dataset.__getitem__(i) + return x.data, y.data, z.data + + +def parse_annotations(path, annotation_path): + with open(annotation_path, 'r') as f: + annots = f.readlines() + train_dataset = [] + val_dataset = [] + test_dataset = [] + for line in annots: + videofilename, audiofilename, label, trainvaltest = line.rstrip().split(';') + videofilename = os.path.join(path, videofilename) + audiofilename = os.path.join(path, audiofilename) + + assert os.path.exists(videofilename), 'File {} not found.'.format(videofilename) + assert os.path.exists(audiofilename), 'File {} not found.'.format(audiofilename) + + sample = {'video_path': videofilename, + 'audio_path': audiofilename, + 'label': int(label)-1} + + if trainvaltest == 'training': + train_dataset.append(sample) + elif trainvaltest == 'testing': + test_dataset.append(sample) + elif trainvaltest == 'validation': + val_dataset.append(sample) + + return train_dataset, val_dataset, test_dataset + + +def get_random_split_ravdess(): + ids = list(np.arange(1, 25)) + s1 = ids[::2] + s2 = ids[1::2] + random.shuffle(s1) + random.shuffle(s2) + n_train = 8 + n_val = 2 + train_ids = s1[:n_train] + s2[:n_train] + val_ids = s1[n_train:n_train+n_val]+s2[n_train:n_train+n_val] + test_ids = s1[n_train+n_val:] + s2[n_train+n_val:] + return train_ids, val_ids, test_ids + + +def preprocess_ravdess(src='RAVDESS/', sr=22050, n_mfcc=10, target_time=3.6, + input_fps=30, save_frames=15, target_im_size=224, device='cpu'): + train_ids, val_ids, test_ids = get_random_split_ravdess() + annotations_file = os.path.join(src, 'annotations.txt') + for actor in os.listdir(src): + if int(actor[-2:]) in train_ids: + subset = 'training' + elif int(actor[-2:]) in val_ids: + subset = 'validation' + elif int(actor[-2:]) in test_ids: + subset = 'testing' + + for file in tqdm(os.listdir(os.path.join(src, actor))): + if file.endswith('.mp4'): + video = preprocess_video(os.path.join(src, actor, file), target_time, input_fps, + save_frames, target_im_size, device) + np.save(os.path.join(src, actor, file.replace('.mp4', '.npy')), video) + label = 
str(int(file.split('-')[2])) + audio_path = '03' + file[2:].replace('.mp4', '.wav') + audio = preprocess_audio(os.path.join(src, actor, audio_path), sr, target_time) + np.save(os.path.join(src, actor, audio_path.replace('.wav', '.npy')), audio) + with open(annotations_file, 'a') as f: + f.write(os.path.join(src, actor, file.replace('.mp4', '.npy')) + + ';' + os.path.join(src, actor, audio_path.replace('.wav', '.npy')) + + ';' + label + ';' + subset + '\n') + + +def get_audiovisual_emotion_dataset(path='RAVDESS/', sr=22050, n_mfcc=10, preprocess=False, + target_time=3.6, input_fps=30, save_frames=15, target_im_size=224, device='cpu'): + if preprocess: + preprocess_ravdess(path, sr, n_mfcc, target_time, input_fps, save_frames, target_im_size, device) + annot_path = os.path.join(path, 'annotations.txt') + + train_annots, val_annots, test_annots = parse_annotations(path, annot_path) + video_scale = 255 + + video_train_transform = transforms.Compose([ + transforms.RandomHorizontalFlip(), + transforms.RandomRotate(), + transforms.ToTensor(video_scale)]) + + video_val_transform = transforms.Compose([ + transforms.ToTensor(video_scale)]) + + train_set = RavdessDataset(train_annots, video_train_transform, sr=sr, n_mfcc=n_mfcc) + val_set = RavdessDataset(val_annots, video_val_transform, sr=sr, n_mfcc=n_mfcc) + test_set = RavdessDataset(test_annots, video_val_transform, sr=sr, n_mfcc=n_mfcc) + return train_set, val_set, test_set diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/data_utils.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/data_utils.py new file mode 100644 index 0000000000..154b6fd6a7 --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/data_utils.py @@ -0,0 +1,119 @@ +# Copyright 2020-2022 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import librosa +import numpy as np +import cv2 +from opendr.perception.object_detection_2d import RetinaFaceLearner + + +def preprocess_video(video_path, target_time=3.6, input_fps=30, save_frames=15, target_im_size=224, device='cpu'): + """ + This function preprocesses input video file: crops/pads it to desired target_time (match with audio), + performs face detection and uniformly selects N frames + Parameters + ---------- + video_path : str + path to video file. + target_time : float, optional + Target time of processed video file in seconds. The default is 3.6. + input_fps : int, optional + Frames Per Second of input video file. The default is 30. + save_frames : int, optional + Length of target frame sequence. The default is 15. + target_im_size : int, optional + Target width and height of each frame. The default is 224. 
+ + Returns + ------- + numpy_video: numpy.array + N frames as numpy array + + """ + + learner = RetinaFaceLearner(backbone='resnet', device=device) + + if not os.path.exists('./retinaface_resnet'): + learner.download(".", mode="pretrained") + learner.load("./retinaface_resnet") + + def select_distributed(m, n): return [i*n//m + n//(2*m) for i in range(m)] + + cap = cv2.VideoCapture(video_path) + framen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + if target_time*input_fps > framen: + skip_begin = int((framen - (target_time*input_fps)) // 2) + for i in range(skip_begin): + _, im = cap.read() + + framen = int(target_time*input_fps) + frames_to_select = select_distributed(save_frames, framen) + numpy_video = [] + frame_ctr = 0 + while True: + ret, im = cap.read() + if not ret or len(frames_to_select) == 0: + break + if frame_ctr not in frames_to_select: + frame_ctr += 1 + continue + else: + frames_to_select.remove(frame_ctr) + frame_ctr += 1 + + bboxes = learner.infer(im) + if len(bboxes) > 1: + print('Warning! Multiple faces detected. Using first detected face') + + im = im[int(bboxes[0].top):int(bboxes[0].top+bboxes[0].height), + int(bboxes[0].left):int(bboxes[0].left+bboxes[0].width), :] + + im = cv2.resize(im, (target_im_size, target_im_size)) + numpy_video.append(im) + + if len(frames_to_select) > 0: + for i in range(len(frames_to_select)): + numpy_video.append(np.zeros((224, 224, 3), dtype=np.uint8)) + + numpy_video = np.array(numpy_video) + return numpy_video + + +def preprocess_audio(audio_path, sr=22050, target_time=3.6): + """ + This function preprocesses an audio file. Audio file is cropped/padded to target time. + + Parameters + ---------- + audio_path : str + Path to audio file. + target_time : int, optional + Target duration of audio. The default is 3.6. + sr : int, optional + Sampling rate of audio. The default is 22050. + + Returns + ------- + y : numpy array + audio file saved as numpy array. + """ + y, _ = librosa.core.load(audio_path, sr=sr) + target_length = int(sr * target_time) + if len(y) < target_length: + y = np.array(list(y) + [0 for i in range(target_length - len(y))]) + else: + remain = len(y) - target_length + y = y[remain//2:-(remain - remain//2)] + return y diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/efficientface_modulator.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/efficientface_modulator.py new file mode 100644 index 0000000000..31486f9687 --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/efficientface_modulator.py @@ -0,0 +1,84 @@ +# MIT License +# +# Copyright (c) 2021 Zengqun (Zeke) Zhao +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +This code is modified from: https://github.com/zengqunzhao/EfficientFace/blob/master/models/modulator.py +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Flatten(nn.Module): + def forward(self, x): + return x.view(x.size(0), -1) + + +class Channel(nn.Module): + def __init__(self, gate_channel, reduction_ratio=16, num_layers=1): + super(Channel, self).__init__() + self.gate_c = nn.Sequential() + self.gate_c.add_module('flatten', Flatten()) + gate_channels = [gate_channel] + gate_channels += [gate_channel // reduction_ratio] * num_layers + gate_channels += [gate_channel] + for i in range(len(gate_channels) - 2): + self.gate_c.add_module('gate_c_fc_%d' % i, nn.Linear(gate_channels[i], gate_channels[i+1])) + self.gate_c.add_module('gate_c_bn_%d' % (i+1), nn.BatchNorm1d(gate_channels[i+1])) + self.gate_c.add_module('gate_c_relu_%d' % (i+1), nn.ReLU()) + self.gate_c.add_module('gate_c_fc_final', nn.Linear(gate_channels[-2], gate_channels[-1])) + + def forward(self, in_tensor): + avg_pool = F.avg_pool2d(in_tensor, in_tensor.size(2), stride=in_tensor.size(2)) + return self.gate_c(avg_pool).unsqueeze(2).unsqueeze(3).expand_as(in_tensor) + + +class Spatial(nn.Module): + def __init__(self, gate_channel, reduction_ratio=16, dilation_conv_num=2, dilation_val=4): + super(Spatial, self).__init__() + self.gate_s = nn.Sequential() + self.gate_s.add_module('gate_s_conv_reduce0', nn.Conv2d(gate_channel, gate_channel//reduction_ratio, kernel_size=1)) + self.gate_s.add_module('gate_s_bn_reduce0', nn.BatchNorm2d(gate_channel//reduction_ratio)) + self.gate_s.add_module('gate_s_relu_reduce0', nn.ReLU()) + for i in range(dilation_conv_num): + self.gate_s.add_module('gate_s_conv_di_%d' % i, nn.Conv2d(gate_channel//reduction_ratio, + gate_channel//reduction_ratio, + kernel_size=3, + padding=dilation_val, + dilation=dilation_val)) + self.gate_s.add_module('gate_s_bn_di_%d' % i, nn.BatchNorm2d(gate_channel//reduction_ratio)) + self.gate_s.add_module('gate_s_relu_di_%d' % i, nn.ReLU()) + self.gate_s.add_module('gate_s_conv_final', nn.Conv2d(gate_channel//reduction_ratio, 1, kernel_size=1)) + + def forward(self, in_tensor): + return self.gate_s(in_tensor).expand_as(in_tensor) + + +class Modulator(nn.Module): + def __init__(self, gate_channel): + super(Modulator, self).__init__() + self.channel_att = Channel(gate_channel) + self.spatial_att = Spatial(gate_channel) + + def forward(self, in_tensor): + att = torch.sigmoid(self.channel_att(in_tensor) * self.spatial_att(in_tensor)) + return att * in_tensor diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/efficientface_utils.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/efficientface_utils.py new file mode 100644 index 0000000000..604078ba8a --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/efficientface_utils.py @@ -0,0 +1,155 @@ +# MIT License +# +# Copyright (c) 2021 Zengqun (Zeke) Zhao +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the 
rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +This code is modified from https://github.com/zengqunzhao/EfficientFace/blob/master/models/EfficientFace.py +""" + +import torch +import torch.nn as nn + + +def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False): + return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) + + +def channel_shuffle(x, groups): + batchsize, num_channels, height, width = x.data.size() + channels_per_group = num_channels // groups + # reshape + x = x.view(batchsize, groups, channels_per_group, height, width) + x = torch.transpose(x, 1, 2).contiguous() + # flatten + x = x.view(batchsize, -1, height, width) + return x + + +class LocalFeatureExtractor(nn.Module): + + def __init__(self, inplanes, planes, index): + super(LocalFeatureExtractor, self).__init__() + self.index = index + + norm_layer = nn.BatchNorm2d + self.relu = nn.ReLU() + + self.conv1_1 = depthwise_conv(inplanes, planes, kernel_size=3, stride=2, padding=1) + self.bn1_1 = norm_layer(planes) + self.conv1_2 = depthwise_conv(planes, planes, kernel_size=3, stride=1, padding=1) + self.bn1_2 = norm_layer(planes) + + self.conv2_1 = depthwise_conv(inplanes, planes, kernel_size=3, stride=2, padding=1) + self.bn2_1 = norm_layer(planes) + self.conv2_2 = depthwise_conv(planes, planes, kernel_size=3, stride=1, padding=1) + self.bn2_2 = norm_layer(planes) + + self.conv3_1 = depthwise_conv(inplanes, planes, kernel_size=3, stride=2, padding=1) + self.bn3_1 = norm_layer(planes) + self.conv3_2 = depthwise_conv(planes, planes, kernel_size=3, stride=1, padding=1) + self.bn3_2 = norm_layer(planes) + + self.conv4_1 = depthwise_conv(inplanes, planes, kernel_size=3, stride=2, padding=1) + self.bn4_1 = norm_layer(planes) + self.conv4_2 = depthwise_conv(planes, planes, kernel_size=3, stride=1, padding=1) + self.bn4_2 = norm_layer(planes) + + def forward(self, x): + + patch_11 = x[:, :, 0:28, 0:28] + patch_21 = x[:, :, 28:56, 0:28] + patch_12 = x[:, :, 0:28, 28:56] + patch_22 = x[:, :, 28:56, 28:56] + + out_1 = self.conv1_1(patch_11) + out_1 = self.bn1_1(out_1) + out_1 = self.relu(out_1) + out_1 = self.conv1_2(out_1) + out_1 = self.bn1_2(out_1) + out_1 = self.relu(out_1) + + out_2 = self.conv2_1(patch_21) + out_2 = self.bn2_1(out_2) + out_2 = self.relu(out_2) + out_2 = self.conv2_2(out_2) + out_2 = self.bn2_2(out_2) + out_2 = self.relu(out_2) + + out_3 = self.conv3_1(patch_12) + out_3 = self.bn3_1(out_3) + out_3 = self.relu(out_3) + out_3 = self.conv3_2(out_3) + out_3 = self.bn3_2(out_3) + out_3 = self.relu(out_3) + + out_4 = self.conv4_1(patch_22) + out_4 = self.bn4_1(out_4) + out_4 = self.relu(out_4) + out_4 = self.conv4_2(out_4) + out_4 = self.bn4_2(out_4) + out_4 = self.relu(out_4) + + out1 = torch.cat([out_1, 
out_2], dim=2) + out2 = torch.cat([out_3, out_4], dim=2) + out = torch.cat([out1, out2], dim=3) + + return out + + +class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride): + super(InvertedResidual, self).__init__() + + if not (1 <= stride <= 3): + raise ValueError('illegal stride value') + self.stride = stride + + branch_features = oup // 2 + assert (self.stride != 1) or (inp == branch_features << 1) + + if self.stride > 1: + self.branch1 = nn.Sequential( + depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(inp), + nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True)) + + self.branch2 = nn.Sequential( + nn.Conv2d(inp if (self.stride > 1) else branch_features, + branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(branch_features), + nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True)) + + def forward(self, x): + if self.stride == 1: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + else: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + + out = channel_shuffle(out, 2) + + return out diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/models.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/models.py new file mode 100644 index 0000000000..b1401bb85d --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/models.py @@ -0,0 +1,320 @@ +# Copyright 2020-2022 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Parts of this code is modified from https://github.com/zengqunzhao/EfficientFace/blob/master/models/EfficientFace.py: +""" + +# MIT License +# +# Copyright (c) 2021 Zengqun (Zeke) Zhao +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm.efficientface_modulator import Modulator +from opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm.efficientface_utils import ( + LocalFeatureExtractor, + InvertedResidual + ) +from opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm.transformer_timm import ( + AttentionBlock, + Attention + ) + + +def conv1d_block(in_channels, out_channels, kernel_size=3, stride=1, padding=0): + padding = kernel_size//2 + return nn.Sequential(nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding), + nn.BatchNorm1d(out_channels), + nn.ReLU(inplace=True)) + + +class EfficientFaceTemporal(nn.Module): + + def __init__(self, stages_repeats, stages_out_channels, num_classes=8, im_per_sample=15): + super(EfficientFaceTemporal, self).__init__() + + if len(stages_repeats) != 3: + raise ValueError('expected stages_repeats as list of 3 positive ints') + if len(stages_out_channels) != 5: + raise ValueError('expected stages_out_channels as list of 5 positive ints') + self._stage_out_channels = stages_out_channels + + input_channels = 3 + output_channels = self._stage_out_channels[0] + self.conv1 = nn.Sequential(nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False), + nn.BatchNorm2d(output_channels), + nn.ReLU(inplace=True),) + input_channels = output_channels + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + stage_names = ['stage{}'.format(i) for i in [2, 3, 4]] + for name, repeats, output_channels in zip(stage_names, stages_repeats, self._stage_out_channels[1:]): + seq = [InvertedResidual(input_channels, output_channels, 2)] + for i in range(repeats - 1): + seq.append(InvertedResidual(output_channels, output_channels, 1)) + setattr(self, name, nn.Sequential(*seq)) + input_channels = output_channels + + self.local = LocalFeatureExtractor(29, 116, 1) + self.modulator = Modulator(116) + + output_channels = self._stage_out_channels[-1] + + self.conv5 = nn.Sequential(nn.Conv2d(input_channels, output_channels, 1, 1, 0, bias=False), + nn.BatchNorm2d(output_channels), + nn.ReLU(inplace=True),) + self.conv1d_0 = conv1d_block(output_channels, 64) + self.conv1d_1 = conv1d_block(64, 64) + + self.conv1d_2 = conv1d_block(64, 128) + self.conv1d_3 = conv1d_block(128, 128) + + self.classifier = nn.Sequential( + nn.Linear(128, num_classes), + ) + + self.im_per_sample = im_per_sample + + def forward_features(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.modulator(self.stage2(x)) + self.local(x) + x = self.stage3(x) + x = self.stage4(x) + x = self.conv5(x) + x = x.mean([2, 3]) + return x + + def forward_stage1(self, x): + assert x.shape[0] % self.im_per_sample == 0, "Batch size is not a multiple of sequence length." 
+ n_samples = x.shape[0] // self.im_per_sample + x = x.view(n_samples, self.im_per_sample, x.shape[1]) + x = x.permute(0, 2, 1) + x = self.conv1d_0(x) + x = self.conv1d_1(x) + return x + + def forward_stage2(self, x): + x = self.conv1d_2(x) + x = self.conv1d_3(x) + return x + + def forward_classifier(self, x): + x = x.mean([-1]) + x = self.classifier(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_stage1(x) + x = self.forward_stage2(x) + x = self.forward_classifier(x) + return x + + +def init_feature_extractor(model, path): + if path is None: + return + checkpoint = torch.load(path, map_location=torch.device('cpu')) + pre_trained_dict = checkpoint['state_dict'] + pre_trained_dict = {key.replace("module.", ""): value for key, value in pre_trained_dict.items()} + model.load_state_dict(pre_trained_dict, strict=False) + + +def conv1d_block_audio(in_channels, out_channels, kernel_size=3, stride=1, padding=0): + return nn.Sequential(nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=0), + nn.BatchNorm1d(out_channels), + nn.ReLU(inplace=True), nn.MaxPool1d(2, 1)) + + +class AudioCNNPool(nn.Module): + + def __init__(self, num_classes=8): + super(AudioCNNPool, self).__init__() + input_channels = 10 + self.conv1d_0 = conv1d_block_audio(input_channels, 64) + self.conv1d_1 = conv1d_block_audio(64, 128) + self.conv1d_2 = conv1d_block_audio(128, 256) + self.conv1d_3 = conv1d_block_audio(256, 128) + + self.classifier = nn.Sequential( + nn.Linear(128, num_classes), + ) + + def forward(self, x): + x = self.forward_stage1(x) + x = self.forward_stage2(x) + x = self.forward_classifier(x) + return x + + def forward_stage1(self, x): + x = self.conv1d_0(x) + x = self.conv1d_1(x) + return x + + def forward_stage2(self, x): + x = self.conv1d_2(x) + x = self.conv1d_3(x) + return x + + def forward_classifier(self, x): + x = x.mean([-1]) # pooling accross temporal dimension + x = self.classifier(x) + return x + + +class MultiModalCNN(nn.Module): + def __init__(self, num_classes=8, fusion='ia', seq_length=15, pretr_ef=None, num_heads=1): + super(MultiModalCNN, self).__init__() + + self.audio_model = AudioCNNPool(num_classes=num_classes) + self.visual_model = EfficientFaceTemporal([4, 8, 4], [29, 116, 232, 464, 1024], num_classes, seq_length) + self.fusion = fusion + + init_feature_extractor(self.visual_model, pretr_ef) + + e_dim = 128 + input_dim_video = 64 + input_dim_audio = 128 + if fusion == 'lt': + input_dim_video = 128 + self.av = AttentionBlock(in_dim_k=input_dim_video, in_dim_q=input_dim_audio, out_dim=e_dim, num_heads=num_heads) + self.va = AttentionBlock(in_dim_k=input_dim_audio, in_dim_q=input_dim_video, out_dim=e_dim, num_heads=num_heads) + elif fusion == 'it': + self.av = AttentionBlock(in_dim_k=input_dim_video, in_dim_q=input_dim_audio, + out_dim=input_dim_audio, num_heads=num_heads) + self.va = AttentionBlock(in_dim_k=input_dim_audio, in_dim_q=input_dim_video, + out_dim=input_dim_video, num_heads=num_heads) + elif fusion == 'ia': + self.av = Attention(in_dim_k=input_dim_video, in_dim_q=input_dim_audio, + out_dim=input_dim_audio, num_heads=num_heads) + self.va = Attention(in_dim_k=input_dim_audio, in_dim_q=input_dim_video, + out_dim=input_dim_video, num_heads=num_heads) + + self.classifier = nn.Sequential( + nn.Linear(e_dim*2, num_classes), + ) + + def forward(self, x_audio, x_visual): + + if self.fusion == 'lt': + return self.forward_late_transformer(x_audio, x_visual) + elif self.fusion == 'ia': + return 
self.forward_int_attention(x_audio, x_visual) + elif self.fusion == 'it': + return self.forward_int_transformer(x_audio, x_visual) + + def forward_int_transformer(self, x_audio, x_visual): + x_audio = self.audio_model.forward_stage1(x_audio) + x_visual = self.visual_model.forward_features(x_visual) + x_visual = self.visual_model.forward_stage1(x_visual) + + proj_x_a = x_audio.permute(0, 2, 1) + proj_x_v = x_visual.permute(0, 2, 1) + + h_av = self.av(proj_x_v, proj_x_a) + h_va = self.va(proj_x_a, proj_x_v) + + h_av = h_av.permute(0, 2, 1) + h_va = h_va.permute(0, 2, 1) + + x_audio = h_av+x_audio + x_visual = h_va + x_visual + + x_audio = self.audio_model.forward_stage2(x_audio) + x_visual = self.visual_model.forward_stage2(x_visual) + + audio_pooled = x_audio.mean([-1]) # mean accross temporal dimension + video_pooled = x_visual.mean([-1]) + + x = torch.cat((audio_pooled, video_pooled), dim=-1) + + x = self.classifier(x) + return x + + def forward_int_attention(self, x_audio, x_visual): + x_audio = self.audio_model.forward_stage1(x_audio) + x_visual = self.visual_model.forward_features(x_visual) + x_visual = self.visual_model.forward_stage1(x_visual) + + proj_x_a = x_audio.permute(0, 2, 1) + proj_x_v = x_visual.permute(0, 2, 1) + + _, h_av = self.av(proj_x_v, proj_x_a) + _, h_va = self.va(proj_x_a, proj_x_v) + + if h_av.size(1) > 1: # if more than 1 head, take mean + h_av = torch.mean(h_av, axis=1).unsqueeze(1) + + h_av = h_av.sum([-2]) + + if h_va.size(1) > 1: # if more than 1 head, take mean + h_va = torch.mean(h_va, axis=1).unsqueeze(1) + + h_va = h_va.sum([-2]) + + x_audio = h_va*x_audio + x_visual = h_av*x_visual + + x_audio = self.audio_model.forward_stage2(x_audio) + x_visual = self.visual_model.forward_stage2(x_visual) + + audio_pooled = x_audio.mean([-1]) # mean accross temporal dimension + video_pooled = x_visual.mean([-1]) + + x = torch.cat((audio_pooled, video_pooled), dim=-1) + + x = self.classifier(x) + return x + + def forward_late_transformer(self, x_audio, x_visual): + x_audio = self.audio_model.forward_stage1(x_audio) + proj_x_a = self.audio_model.forward_stage2(x_audio) + + x_visual = self.visual_model.forward_features(x_visual) + x_visual = self.visual_model.forward_stage1(x_visual) + proj_x_v = self.visual_model.forward_stage2(x_visual) + + proj_x_a = proj_x_a.permute(0, 2, 1) + proj_x_v = proj_x_v.permute(0, 2, 1) + h_av = self.av(proj_x_v, proj_x_a) + h_va = self.va(proj_x_a, proj_x_v) + + audio_pooled = h_av.mean([1]) # mean accross temporal dimension + video_pooled = h_va.mean([1]) + + x = torch.cat((audio_pooled, video_pooled), dim=-1) + + x = self.classifier(x) + return x diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/spatial_transforms.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/spatial_transforms.py new file mode 100644 index 0000000000..6a0ca881c7 --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/spatial_transforms.py @@ -0,0 +1,194 @@ +# MIT License +# +# Copyright (c) 2019, Okan Köpüklü +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the 
following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +""" +This code is modified from https://github.com/okankop/Efficient-3DCNNs/ +""" + +import random +import numbers +import numpy as np +import torch +from PIL import Image +try: + import accimage +except ImportError: + accimage = None + + +class Compose(object): + """Composes several transforms together. + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.ToTensor(), + >>> ]) + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img): + for t in self.transforms: + + img = t(img) + return img + + def randomize_parameters(self): + for t in self.transforms: + t.randomize_parameters() + + +class ToTensor(object): + """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. + Converts a PIL.Image or numpy.ndarray (H x W x C) in the range + [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. + """ + + def __init__(self, norm_value=255): + self.norm_value = norm_value + + def __call__(self, pic): + """ + Args: + pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. + Returns: + Tensor: Converted image. + """ + if isinstance(pic, np.ndarray): + # handle numpy array + img = torch.from_numpy(pic.transpose((2, 0, 1))) + # backward compatibility + return img.float().div(self.norm_value) + if accimage is not None and isinstance(pic, accimage.Image): + nppic = np.zeros( + [pic.channels, pic.height, pic.width], dtype=np.float32) + pic.copyto(nppic) + return torch.from_numpy(nppic) + + # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + img = img.transpose(0, 1).transpose(0, 2).contiguous() + + if isinstance(img, torch.ByteTensor): + return img.float().div(self.norm_value) + else: + return img + + def randomize_parameters(self): + pass + + +class CenterCrop(object): + """Crops the given PIL.Image at the center. + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + """ + + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + def __call__(self, img): + """ + Args: + img (PIL.Image): Image to be cropped. + Returns: + PIL.Image: Cropped image. 
+ """ + w, h = img.size + th, tw = self.size + x1 = int(round((w - tw) / 2.)) + y1 = int(round((h - th) / 2.)) + return img.crop((x1, y1, x1 + tw, y1 + th)) + + def randomize_parameters(self): + pass + + +class RandomHorizontalFlip(object): + """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" + + def __call__(self, img): + """ + Args: + img (PIL.Image): Image to be flipped. + Returns: + PIL.Image: Randomly flipped image. + """ + if self.p < 0.5: + return img.transpose(Image.FLIP_LEFT_RIGHT) + return img + + def randomize_parameters(self): + self.p = random.random() + + +class RandomRotate(object): + + def __init__(self): + self.interpolation = Image.BILINEAR + + def __call__(self, img): + ret_img = img.rotate(self.rotate_angle, resample=self.interpolation) + + return ret_img + + def randomize_parameters(self): + self.rotate_angle = random.randint(-10, 10) + + +class RandomResize(object): + + def __init__(self): + self.interpolation = Image.BILINEAR + + def __call__(self, img): + im_size = img.size + ret_img = img.resize((int(im_size[0]*self.resize_const), + int(im_size[1]*self.resize_const))) + + return ret_img + + def randomize_parameters(self): + self.resize_const = random.uniform(0.9, 1.1) diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/trainer.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/trainer.py new file mode 100644 index 0000000000..be5e6f0659 --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/trainer.py @@ -0,0 +1,238 @@ +# Copyright 2020-2022 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
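+
+# Illustrative note (a sketch only, not executed on import): this module implements
+# the SGD training/validation loop that `AudiovisualEmotionLearner.fit()` calls into.
+# Assuming the caller has already built the model and the DataLoaders (as avlearner.py
+# does), a typical invocation looks roughly like:
+#
+#     metrics = train(model, train_loader, val_loader,
+#                     learning_rate=0.04, momentum=0.9, dampening=0.9,
+#                     weight_decay=1e-3, n_epochs=100, save_dir='logs/',
+#                     lr_steps=[40, 55, 65, 70, 200, 250],
+#                     mod_drop='zerodrop', device='cpu')
+#
+# metrics['train_loss'] and metrics['train_acc'] (plus the 'val_*' keys when a
+# validation loader is given) then hold one value per epoch.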
+ +from opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm.utils import ( + adjust_learning_rate, + Logger, + save_checkpoint, + calculate_accuracy, + AverageMeter + ) +import os +import torch.nn as nn +from torch import optim +import torch + + +def update_tensorboard(tensorboard_logger, logger_prefix, name, value, epoch_idx): + tensorboard_logger.add_scalar(tag='{}/{}'.format(logger_prefix, name), + scalar_value=value, + global_step=epoch_idx + 1) + tensorboard_logger.flush() + + +def train(model, train_loader, val_loader, learning_rate, momentum, dampening, weight_decay, n_epochs, save_dir, lr_steps, + mod_drop='zerodrop', device='cpu', silent=False, verbose=True, + tensorboard_logger=None, eval_mode='audiovisual', restore_best=False): + + metrics = {'train_loss': [], 'train_acc': []} + train_logger = Logger(os.path.join(save_dir, 'train.log'), + ['epoch', 'loss', 'acc']) + + if val_loader is not None: + metrics.update({'val_loss': [], 'val_acc': []}) + val_logger = Logger(os.path.join(save_dir, 'val.log'), ['epoch', 'loss', 'acc']) + + criterion = nn.CrossEntropyLoss() + + optimizer = optim.SGD( + model.parameters(), + lr=learning_rate, + momentum=momentum, + dampening=dampening, + weight_decay=weight_decay) + + best_acc = -1 + is_best = False + for i in range(n_epochs): + adjust_learning_rate(optimizer, i, learning_rate, lr_steps) + + train_loss, train_acc = train_one_epoch(i, train_loader, model, criterion, optimizer, + train_logger, mod_drop, device, silent, verbose) + metrics['train_loss'].append(train_loss) + metrics['train_acc'].append(train_acc) + if tensorboard_logger is not None: + update_tensorboard(tensorboard_logger, 'train', 'loss', train_loss, i) + update_tensorboard(tensorboard_logger, 'train', 'acc', train_acc, i) + + if val_loader is not None: + validation_loss, validation_acc = val_one_epoch( + i, val_loader, model, criterion, val_logger, eval_mode, device, silent, verbose) + metrics['val_loss'].append(validation_loss) + metrics['val_acc'].append(validation_acc) + if tensorboard_logger is not None: + update_tensorboard(tensorboard_logger, 'val', 'loss', validation_loss, i) + update_tensorboard(tensorboard_logger, 'val', 'acc', validation_acc, i) + + is_best = validation_acc > best_acc + if is_best and not silent and verbose: + print('Validation accuracy improved from {} to {}.'.format(best_acc, validation_acc)) + + best_acc = max(validation_acc, best_acc) + else: + is_best = train_acc > best_acc + best_acc = max(train_acc, best_acc) + + state = { + 'epoch': i, + 'state_dict': model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'best_acc': best_acc + } + save_checkpoint(state, is_best, save_dir, 'model') + if restore_best: + print('restoring best') + if os.path.exists((os.path.join(save_dir, 'model_best.pth'))): + print('restoring best') + checkpoint = torch.load(os.path.join(save_dir, 'model_best.pth')) + model.load_state_dict(checkpoint['state_dict']) + return metrics + + +def apply_mask(audio_inputs, visual_inputs, targets, mod_drop): + if mod_drop == 'zerodrop': + coefficients = torch.randint(low=0, high=100, size=(audio_inputs.size(0), 1, 1))/100 + vision_coefficients = 1 - coefficients + coefficients = coefficients.repeat(1, audio_inputs.size(1), audio_inputs.size(2)) + vision_coefficients = vision_coefficients.unsqueeze(-1).unsqueeze(-1).repeat( + 1, visual_inputs.size(1), visual_inputs.size(2), visual_inputs.size(3), visual_inputs.size(4)) + + audio_inputs = torch.cat((audio_inputs, audio_inputs*coefficients, + 
torch.zeros(audio_inputs.size()), audio_inputs), dim=0) + visual_inputs = torch.cat((visual_inputs, visual_inputs*vision_coefficients, + visual_inputs, torch.zeros(visual_inputs.size())), dim=0) + + targets = torch.cat((targets, targets, targets, targets), dim=0) + shuffle = torch.randperm(audio_inputs.size()[0]) + audio_inputs = audio_inputs[shuffle] + visual_inputs = visual_inputs[shuffle] + targets = targets[shuffle] + elif mod_drop == 'noisedrop': + audio_inputs = torch.cat((audio_inputs, torch.randn(audio_inputs.size()), audio_inputs), dim=0) + + visual_inputs = torch.cat((visual_inputs, visual_inputs, torch.randn(visual_inputs.size())), dim=0) + + targets = torch.cat((targets, targets, targets), dim=0) + + shuffle = torch.randperm(audio_inputs.size()[0]) + audio_inputs = audio_inputs[shuffle] + visual_inputs = visual_inputs[shuffle] + targets = targets[shuffle] + + return audio_inputs, visual_inputs, targets + + +def train_one_epoch(epoch, data_loader, model, criterion, optimizer, e_logger, + mod_drop='zerodrop', device='cpu', silent=False, verbose=True): + model.train() + + loss = AverageMeter() + acc = AverageMeter() + + for i, (audio_inputs, visual_inputs, targets) in enumerate(data_loader): + targets = targets.to(device) + with torch.no_grad(): + audio_inputs, visual_inputs, targets = apply_mask(audio_inputs, visual_inputs, targets, mod_drop) + + audio_inputs = audio_inputs.to(device) + visual_inputs = visual_inputs.to(device) + + visual_inputs = visual_inputs.permute(0, 2, 1, 3, 4) + visual_inputs = visual_inputs.reshape( + visual_inputs.shape[0]*visual_inputs.shape[1], visual_inputs.shape[2], + visual_inputs.shape[3], visual_inputs.shape[4]) + + outputs = model(audio_inputs, visual_inputs) + loss_b = criterion(outputs, targets) + + loss.update(loss_b.data, audio_inputs.size(0)) + + acc_1, _ = calculate_accuracy(outputs.data, targets.data, topk=(1, 5)) + + acc.update(acc_1, audio_inputs.size(0)) + + optimizer.zero_grad() + loss_b.backward() + optimizer.step() + + if not silent and (verbose or (not verbose and epoch % 10 == 0)): + print('Epoch: [{0}][{1}/{2}]\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Acc {acc.val:.5f} ({acc.avg:.5f})\t'.format( + epoch, + i, + len(data_loader), + loss=loss, + acc=acc, + )) + + e_logger.log({ + 'epoch': epoch, + 'loss': loss.avg.item(), + 'acc': acc.avg.item() + }) + return loss.avg.item(), acc.avg.item() + + +def val_one_epoch(epoch, data_loader, model, criterion, logger=None, + mode='audiovisual', device='cpu', silent=False, verbose=True): + model.eval() + + loss = AverageMeter() + acc = AverageMeter() + + with torch.no_grad(): + for i, (inputs_audio, inputs_visual, targets) in enumerate(data_loader): + + if mode == 'onlyaudio': + inputs_visual = torch.zeros(inputs_visual.size()) + elif mode == 'noisyvideo': + inputs_visual = torch.randn(inputs_visual.size()) + elif mode == 'onlyvideo': + inputs_audio = torch.zeros(inputs_audio.size()) + elif mode == 'noisyaudio': + inputs_audio = torch.randn(inputs_audio.size()) + + inputs_visual = inputs_visual.permute(0, 2, 1, 3, 4) + inputs_visual = inputs_visual.reshape( + inputs_visual.shape[0]*inputs_visual.shape[1], inputs_visual.shape[2], + inputs_visual.shape[3], inputs_visual.shape[4]) + + targets = targets.to(device) + inputs_audio = inputs_audio.to(device) + inputs_visual = inputs_visual.to(device) + + outputs = model(inputs_audio, inputs_visual) + loss_b = criterion(outputs, targets) + + acc1, _ = calculate_accuracy(outputs.data, targets.data, topk=(1, 5)) + acc.update(acc1, 
inputs_audio.size(0)) + + loss.update(loss_b.data, inputs_audio.size(0)) + if not silent and (verbose or (not verbose and epoch % 10 == 0)): + print('Epoch: [{0}][{1}/{2}]\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Acc {acc.val:.5f} ({acc.avg:.5f})\t'.format( + epoch, + i + 1, + len(data_loader), + loss=loss, + acc=acc)) + + if logger is not None: + logger.log({'epoch': epoch, + 'loss': loss.avg.item(), + 'acc': acc.avg.item()}) + + return loss.avg.item(), acc.avg.item() diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/transformer_timm.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/transformer_timm.py new file mode 100644 index 0000000000..131b94d591 --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/transformer_timm.py @@ -0,0 +1,141 @@ +# Copyright 2019 Ross Wightman +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is modified from https://github.com/rwightman/pytorch-image-models +""" + +from torch import nn +import torch + + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer, MLP-Mixer and related networks + """ + + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., use_conv1=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.use_conv1 = use_conv1 + if use_conv1: + self.fc1 = nn.Conv1d(in_features, hidden_features, kernel_size=3, stride=1, padding='same') + else: + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + if use_conv1: + self.fc2 = nn.Conv1d(hidden_features, out_features, kernel_size=3, stride=1, padding='same') + else: + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + if self.use_conv1: + x = x.transpose(1, 2) + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + if self.use_conv1: + x = x.transpose(1, 2) + return x + + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Attention(nn.Module): + def __init__(self, in_dim_k, in_dim_q, out_dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = out_dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.q = nn.Linear(in_dim_q, out_dim, bias=qkv_bias) + self.kv = nn.Linear(in_dim_k, out_dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(out_dim, out_dim) + self.proj_drop = nn.Dropout(proj_drop) + self.qkmatrix = None + + def forward(self, x, x_q): + B, Nk, Ck = x.shape + B, Nq, Cq = x_q.shape + q = self.q(x_q).reshape(B, Nq, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4) + kv = self.kv(x).reshape(B, Nk, 2, self.num_heads, -1).permute(2, 0, 3, 1, 4) + k, v = kv[0], kv[1] # make torchscript happy (cannot use tensor as tuple) + q = q.squeeze(0) + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + self.qkmatrix = attn + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, Nq, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x, self.qkmatrix + + +class AttentionBlock(nn.Module): + + def __init__(self, in_dim_k, in_dim_q, out_dim, num_heads, mlp_ratio=2., + qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_conv1=False): + super().__init__() + self.norm1_q = norm_layer(in_dim_q) + self.norm1_k = norm_layer(in_dim_k) + self.attn = Attention(in_dim_k=in_dim_k, in_dim_q=in_dim_q, + out_dim=out_dim, num_heads=num_heads, qkv_bias=qkv_bias, + qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(out_dim) + mlp_hidden_dim = int(out_dim * mlp_ratio) + self.mlp = Mlp(in_features=out_dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, use_conv1=use_conv1) + + def forward(self, xk, xq): + + x, a = self.attn(self.norm1_k(xk), self.norm1_q(xq)) + x = self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/utils.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/utils.py new file mode 100644 index 0000000000..e4b13321cc --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/utils.py @@ -0,0 +1,111 @@ +# MIT License +# +# Copyright (c) 2019, Okan Köpüklü +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
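+
+# Illustrative note (a sketch only): the helpers below are consumed by trainer.py.
+# A running top-1 accuracy, for example, is typically tracked as follows, where
+# `outputs` and `targets` are placeholder tensors for a batch of predictions/labels:
+#
+#     acc_meter = AverageMeter()
+#     top1, = calculate_accuracy(outputs, targets, topk=(1,))
+#     acc_meter.update(top1, targets.size(0))  # weight the update by batch size
+#     print(acc_meter.avg)                     # running average accuracy, in percent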
+
+'''
+This code is modified from https://github.com/okankop/Efficient-3DCNNs/
+'''
+
+import csv
+import torch
+import shutil
+import numpy as np
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+class Logger(object):
+
+    def __init__(self, path, header):
+        self.log_file = open(path, 'w')
+        self.logger = csv.writer(self.log_file, delimiter='\t')
+
+        self.logger.writerow(header)
+        self.header = header
+
+    def __del__(self):
+        self.log_file.close()
+
+    def log(self, values):
+        write_values = []
+        for col in self.header:
+            assert col in values
+            write_values.append(values[col])
+
+        self.logger.writerow(write_values)
+        self.log_file.flush()
+
+
+def load_value_file(file_path):
+    with open(file_path, 'r') as input_file:
+        value = float(input_file.read().rstrip('\n\r'))
+
+    return value
+
+
+def calculate_accuracy(output, target, topk=(1,), binary=False):
+    """Computes the precision@k for the specified values of k"""
+
+    maxk = max(topk)
+    if maxk > output.size(1):
+        maxk = output.size(1)
+    batch_size = target.size(0)
+
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+    res = []
+    for k in topk:
+        if k > maxk:
+            k = maxk
+        correct_k = correct[:k].reshape(-1).float().sum(0)
+        res.append(correct_k.mul_(100.0 / batch_size))
+    return res
+
+
+def save_checkpoint(state, is_best, save_path, save_name):
+    torch.save(state, '%s/%s_last' % (save_path, save_name)+'.pth')
+    if is_best:
+        shutil.copyfile('%s/%s_last' % (save_path, save_name)+'.pth', '%s/%s_best' % (save_path, save_name)+'.pth')
+
+
+def adjust_learning_rate(optimizer, epoch, learning_rate, lr_steps):
+    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+    lr_new = learning_rate * (0.1 ** (sum(epoch >= np.array(lr_steps))))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr_new
diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/avlearner.py b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/avlearner.py
new file mode 100644
index 0000000000..9fc2b45b1f
--- /dev/null
+++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/avlearner.py
@@ -0,0 +1,461 @@
+# Copyright 2020-2022 OpenDR European Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
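+
+# End-to-end usage sketch (illustrative only and kept as a comment so that importing
+# this module has no side effects). The file names below are placeholders; pretrained
+# weights are currently only served for the 'ia' fusion combined with 'zerodrop'
+# modality dropout:
+#
+#     learner = AudiovisualEmotionLearner(num_class=8, fusion='ia',
+#                                         mod_drop='zerodrop', device='cpu')
+#     learner.download('./pretrained')
+#     learner.load('./pretrained')
+#     audio, video = learner.load_inference_data('speech.wav', 'video.mp4')
+#     prediction = learner.infer(audio, video)
+#     print(learner.pred_to_label(prediction), prediction.confidence)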
+
+# general imports
+import torch
+from torch.utils.data import DataLoader
+import os
+import json
+from torch.utils.tensorboard import SummaryWriter
+from urllib.request import urlretrieve
+import librosa
+
+# OpenDR engine imports
+from opendr.engine.learners import Learner
+from opendr.engine.data import Video, Timeseries
+from opendr.engine.datasets import DatasetIterator
+from opendr.engine.target import Category
+from opendr.engine.constants import OPENDR_SERVER_URL
+
+# OpenDR imports
+from opendr.perception.multimodal_human_centric.audiovisual_emotion_learner.algorithm import (data,
+                                                                                               models,
+                                                                                               trainer,
+                                                                                               spatial_transforms as transforms,
+                                                                                               data_utils
+                                                                                               )
+
+# constants
+PRETRAINED_MODEL = ['ia_zerodrop']
+
+__all__ = []
+
+
+class AudiovisualEmotionLearner(Learner):
+    def __init__(self,
+                 num_class=8,
+                 seq_length=15,
+                 fusion='ia',
+                 mod_drop='zerodrop',
+                 pretr_ef=None,
+                 lr=0.04,
+                 lr_steps=[40, 55, 65, 70, 200, 250],
+                 momentum=0.9,
+                 dampening=0.9,
+                 weight_decay=1e-3,
+                 iters=100,
+                 batch_size=8,
+                 n_workers=4,
+                 device='cpu',
+                 ):
+        super(AudiovisualEmotionLearner,
+              self).__init__(batch_size=batch_size,
+                             device=device)
+        assert fusion in ['ia', 'it', 'lt'], 'Unknown modality fusion type'
+        assert mod_drop in ['nodrop', 'noisedrop', 'zerodrop'], 'Unknown modality dropout type'
+
+        self.model = models.MultiModalCNN(num_classes=num_class, fusion=fusion, seq_length=seq_length, pretr_ef=pretr_ef)
+
+        self.num_class = num_class
+        self.fusion = fusion
+
+        self.lr = lr
+        self.lr_steps = lr_steps
+
+        self.momentum = momentum
+        self.dampening = dampening
+        self.weight_decay = weight_decay
+
+        self.n_iters = iters
+        self.batch_size = batch_size
+        self.n_workers = n_workers
+        self.mod_drop = mod_drop
+
+        self.seq_length = seq_length
+
+    def _validate_x1(self, x):
+        if not isinstance(x, Timeseries):
+            msg = 'The 1st element returned by __getitem__ must be an instance of `engine.data.Timeseries` class\n' +\
+                  'Received an instance of type: {}'.format(type(x))
+            raise TypeError(msg)
+
+    def _validate_x2(self, x):
+        if not isinstance(x, Video):
+            msg = 'The 2nd element returned by __getitem__ must be an instance of `engine.data.Video` class\n' +\
+                  'Received an instance of type: {}'.format(type(x))
+            raise TypeError(msg)
+
+        if x.data.shape[0] != 3:
+            msg = 'The first dimension of data produced by dataset must be 3\n' +\
+                  'Received input of shape: {}'.format(x.data.shape)
+            raise ValueError(msg)
+
+        if x.data.shape[1] != self.seq_length:
+            msg = 'The temporal dimension of data does not match specified sequence length of the model\n' +\
+                  'Received input with dimension: {} and sequence length is: {}.'.format(x.data.shape[1], self.seq_length)
+            raise ValueError(msg)
+
+    def _validate_y(self, y):
+        if not isinstance(y, Category):
+            msg = 'The 3rd element returned by __getitem__ must be an instance of `engine.target.Category` class\n' +\
+                  'Received an instance of type: {}'.format(type(y))
+            raise TypeError(msg)
+
+    def _validate_dataset(self, dataset):
+        """
+        This internal function is used to perform basic validation of the data dimensions
+        """
+        if dataset is None:
+            return
+
+        if not isinstance(dataset, data.RavdessDataset):
+            if not isinstance(dataset, DatasetIterator):
+                msg = 'Dataset must be an instance of engine.datasets.DatasetIterator class\n' +\
+                      'Received an instance of type: {}'.format(type(dataset))
+                raise TypeError(msg)
+            else:
+                x1, x2, y = dataset.__getitem__(0)
+                self._validate_x1(x1)
+                self._validate_x2(x2)
+                self._validate_y(y)
+
+    def fit(self, dataset,
val_dataset=None, logging_path='logs/', silent=False, verbose=True, + eval_mode='audiovisual', restore_best=False): + """ + Method to train the audiovisual emotion recognition model + + :param dataset: training dataset object + :type dataset: engine.datasets.DatasetIterator + :param val_dataset: validation samples, default to None + + if available, `val_set` is used to select + the best checkpoint for final model + :type val_dataset: engine.datasets.DatasetIterator + + :param logging_path: path to save checkpoints + and log data, default to "logs/" + :type logging_path: string + :param silent: disable performance printing, default to False + :type silent: bool + :param verbose: enable the performance printing, default to True + :type verbose: bool + + :return: the best accuracy on validation set + :rtype: float + """ + self._validate_dataset(dataset) + self._validate_dataset(val_dataset) + assert eval_mode in ['audiovisual', 'noisyaudio', 'noisyvideo', 'onlyaudio', 'onlyvideo'] + + if isinstance(dataset, data.RavdessDataset): + train_loader = DataLoader(dataset, + batch_size=self.batch_size, + pin_memory=self.device == 'cuda', + num_workers=self.n_workers, + shuffle=True) + else: + train_loader = DataLoader(data.DataWrapper(dataset), + batch_size=self.batch_size, + pin_memory=self.device == 'cuda', + num_workers=self.n_workers, + shuffle=True) + + if val_dataset is None: + val_loader = None + elif isinstance(val_dataset, data.RavdessDataset): + val_loader = DataLoader(val_dataset, + batch_size=self.batch_size, + num_workers=self.n_workers, + pin_memory=self.device == 'cuda', + shuffle=False) + else: + val_loader = DataLoader(data.DataWrapper(val_dataset), + batch_size=self.batch_size, + num_workers=self.n_workers, + pin_memory=self.device == 'cuda', + shuffle=False) + + if not os.path.exists(logging_path): + os.makedirs(logging_path) + tensorboard_logger = SummaryWriter(logging_path) + self.model = self.model.to(self.device) + metrics = trainer.train(self.model, train_loader, val_loader, self.lr, self.momentum, self.dampening, + self.weight_decay, self.n_iters, logging_path, self.lr_steps, + self.mod_drop, self.device, silent, verbose, + tensorboard_logger, eval_mode, restore_best) + + if tensorboard_logger is not None: + tensorboard_logger.close() + return metrics + + def eval(self, dataset, silent=False, verbose=True, mode='audiovisual'): + """ + This method is used to evaluate the performance of a given set of data + + :param dataset: object that holds the set of samples to evaluate + :type dataset: engine.datasets.DatasetIterator + :param mode: testing mode + :type mode: string + :return: a dictionary that contains `cross_entropy` and `acc` as keys + :rtype: dict + """ + self._validate_dataset(dataset) + if isinstance(dataset, data.RavdessDataset): + loader = DataLoader(dataset, + batch_size=self.batch_size, + num_workers=self.n_workers, + pin_memory=self.device == 'cuda', + shuffle=False) + else: + loader = DataLoader(data.DataWrapper(dataset), + batch_size=self.batch_size, + num_workers=self.n_workers, + pin_memory=self.device == 'cuda', + shuffle=False) + + self.model = self.model.to(self.device) + self.model.eval() + + L = torch.nn.CrossEntropyLoss() + loss, acc = trainer.val_one_epoch(-1, loader, self.model, L, mode=mode, device=self.device, + silent=silent, verbose=verbose) + if not silent and verbose: + print('Loss: {}, Accuracy: {}'.format(loss, acc)) + + return {'cross_entropy': loss, 'acc': acc} + + def _process_video(self, video_path, target_time=3.6, input_fps=30, 
save_frames=15, target_im_size=224): + """ + This function preprocesses input video file for inference + Parameters + ---------- + video_path : str + path to video file. + target_time : float, optional + Target time of processed video file in seconds. The default is 3.6. + input_fps : int, optional + Frames Per Second of input video file. The default is 30. + save_frames : int, optional + Length of target frame sequence. The default is 15. + target_im_size : int, optional + Target width and height of each frame. The default is 224. + + Returns + ------- + numpy_video: numpy.array + N frames as numpy array + + """ + numpy_video = data_utils.preprocess_video(video_path, target_time, input_fps, save_frames, + target_im_size, device=self.device) + video_scale = 255 + video_transform = transforms.Compose([ + transforms.ToTensor(video_scale)]) + video = [video_transform(img) for img in numpy_video] + video = torch.stack(video, 0).permute(1, 0, 2, 3) + + return video + + def _process_audio(self, audio_path, target_time=3.6, sr=22050): + """ + This function preprocesses an audio file for inference + + Parameters + ---------- + audio_path : str + Path to audio file. + target_time : int, optional + Target duration of audio. The default is 3.6. + sr : int, optional + Sampling rate of audio. The default is 22050. + + Returns + ------- + y : numpy array + audio file saved as numpy array. + """ + y = data_utils.preprocess_audio(audio_path, sr, target_time) + mfcc = librosa.feature.mfcc(y, sr=sr, n_mfcc=10) + return mfcc + + def load_inference_data(self, audio_path, video_path, target_time=3.6, sr=22050, input_fps=30, save_frames=15, + target_im_size=224): + video = Video(self._process_video(video_path, target_time, input_fps, save_frames, target_im_size)) + audio = Timeseries(self._process_audio(audio_path, target_time, sr)) + return audio, video + + def infer(self, audio, video): + """ + This method is used to generate prediction given Audio and Visual data + + :param video: video of a fronal view of a face + :type video: engine.data.Video + :param audio: audio features to generate class prediction + :type audio: engine.data.Timeseries + :return: predicted label + :rtype: engine.target.Category + + """ + self._validate_x1(audio) + self._validate_x2(video) + self.model.to(self.device) + self.model.eval() + + video = torch.tensor([video.data]).permute(0, 2, 1, 3, 4) + video = video.reshape(video.shape[0]*video.shape[1], video.shape[2], + video.shape[3], video.shape[4]).to(self.device) + + audio = torch.tensor(audio.data).unsqueeze(0).to(self.device) + with torch.no_grad(): + prob_prediction = torch.nn.functional.softmax(self.model(audio, video).flatten(), dim=0) + class_prediction = prob_prediction.argmax(dim=-1).cpu().item() + + prediction = Category(class_prediction, confidence=prob_prediction[class_prediction].cpu().item()) + + return prediction + + def pred_to_label(self, prediction): + """ + This function converts the numeric class value to huamn-interpretable emotion label for RAVDESS dataset + """ + assert self.num_class == 8, 'Unknown emotion class vocabulary for given number of classes' + + NUM_2_CLASS = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'} + return NUM_2_CLASS[prediction.data] + + def save(self, path, verbose=True): + """ + This function is used to save the current model given a directory path. 
+ Metadata and model weights are saved under `path/metadata.json` + and path/model_weights.pt` + :param path: path to the directory that the model will be saved + :type path: str + :param verbose: default to True + :type verbose: bool + + """ + if not os.path.exists(path): + os.makedirs(path) + + model_weight_file = os.path.join(path, 'model_weights.pt') + metadata_file = os.path.join(path, 'metadata.json') + + metadata = {'framework': 'pytorch', + 'model_paths': ['model_weights.pt'], + 'format': 'pt', + 'has_data': False, + 'inference_params': {}, + 'optimized': False, + 'optimimizer_info': {} + } + + try: + torch.save(self.model.cpu().state_dict(), model_weight_file) + if verbose: + print('Model weights saved to {}'.format(model_weight_file)) + except Exception as error: + raise error + + try: + fid = open(metadata_file, 'w') + json.dump(metadata, fid) + fid.close() + + if verbose: + print('Model metadata saved to {}'.format(metadata_file)) + + except Exception as error: + raise error + + return True + + def load(self, path, verbose=True): + """ + This function is used to load a pretrained model that + has been saved with .save(), given the path to the directory. + `path/metadata.json` and `path/model_weights.pt` should exist + :param path: path to the saved location + :type path: str + :param verbose: defaul to True + :type verbose: bool + + """ + + if not os.path.exists(path): + raise FileNotFoundError('Directory "{}" does not exist'.format(path)) + + if not os.path.isdir(path): + raise ValueError('Given path "{}" is not a directory'.format(path)) + + metadata_file = os.path.join(path, 'metadata.json') + assert os.path.exists(metadata_file),\ + 'Metadata file ("metadata.json")' +\ + 'does not exist under the given path "{}"'.format(path) + + fid = open(metadata_file, 'r') + metadata = json.load(fid) + fid.close() + + model_weight_file = os.path.join(path, metadata['model_paths'][0]) + assert os.path.exists(model_weight_file),\ + 'Model weights "{}" does not exist'.format(model_weight_file) + + self.model.cpu() + self.model.load_state_dict(torch.load(model_weight_file, + map_location=torch.device('cpu'))) + + if verbose: + print('Pretrained model is loaded successfully') + + def download(self, path): + """ + This function is used to download a pretrained model for the audiovisual emotion recognition task + Calling load(path) after this function will load the downloaded model weights + + :param path: path to the saved location. 
Under this path `model_weights.pt` and `metadata.json` + will be downloaded so different paths for different models should be given to avoid + overwriting previously downloaded model + :type path: str + """ + print('Downloading pre-trained model trained on RAVDESS dataset under CC BY-NC-SA 4.0 license') + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + + if self.fusion + '_' + self.mod_drop in PRETRAINED_MODEL: + assert self.num_class == 8,\ + 'For pretrained audiovisual emotionrecognition model, `num_cluss` must be 8' + + server_url = os.path.join(OPENDR_SERVER_URL, + 'perception', + 'multimodal_human_centric', + 'audiovisual_emotion_learner') + + model_name = '{}_{}_{}'.format('av_emotion', self.fusion, self.mod_drop) + + metadata_url = os.path.join(server_url, '{}.json'.format(model_name)) + metadata_file = os.path.join(path, 'metadata.json') + urlretrieve(metadata_url, metadata_file) + + weights_url = os.path.join(server_url, '{}.pt'.format(model_name)) + weights_file = os.path.join(path, 'model_weights.pt') + urlretrieve(weights_url, weights_file) + print('Pretrained model downloaded to the following directory\n{}'.format(path)) + else: + raise UserWarning('No pretrained model for fusion "{}" and modality drop "{}"'.format(self.fusion, self.mod_drop)) + + def optimize(self): + raise NotImplementedError + + def reset(self): + raise NotImplementedError diff --git a/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/dependencies.ini b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/dependencies.ini new file mode 100644 index 0000000000..2c7af07831 --- /dev/null +++ b/src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/dependencies.ini @@ -0,0 +1,8 @@ +[runtime] +python=torch==1.9.0 + torchvision==0.10.0 + librosa==0.8.0 + opencv-python + tqdm + tensorboard==2.4.1 + imageio==2.6.0 diff --git a/tests/sources/tools/perception/multimodal_human_centric/audiovisual_emotion_recognition/__init__.py b/tests/sources/tools/perception/multimodal_human_centric/audiovisual_emotion_recognition/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/sources/tools/perception/multimodal_human_centric/audiovisual_emotion_recognition/test_audiovisual_emotion_learner.py b/tests/sources/tools/perception/multimodal_human_centric/audiovisual_emotion_recognition/test_audiovisual_emotion_learner.py new file mode 100644 index 0000000000..8bf2b1fedf --- /dev/null +++ b/tests/sources/tools/perception/multimodal_human_centric/audiovisual_emotion_recognition/test_audiovisual_emotion_learner.py @@ -0,0 +1,130 @@ +# Copyright 2020-2022 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
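+
+# To run only this test module locally (a sketch; the exact working directory and
+# CI wiring may differ), the computation device can be selected through the
+# TEST_DEVICE environment variable read below:
+#
+#     TEST_DEVICE=cpu python tests/sources/tools/perception/multimodal_human_centric/audiovisual_emotion_recognition/test_audiovisual_emotion_learner.py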
+ +import unittest +import torch +import tempfile +import numpy as np +import random +import os + +# OpenDR imports +from opendr.engine.data import Video, Timeseries +from opendr.engine.target import Category +from opendr.perception.multimodal_human_centric import AudiovisualEmotionLearner +from opendr.engine.datasets import DatasetIterator + + +DEVICE = os.getenv('TEST_DEVICE') if os.getenv('TEST_DEVICE') else 'cpu' + + +class DummyDataset(DatasetIterator): + def __init__(self, n_class=8, n_sample=4): + super(DummyDataset, self).__init__() + self.n_sample = n_sample + self.n_class = n_class + + def __len__(self,): + return self.n_sample + + def __getitem__(self, i): + xa = np.float32(np.random.rand(10, 156)) + xv = np.float32(np.random.rand(3, 15, 224, 224)) + y = np.random.randint(low=0, high=self.n_class) + return Timeseries(xa), Video(xv), Category(y) + + +def get_random_learner(): + n_class = np.random.randint(low=2, high=10) + + fusion = random.choice(['it', 'ia', 'lt']) + mod_drop = random.choice(['nodrop', 'noisedrop', 'zerodrop']) + + learner = AudiovisualEmotionLearner(num_class=n_class, + iters=1, + batch_size=2, + fusion=fusion, + mod_drop=mod_drop, device=DEVICE) + return learner + + +class TestAudiovisualEmotionLearner(unittest.TestCase): + @classmethod + def setUpClass(cls): + print("\n\n**********************************\nTEST AudiovisualEmotionLearner\n" + "**********************************") + pass + + @classmethod + def tearDownClass(cls): + return + + def test_fit(self): + learner = get_random_learner() + train_set = DummyDataset(learner.num_class) + val_set = DummyDataset(learner.num_class) + + old_weight = list(learner.model.parameters())[0].clone() + learner.fit(train_set, val_set, silent=True, verbose=False) + new_weight = list(learner.model.parameters())[0].clone() + + self.assertFalse(torch.equal(old_weight, new_weight), + msg="Model parameters did not change after running fit.") + + def test_eval(self): + learner = get_random_learner() + dataset = DummyDataset(learner.num_class) + performance = learner.eval(dataset, silent=True, verbose=False) + + self.assertTrue('cross_entropy' in performance.keys()) + self.assertTrue('acc' in performance.keys()) + + def test_infer(self): + temp_dir = tempfile.TemporaryDirectory() + + xa = Timeseries(np.float32(np.random.rand(10, 156))) + xv = Video(np.float32(np.random.rand(3, 15, 224, 224))) + + # create learner and download pretrained model + learner = AudiovisualEmotionLearner(num_class=8) + + # make inference + pred = learner.infer(xa, xv) + self.assertTrue(isinstance(pred, Category)) + + self.assertTrue(pred.confidence <= 1, + msg="Confidence of prediction must be less or equal than 1") + temp_dir.cleanup() + + def test_save_load(self): + temp_dir = tempfile.TemporaryDirectory() + learner = get_random_learner() + + learner.save(temp_dir.name, verbose=False) + + new_learner = AudiovisualEmotionLearner( + num_class=learner.num_class, fusion=learner.fusion, mod_drop=learner.mod_drop) + + new_learner.load(temp_dir.name, verbose=False) + xa = Timeseries(np.float32(np.random.rand(10, 156))) + xv = Video(np.float32(np.random.rand(3, 15, 224, 224))) + old_pred = learner.infer(xa, xv).confidence + new_pred = new_learner.infer(xa, xv).confidence + + self.assertEqual(old_pred, new_pred) + temp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_license.py b/tests/test_license.py index 053ef43960..90f9726d7d 100755 --- a/tests/test_license.py +++ b/tests/test_license.py @@ -111,7 +111,12 @@ def 
setUp(self): 'src/opendr/perception/activity_recognition/datasets/utils/decoder.py', 'projects/perception/lightweight_open_pose/jetbot/utils/pid.py', 'src/opendr/perception/compressive_learning/multilinear_compressive_learning/algorithm/trainers.py', - 'src/opendr/perception/object_detection_2d/retinaface/Makefile' + 'src/opendr/perception/object_detection_2d/retinaface/Makefile', + 'src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/efficientface_modulator.py', + 'src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/efficientface_utils.py', + 'src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/spatial_transforms.py', + 'src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/transformer_timm.py', + 'src/opendr/perception/multimodal_human_centric/audiovisual_emotion_learner/algorithm/utils.py' ] skippedDirectories = [