From 007082a112083797b89c447e212d6aa670fa69a8 Mon Sep 17 00:00:00 2001 From: Lewen Wang <49936435+lwwang1995@users.noreply.github.com> Date: Tue, 16 Nov 2021 11:24:43 +0800 Subject: [PATCH] Add AdaRNN baseline. (#689) * Update TCTS. * Update TCTS README. * Update TCTS README. * Update TCTS. * Add ADARNN. * Update README. * Reformat ADARNN. * Add README for adarnn. Co-authored-by: lewwang --- README.md | 2 + examples/benchmarks/ADARNN/README.md | 4 + examples/benchmarks/ADARNN/requirements.txt | 4 + .../workflow_config_adarnn_Alpha360.yaml | 88 ++ examples/benchmarks/README.md | 6 +- .../TCTS/workflow_config_tcts_Alpha360.yaml | 2 +- qlib/contrib/model/pytorch_adarnn.py | 789 ++++++++++++++++++ 7 files changed, 891 insertions(+), 4 deletions(-) create mode 100644 examples/benchmarks/ADARNN/README.md create mode 100644 examples/benchmarks/ADARNN/requirements.txt create mode 100644 examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml create mode 100644 qlib/contrib/model/pytorch_adarnn.py diff --git a/README.md b/README.md index 261d6cf75b..7125ee8fe6 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Recent released features | Feature | Status | | -- | ------ | +| ADARNN model | [Released](https://github.com/microsoft/qlib/pull/689) on Nov 14, 2021 | | TCN model | [Released](https://github.com/microsoft/qlib/pull/668) on Nov 4, 2021 | |Temporal Routing Adaptor (TRA) | [Released](https://github.com/microsoft/qlib/pull/531) on July 30, 2021 | | Transformer & Localformer | [Released](https://github.com/microsoft/qlib/pull/508) on July 22, 2021 | @@ -296,6 +297,7 @@ Here is a list of models built on `Qlib`. - [Localformer based on pytorch (Juyong Jiang, et al.)](examples/benchmarks/Localformer/) - [TRA based on pytorch (Hengxu, Dong, et al. KDD 2021)](examples/benchmarks/TRA/) - [TCN based on pytorch (Shaojie Bai, et al. 2018)](examples/benchmarks/TCN/) +- [ADARNN based on pytorch (YunTao Du, et al. 2021)](examples/benchmarks/ADARNN/) Your PR of new Quant models is highly welcomed. diff --git a/examples/benchmarks/ADARNN/README.md b/examples/benchmarks/ADARNN/README.md new file mode 100644 index 0000000000..2296af9238 --- /dev/null +++ b/examples/benchmarks/ADARNN/README.md @@ -0,0 +1,4 @@ +# AdaRNN +* Code: [https://github.com/jindongwang/transferlearning/tree/master/code/deep/adarnn](https://github.com/jindongwang/transferlearning/tree/master/code/deep/adarnn) +* Paper: [AdaRNN: Adaptive Learning and Forecasting for Time Series](https://arxiv.org/pdf/2108.04443.pdf). + diff --git a/examples/benchmarks/ADARNN/requirements.txt b/examples/benchmarks/ADARNN/requirements.txt new file mode 100644 index 0000000000..16de0a4384 --- /dev/null +++ b/examples/benchmarks/ADARNN/requirements.txt @@ -0,0 +1,4 @@ +pandas==1.1.2 +numpy==1.17.4 +scikit_learn==0.23.2 +torch==1.7.0 diff --git a/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml b/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml new file mode 100644 index 0000000000..ac49d01457 --- /dev/null +++ b/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml @@ -0,0 +1,88 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: + - class: RobustZScoreNorm + kwargs: + fields_group: feature + clip_outlier: true + - class: Fillna + kwargs: + fields_group: feature + learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label + label: ["Ref($close, -2) / Ref($close, -1) - 1"] +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + model: + dataset: + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: ADARNN + module_path: qlib.contrib.model.pytorch_adarnn + kwargs: + d_feat: 6 + hidden_size: 64 + num_layers: 2 + dropout: 0.0 + n_epochs: 200 + lr: 1e-3 + early_stop: 20 + batch_size: 800 + metric: loss + loss: mse + GPU: 0 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha360 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: + dataset: + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md index 1a7d2fc269..779a2bc127 100644 --- a/examples/benchmarks/README.md +++ b/examples/benchmarks/README.md @@ -21,6 +21,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |------------------------------------------|-------------------------------------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------| +| TCN(Shaojie Bai, et al.) | Alpha158 | 0.0275±0.00 | 0.2157±0.01 | 0.0411±0.00 | 0.3379±0.01 | 0.0190±0.02 | 0.2887±0.27 | -0.1202±0.03 | | TabNet(Sercan O. Arik, et al.) | Alpha158 | 0.0204±0.01 | 0.1554±0.07 | 0.0333±0.00 | 0.2552±0.05 | 0.0227±0.04 | 0.3676±0.54 | -0.1089±0.08 | | Transformer(Ashish Vaswani, et al.) | Alpha158 | 0.0264±0.00 | 0.2053±0.02 | 0.0407±0.00 | 0.3273±0.02 | 0.0273±0.02 | 0.3970±0.26 | -0.1101±0.02 | | GRU(Kyunghyun Cho, et al.) | Alpha158(with selected 20 features) | 0.0315±0.00 | 0.2450±0.04 | 0.0428±0.00 | 0.3440±0.03 | 0.0344±0.02 | 0.5160±0.25 | -0.1017±0.02 | @@ -38,8 +39,6 @@ The numbers shown below demonstrate the performance of the entire `workflow` of | MLP | Alpha158 | 0.0376±0.00 | 0.2846±0.02 | 0.0429±0.00 | 0.3220±0.01 | 0.0895±0.02 | 1.1408±0.23 | -0.1103±0.02 | | LightGBM(Guolin Ke, et al.) | Alpha158 | 0.0448±0.00 | 0.3660±0.00 | 0.0469±0.00 | 0.3877±0.00 | 0.0901±0.00 | 1.0164±0.00 | -0.1038±0.00 | | DoubleEnsemble(Chuheng Zhang, et al.) | Alpha158 | 0.0544±0.00 | 0.4340±0.00 | 0.0523±0.00 | 0.4284±0.01 | 0.1168±0.01 | 1.3384±0.12 | -0.1036±0.01 | -| TCN | Alpha158 | 0.0275±0.00 | 0.2157±0.01 | 0.0411±0.00 | 0.3379±0.01 | 0.0190±0.02 | 0.2887±0.27 | -0.1202±0.03 | - ## Alpha360 dataset @@ -54,13 +53,14 @@ The numbers shown below demonstrate the performance of the entire `workflow` of | XGBoost(Tianqi Chen, et al.) | Alpha360 | 0.0394±0.00 | 0.2909±0.00 | 0.0448±0.00 | 0.3679±0.00 | 0.0344±0.00 | 0.4527±0.02 | -0.1004±0.00 | | DoubleEnsemble(Chuheng Zhang, et al.) | Alpha360 | 0.0404±0.00 | 0.3023±0.00 | 0.0495±0.00 | 0.3898±0.00 | 0.0468±0.01 | 0.6302±0.20 | -0.0860±0.01 | | LightGBM(Guolin Ke, et al.) | Alpha360 | 0.0400±0.00 | 0.3037±0.00 | 0.0499±0.00 | 0.4042±0.00 | 0.0558±0.00 | 0.7632±0.00 | -0.0659±0.00 | +| TCN(Shaojie Bai, et al.) | Alpha360 | 0.0441±0.00 | 0.3301±0.02 | 0.0519±0.00 | 0.4130±0.01 | 0.0604±0.02 | 0.8295±0.34 | -0.1018±0.03 | | ALSTM (Yao Qin, et al.) | Alpha360 | 0.0497±0.00 | 0.3829±0.04 | 0.0599±0.00 | 0.4736±0.03 | 0.0626±0.02 | 0.8651±0.31 | -0.0994±0.03 | | LSTM(Sepp Hochreiter, et al.) | Alpha360 | 0.0448±0.00 | 0.3474±0.04 | 0.0549±0.00 | 0.4366±0.03 | 0.0647±0.03 | 0.8963±0.39 | -0.0875±0.02 | | GRU(Kyunghyun Cho, et al.) | Alpha360 | 0.0493±0.00 | 0.3772±0.04 | 0.0584±0.00 | 0.4638±0.03 | 0.0720±0.02 | 0.9730±0.33 | -0.0821±0.02 | +| AdaRNN(Yuntao Du, et al.) | Alpha360 | 0.0464±0.01 | 0.3619±0.08 | 0.0539±0.01 | 0.4287±0.06 | 0.0753±0.03 | 1.0200±0.40 | -0.0936±0.03 | | GATs (Petar Velickovic, et al.) | Alpha360 | 0.0476±0.00 | 0.3508±0.02 | 0.0598±0.00 | 0.4604±0.01 | 0.0824±0.02 | 1.1079±0.26 | -0.0894±0.03 | | TCTS(Xueqing Wu, et al.) | Alpha360 | 0.0508±0.00 | 0.3931±0.04 | 0.0599±0.00 | 0.4756±0.03 | 0.0893±0.03 | 1.2256±0.36 | -0.0857±0.02 | | TRA(Hengxu Lin, et al.) | Alpha360 | 0.0485±0.00 | 0.3787±0.03 | 0.0587±0.00 | 0.4756±0.03 | 0.0920±0.03 | 1.2789±0.42 | -0.0834±0.02 | -| TCN(Shaojie Bai, et al.) | Alpha360 | 0.0441±0.00 | 0.3301±0.02 | 0.0519±0.00 | 0.4130±0.01 | 0.0604±0.02 | 0.8295±0.34 | -0.1018±0.03 | - The selected 20 features are based on the feature importance of a lightgbm-based model. - The base model of DoubleEnsemble is LGBM. diff --git a/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml b/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml index 9e0e735d11..460a470bb6 100644 --- a/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml +++ b/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml @@ -95,4 +95,4 @@ task: - class: PortAnaRecord module_path: qlib.workflow.record_temp kwargs: - config: *port_analysis_config + config: *port_analysis_config \ No newline at end of file diff --git a/qlib/contrib/model/pytorch_adarnn.py b/qlib/contrib/model/pytorch_adarnn.py new file mode 100644 index 0000000000..aad01011cc --- /dev/null +++ b/qlib/contrib/model/pytorch_adarnn.py @@ -0,0 +1,789 @@ +# Copyright (c) Microsoft Corporation. +import os +from pdb import set_trace +from torch.utils.data import Dataset, DataLoader + +import copy +from typing import Text, Union + +import math +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.autograd import Function +from qlib.contrib.model.pytorch_utils import count_parameters +from qlib.data.dataset import DatasetH +from qlib.data.dataset.handler import DataHandlerLP +from qlib.log import get_module_logger +from qlib.model.base import Model +from qlib.utils import get_or_create_path + + +class ADARNN(Model): + """ADARNN Model + + Parameters + ---------- + d_feat : int + input dimension for each time step + metric: str + the evaluate metric used in early stop + optimizer : str + optimizer name + GPU : str + the GPU ID(s) used for training + """ + + def __init__( + self, + d_feat=6, + hidden_size=64, + num_layers=2, + dropout=0.0, + n_epochs=200, + pre_epoch=40, + dw=0.5, + loss_type="cosine", + len_seq=60, + len_win=0, + lr=0.001, + metric="mse", + batch_size=2000, + early_stop=20, + loss="mse", + optimizer="adam", + n_splits=2, + GPU=0, + seed=None, + **kwargs + ): + # Set logger. + self.logger = get_module_logger("ADARNN") + self.logger.info("ADARNN pytorch version...") + os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU) + + # set hyper-parameters. + self.d_feat = d_feat + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.pre_epoch = pre_epoch + self.dw = dw + self.loss_type = loss_type + self.len_seq = len_seq + self.len_win = len_win + self.lr = lr + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.n_splits = n_splits + self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.seed = seed + + self.logger.info( + "ADARNN parameters setting:" + "\nd_feat : {}" + "\nhidden_size : {}" + "\nnum_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nlr : {}" + "\nmetric : {}" + "\nbatch_size : {}" + "\nearly_stop : {}" + "\noptimizer : {}" + "\nloss_type : {}" + "\nvisible_GPU : {}" + "\nuse_GPU : {}" + "\nseed : {}".format( + d_feat, + hidden_size, + num_layers, + dropout, + n_epochs, + lr, + metric, + batch_size, + early_stop, + optimizer.lower(), + loss, + GPU, + self.use_gpu, + seed, + ) + ) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + n_hiddens = [hidden_size for _ in range(num_layers)] + self.model = AdaRNN( + use_bottleneck=False, + bottleneck_width=64, + n_input=d_feat, + n_hiddens=n_hiddens, + n_output=1, + dropout=dropout, + model_type="AdaRNN", + len_seq=len_seq, + trans_loss=loss_type, + ) + self.logger.info("model:\n{:}".format(self.model)) + self.logger.info("model size: {:.4f} MB".format(count_parameters(self.model))) + + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.model.cuda() + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def train_AdaRNN(self, train_loader_list, epoch, dist_old=None, weight_mat=None): + self.model.train() + criterion = nn.MSELoss() + dist_mat = torch.zeros(self.num_layers, self.len_seq).cuda() + len_loader = np.inf + for loader in train_loader_list: + if len(loader) < len_loader: + len_loader = len(loader) + for data_all in zip(*train_loader_list): + # for data_all in zip(*train_loader_list): + self.train_optimizer.zero_grad() + list_feat = [] + list_label = [] + for data in data_all: + # feature :[36, 24, 6] + feature, label_reg = data[0].cuda().float(), data[1].cuda().float() + list_feat.append(feature) + list_label.append(label_reg) + flag = False + index = get_index(len(data_all) - 1) + for temp_index in index: + s1 = temp_index[0] + s2 = temp_index[1] + if list_feat[s1].shape[0] != list_feat[s2].shape[0]: + flag = True + break + if flag: + continue + + total_loss = torch.zeros(1).cuda() + for i in range(len(index)): + feature_s = list_feat[index[i][0]] + feature_t = list_feat[index[i][1]] + label_reg_s = list_label[index[i][0]] + label_reg_t = list_label[index[i][1]] + feature_all = torch.cat((feature_s, feature_t), 0) + + if epoch < self.pre_epoch: + pred_all, loss_transfer, out_weight_list = self.model.forward_pre_train( + feature_all, len_win=self.len_win + ) + else: + pred_all, loss_transfer, dist, weight_mat = self.model.forward_Boosting(feature_all, weight_mat) + dist_mat = dist_mat + dist + pred_s = pred_all[0 : feature_s.size(0)] + pred_t = pred_all[feature_s.size(0) :] + + loss_s = criterion(pred_s, label_reg_s) + loss_t = criterion(pred_t, label_reg_t) + + total_loss = total_loss + loss_s + loss_t + self.dw * loss_transfer + self.train_optimizer.zero_grad() + total_loss.backward() + torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0) + self.train_optimizer.step() + if epoch >= self.pre_epoch: + if epoch > self.pre_epoch: + weight_mat = self.model.update_weight_Boosting(weight_mat, dist_old, dist_mat) + return weight_mat, dist_mat + else: + weight_mat = self.transform_type(out_weight_list) + return weight_mat, None + + def calc_all_metrics(self, pred): + """pred is a pandas dataframe that has two attributes: score (pred) and label (real)""" + res = {} + ic = pred.groupby(level="datetime").apply(lambda x: x.label.corr(x.score)) + rank_ic = pred.groupby(level="datetime").apply(lambda x: x.label.corr(x.score, method="spearman")) + res["ic"] = ic.mean() + res["icir"] = ic.mean() / ic.std() + res["ric"] = rank_ic.mean() + res["ricir"] = rank_ic.mean() / rank_ic.std() + res["mse"] = -(pred["label"] - pred["score"]).mean() + res["loss"] = res["mse"] + return res + + def test_epoch(self, df): + self.model.eval() + preds = self.infer(df["feature"]) + label = df["label"].squeeze() + preds = pd.DataFrame({"label": label, "score": preds}, index=df.index) + metrics = self.calc_all_metrics(preds) + return metrics + + def log_metrics(self, mode, metrics): + metrics = ["{}/{}: {:.6f}".format(k, mode, v) for k, v in metrics.items()] + metrics = ", ".join(metrics) + self.logger.info(metrics) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + save_path=None, + ): + + df_train, df_valid = dataset.prepare( + ["train", "valid"], + col_set=["feature", "label"], + data_key=DataHandlerLP.DK_L, + ) + # splits = ['2011-06-30'] + days = df_train.index.get_level_values(level=0).unique() + train_splits = np.array_split(days, self.n_splits) + train_splits = [df_train[s[0] : s[-1]] for s in train_splits] + train_loader_list = [get_stock_loader(df, self.batch_size) for df in train_splits] + + save_path = get_or_create_path(save_path) + stop_steps = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # train + self.logger.info("training...") + self.fitted = True + best_score = -np.inf + best_epoch = 0 + weight_mat, dist_mat = None, None + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + weight_mat, dist_mat = self.train_AdaRNN(train_loader_list, step, dist_mat, weight_mat) + self.logger.info("evaluating...") + train_metrics = self.test_epoch(df_train) + valid_metrics = self.test_epoch(df_valid) + self.log_metrics("train: ", train_metrics) + self.log_metrics("valid: ", valid_metrics) + + valid_score = valid_metrics[self.metric] + train_score = train_metrics[self.metric] + evals_result["train"].append(train_score) + evals_result["valid"].append(valid_score) + if valid_score > best_score: + best_score = valid_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + return best_score + + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): + if not self.fitted: + raise ValueError("model is not fitted yet!") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) + return self.infer(x_test) + + def infer(self, x_test): + index = x_test.index + self.model.eval() + x_values = x_test.values + sample_num = x_values.shape[0] + x_values = x_values.reshape(sample_num, self.d_feat, -1).transpose(0, 2, 1) + preds = [] + + for begin in range(sample_num)[:: self.batch_size]: + + if sample_num - begin < self.batch_size: + end = sample_num + else: + end = begin + self.batch_size + + x_batch = torch.from_numpy(x_values[begin:end]).float().cuda() + + with torch.no_grad(): + pred = self.model.predict(x_batch).detach().cpu().numpy() + + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=index) + + def transform_type(self, init_weight): + weight = torch.ones(self.num_layers, self.len_seq).cuda() + for i in range(self.num_layers): + for j in range(self.len_seq): + weight[i, j] = init_weight[i][j].item() + return weight + + +class data_loader(Dataset): + def __init__(self, df): + self.df_feature = df["feature"] + self.df_label_reg = df["label"] + self.df_index = df.index + self.df_feature = torch.tensor( + self.df_feature.values.reshape(-1, 6, 60).transpose(0, 2, 1), dtype=torch.float32 + ) + self.df_label_reg = torch.tensor(self.df_label_reg.values.reshape(-1), dtype=torch.float32) + + def __getitem__(self, index): + sample, label_reg = self.df_feature[index], self.df_label_reg[index] + return sample, label_reg + + def __len__(self): + return len(self.df_feature) + + +def get_stock_loader(df, batch_size, shuffle=True): + train_loader = DataLoader(data_loader(df), batch_size=batch_size, shuffle=shuffle) + return train_loader + + +def get_index(num_domain=2): + index = [] + for i in range(num_domain): + for j in range(i + 1, num_domain + 1): + index.append((i, j)) + return index + + +class AdaRNN(nn.Module): + """ + model_type: 'Boosting', 'AdaRNN' + """ + + def __init__( + self, + use_bottleneck=False, + bottleneck_width=256, + n_input=128, + n_hiddens=[64, 64], + n_output=6, + dropout=0.0, + len_seq=9, + model_type="AdaRNN", + trans_loss="mmd", + ): + super(AdaRNN, self).__init__() + self.use_bottleneck = use_bottleneck + self.n_input = n_input + self.num_layers = len(n_hiddens) + self.hiddens = n_hiddens + self.n_output = n_output + self.model_type = model_type + self.trans_loss = trans_loss + self.len_seq = len_seq + in_size = self.n_input + + features = nn.ModuleList() + for hidden in n_hiddens: + rnn = nn.GRU(input_size=in_size, num_layers=1, hidden_size=hidden, batch_first=True, dropout=dropout) + features.append(rnn) + in_size = hidden + self.features = nn.Sequential(*features) + + if use_bottleneck == True: # finance + self.bottleneck = nn.Sequential( + nn.Linear(n_hiddens[-1], bottleneck_width), + nn.Linear(bottleneck_width, bottleneck_width), + nn.BatchNorm1d(bottleneck_width), + nn.ReLU(), + nn.Dropout(), + ) + self.bottleneck[0].weight.data.normal_(0, 0.005) + self.bottleneck[0].bias.data.fill_(0.1) + self.bottleneck[1].weight.data.normal_(0, 0.005) + self.bottleneck[1].bias.data.fill_(0.1) + self.fc = nn.Linear(bottleneck_width, n_output) + torch.nn.init.xavier_normal_(self.fc.weight) + else: + self.fc_out = nn.Linear(n_hiddens[-1], self.n_output) + + if self.model_type == "AdaRNN": + gate = nn.ModuleList() + for i in range(len(n_hiddens)): + gate_weight = nn.Linear(len_seq * self.hiddens[i] * 2, len_seq) + gate.append(gate_weight) + self.gate = gate + + bnlst = nn.ModuleList() + for i in range(len(n_hiddens)): + bnlst.append(nn.BatchNorm1d(len_seq)) + self.bn_lst = bnlst + self.softmax = torch.nn.Softmax(dim=0) + self.init_layers() + + def init_layers(self): + for i in range(len(self.hiddens)): + self.gate[i].weight.data.normal_(0, 0.05) + self.gate[i].bias.data.fill_(0.0) + + def forward_pre_train(self, x, len_win=0): + out = self.gru_features(x) + fea = out[0] # [2N,L,H] + if self.use_bottleneck == True: + fea_bottleneck = self.bottleneck(fea[:, -1, :]) + fc_out = self.fc(fea_bottleneck).squeeze() + else: + fc_out = self.fc_out(fea[:, -1, :]).squeeze() # [N,] + + out_list_all, out_weight_list = out[1], out[2] + out_list_s, out_list_t = self.get_features(out_list_all) + loss_transfer = torch.zeros((1,)).cuda() + for i in range(len(out_list_s)): + criterion_transder = TransferLoss(loss_type=self.trans_loss, input_dim=out_list_s[i].shape[2]) + h_start = 0 + for j in range(h_start, self.len_seq, 1): + i_start = j - len_win if j - len_win >= 0 else 0 + i_end = j + len_win if j + len_win < self.len_seq else self.len_seq - 1 + for k in range(i_start, i_end + 1): + weight = ( + out_weight_list[i][j] + if self.model_type == "AdaRNN" + else 1 / (self.len_seq - h_start) * (2 * len_win + 1) + ) + loss_transfer = loss_transfer + weight * criterion_transder.compute( + out_list_s[i][:, j, :], out_list_t[i][:, k, :] + ) + return fc_out, loss_transfer, out_weight_list + + def gru_features(self, x, predict=False): + x_input = x + out = None + out_lis = [] + out_weight_list = [] if (self.model_type == "AdaRNN") else None + for i in range(self.num_layers): + out, _ = self.features[i](x_input.float()) + x_input = out + out_lis.append(out) + if self.model_type == "AdaRNN" and predict == False: + out_gate = self.process_gate_weight(x_input, i) + out_weight_list.append(out_gate) + return out, out_lis, out_weight_list + + def process_gate_weight(self, out, index): + x_s = out[0 : int(out.shape[0] // 2)] + x_t = out[out.shape[0] // 2 : out.shape[0]] + x_all = torch.cat((x_s, x_t), 2) + x_all = x_all.view(x_all.shape[0], -1) + weight = torch.sigmoid(self.bn_lst[index](self.gate[index](x_all.float()))) + weight = torch.mean(weight, dim=0) + res = self.softmax(weight).squeeze() + return res + + def get_features(self, output_list): + fea_list_src, fea_list_tar = [], [] + for fea in output_list: + fea_list_src.append(fea[0 : fea.size(0) // 2]) + fea_list_tar.append(fea[fea.size(0) // 2 :]) + return fea_list_src, fea_list_tar + + # For Boosting-based + def forward_Boosting(self, x, weight_mat=None): + out = self.gru_features(x) + fea = out[0] + if self.use_bottleneck: + fea_bottleneck = self.bottleneck(fea[:, -1, :]) + fc_out = self.fc(fea_bottleneck).squeeze() + else: + fc_out = self.fc_out(fea[:, -1, :]).squeeze() + + out_list_all = out[1] + out_list_s, out_list_t = self.get_features(out_list_all) + loss_transfer = torch.zeros((1,)).cuda() + if weight_mat is None: + weight = (1.0 / self.len_seq * torch.ones(self.num_layers, self.len_seq)).cuda() + else: + weight = weight_mat + dist_mat = torch.zeros(self.num_layers, self.len_seq).cuda() + for i in range(len(out_list_s)): + criterion_transder = TransferLoss(loss_type=self.trans_loss, input_dim=out_list_s[i].shape[2]) + for j in range(self.len_seq): + loss_trans = criterion_transder.compute(out_list_s[i][:, j, :], out_list_t[i][:, j, :]) + loss_transfer = loss_transfer + weight[i, j] * loss_trans + dist_mat[i, j] = loss_trans + return fc_out, loss_transfer, dist_mat, weight + + # For Boosting-based + def update_weight_Boosting(self, weight_mat, dist_old, dist_new): + epsilon = 1e-5 + dist_old = dist_old.detach() + dist_new = dist_new.detach() + ind = dist_new > dist_old + epsilon + weight_mat[ind] = weight_mat[ind] * (1 + torch.sigmoid(dist_new[ind] - dist_old[ind])) + weight_norm = torch.norm(weight_mat, dim=1, p=1) + weight_mat = weight_mat / weight_norm.t().unsqueeze(1).repeat(1, self.len_seq) + return weight_mat + + def predict(self, x): + out = self.gru_features(x, predict=True) + fea = out[0] + if self.use_bottleneck == True: + fea_bottleneck = self.bottleneck(fea[:, -1, :]) + fc_out = self.fc(fea_bottleneck).squeeze() + else: + fc_out = self.fc_out(fea[:, -1, :]).squeeze() + return fc_out + + +class TransferLoss(object): + def __init__(self, loss_type="cosine", input_dim=512): + """ + Supported loss_type: mmd(mmd_lin), mmd_rbf, coral, cosine, kl, js, mine, adv + """ + self.loss_type = loss_type + self.input_dim = input_dim + + def compute(self, X, Y): + """Compute adaptation loss + + Arguments: + X {tensor} -- source matrix + Y {tensor} -- target matrix + + Returns: + [tensor] -- transfer loss + """ + if self.loss_type == "mmd_lin" or self.loss_type == "mmd": + mmdloss = MMD_loss(kernel_type="linear") + loss = mmdloss(X, Y) + elif self.loss_type == "coral": + loss = CORAL(X, Y) + elif self.loss_type == "cosine" or self.loss_type == "cos": + loss = 1 - cosine(X, Y) + elif self.loss_type == "kl": + loss = kl_div(X, Y) + elif self.loss_type == "js": + loss = js(X, Y) + elif self.loss_type == "mine": + mine_model = Mine_estimator(input_dim=self.input_dim, hidden_dim=60).cuda() + loss = mine_model(X, Y) + elif self.loss_type == "adv": + loss = adv(X, Y, input_dim=self.input_dim, hidden_dim=32) + elif self.loss_type == "mmd_rbf": + mmdloss = MMD_loss(kernel_type="rbf") + loss = mmdloss(X, Y) + elif self.loss_type == "pairwise": + pair_mat = pairwise_dist(X, Y) + loss = torch.norm(pair_mat) + + return loss + + +def cosine(source, target): + source, target = source.mean(), target.mean() + cos = nn.CosineSimilarity(dim=0) + loss = cos(source, target) + return loss.mean() + + +class ReverseLayerF(Function): + @staticmethod + def forward(ctx, x, alpha): + ctx.alpha = alpha + return x.view_as(x) + + @staticmethod + def backward(ctx, grad_output): + output = grad_output.neg() * ctx.alpha + return output, None + + +class Discriminator(nn.Module): + def __init__(self, input_dim=256, hidden_dim=256): + super(Discriminator, self).__init__() + self.input_dim = input_dim + self.hidden_dim = hidden_dim + self.dis1 = nn.Linear(input_dim, hidden_dim) + self.dis2 = nn.Linear(hidden_dim, 1) + + def forward(self, x): + x = F.relu(self.dis1(x)) + x = self.dis2(x) + x = torch.sigmoid(x) + return x + + +def adv(source, target, input_dim=256, hidden_dim=512): + domain_loss = nn.BCELoss() + # !!! Pay attention to .cuda !!! + adv_net = Discriminator(input_dim, hidden_dim).cuda() + domain_src = torch.ones(len(source)).cuda() + domain_tar = torch.zeros(len(target)).cuda() + domain_src, domain_tar = domain_src.view(domain_src.shape[0], 1), domain_tar.view(domain_tar.shape[0], 1) + reverse_src = ReverseLayerF.apply(source, 1) + reverse_tar = ReverseLayerF.apply(target, 1) + pred_src = adv_net(reverse_src) + pred_tar = adv_net(reverse_tar) + loss_s, loss_t = domain_loss(pred_src, domain_src), domain_loss(pred_tar, domain_tar) + loss = loss_s + loss_t + return loss + + +def CORAL(source, target): + d = source.size(1) + ns, nt = source.size(0), target.size(0) + + # source covariance + tmp_s = torch.ones((1, ns)).cuda() @ source + cs = (source.t() @ source - (tmp_s.t() @ tmp_s) / ns) / (ns - 1) + + # target covariance + tmp_t = torch.ones((1, nt)).cuda() @ target + ct = (target.t() @ target - (tmp_t.t() @ tmp_t) / nt) / (nt - 1) + + # frobenius norm + loss = (cs - ct).pow(2).sum() + loss = loss / (4 * d * d) + + return loss + + +class MMD_loss(nn.Module): + def __init__(self, kernel_type="linear", kernel_mul=2.0, kernel_num=5): + super(MMD_loss, self).__init__() + self.kernel_num = kernel_num + self.kernel_mul = kernel_mul + self.fix_sigma = None + self.kernel_type = kernel_type + + def guassian_kernel(self, source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None): + n_samples = int(source.size()[0]) + int(target.size()[0]) + total = torch.cat([source, target], dim=0) + total0 = total.unsqueeze(0).expand(int(total.size(0)), int(total.size(0)), int(total.size(1))) + total1 = total.unsqueeze(1).expand(int(total.size(0)), int(total.size(0)), int(total.size(1))) + L2_distance = ((total0 - total1) ** 2).sum(2) + if fix_sigma: + bandwidth = fix_sigma + else: + bandwidth = torch.sum(L2_distance.data) / (n_samples ** 2 - n_samples) + bandwidth /= kernel_mul ** (kernel_num // 2) + bandwidth_list = [bandwidth * (kernel_mul ** i) for i in range(kernel_num)] + kernel_val = [torch.exp(-L2_distance / bandwidth_temp) for bandwidth_temp in bandwidth_list] + return sum(kernel_val) + + def linear_mmd(self, X, Y): + delta = X.mean(axis=0) - Y.mean(axis=0) + loss = delta.dot(delta.T) + return loss + + def forward(self, source, target): + if self.kernel_type == "linear": + return self.linear_mmd(source, target) + elif self.kernel_type == "rbf": + batch_size = int(source.size()[0]) + kernels = self.guassian_kernel( + source, target, kernel_mul=self.kernel_mul, kernel_num=self.kernel_num, fix_sigma=self.fix_sigma + ) + with torch.no_grad(): + XX = torch.mean(kernels[:batch_size, :batch_size]) + YY = torch.mean(kernels[batch_size:, batch_size:]) + XY = torch.mean(kernels[:batch_size, batch_size:]) + YX = torch.mean(kernels[batch_size:, :batch_size]) + loss = torch.mean(XX + YY - XY - YX) + return loss + + +class Mine_estimator(nn.Module): + def __init__(self, input_dim=2048, hidden_dim=512): + super(Mine_estimator, self).__init__() + self.mine_model = Mine(input_dim, hidden_dim) + + def forward(self, X, Y): + Y_shffle = Y[torch.randperm(len(Y))] + loss_joint = self.mine_model(X, Y) + loss_marginal = self.mine_model(X, Y_shffle) + ret = torch.mean(loss_joint) - torch.log(torch.mean(torch.exp(loss_marginal))) + loss = -ret + return loss + + +class Mine(nn.Module): + def __init__(self, input_dim=2048, hidden_dim=512): + super(Mine, self).__init__() + self.fc1_x = nn.Linear(input_dim, hidden_dim) + self.fc1_y = nn.Linear(input_dim, hidden_dim) + self.fc2 = nn.Linear(hidden_dim, 1) + + def forward(self, x, y): + h1 = F.leaky_relu(self.fc1_x(x) + self.fc1_y(y)) + h2 = self.fc2(h1) + return h2 + + +def pairwise_dist(X, Y): + n, d = X.shape + m, _ = Y.shape + assert d == Y.shape[1] + a = X.unsqueeze(1).expand(n, m, d) + b = Y.unsqueeze(0).expand(n, m, d) + return torch.pow(a - b, 2).sum(2) + + +def pairwise_dist_np(X, Y): + n, d = X.shape + m, _ = Y.shape + assert d == Y.shape[1] + a = np.expand_dims(X, 1) + b = np.expand_dims(Y, 0) + a = np.tile(a, (1, m, 1)) + b = np.tile(b, (n, 1, 1)) + return np.power(a - b, 2).sum(2) + + +def pa(X, Y): + XY = np.dot(X, Y.T) + XX = np.sum(np.square(X), axis=1) + XX = np.transpose([XX]) + YY = np.sum(np.square(Y), axis=1) + dist = XX + YY - 2 * XY + + return dist + + +def kl_div(source, target): + if len(source) < len(target): + target = target[: len(source)] + elif len(source) > len(target): + source = source[: len(target)] + criterion = nn.KLDivLoss(reduction="batchmean") + loss = criterion(source.log(), target) + return loss + + +def js(source, target): + if len(source) < len(target): + target = target[: len(source)] + elif len(source) > len(target): + source = source[: len(target)] + M = 0.5 * (source + target) + loss_1, loss_2 = kl_div(source, M), kl_div(target, M) + return 0.5 * (loss_1 + loss_2)