From 0225a516004b14d33f0e437f0832f68d57f23779 Mon Sep 17 00:00:00 2001 From: alex-hse-repository <55380696+alex-hse-repository@users.noreply.github.com> Date: Mon, 10 Jul 2023 18:11:57 +0300 Subject: [PATCH] Fix mrmr working with categoricals (#1311) --- CHANGELOG.md | 2 +- .../feature_selection/mrmr_selection.py | 6 ++++ tests/test_analysis/conftest.py | 35 +++++++++++++++++++ .../test_relevance_table.py | 30 ---------------- .../test_feature_selection/test_mrmr.py | 21 +++++++++++ 5 files changed, 63 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b53ebaae1..cbd644ea9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - - - -- +- `mrmr` feature selection working with categoricals ([#1311](https://github.com/tinkoff-ai/etna/pull/1311)) - ### Removed diff --git a/etna/analysis/feature_selection/mrmr_selection.py b/etna/analysis/feature_selection/mrmr_selection.py index ba1d84eb3..a926b172f 100644 --- a/etna/analysis/feature_selection/mrmr_selection.py +++ b/etna/analysis/feature_selection/mrmr_selection.py @@ -72,6 +72,12 @@ def mrmr( relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)] redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)] + # can't compute correlation of categorical column with the others + try: + regressors = regressors.astype(float) + except ValueError as e: + raise ValueError(f"Only convertible to float features are allowed! Error: {str(e)}") + relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0) all_features = relevance.index.to_list() diff --git a/tests/test_analysis/conftest.py b/tests/test_analysis/conftest.py index 280cf50e8..3873300a5 100644 --- a/tests/test_analysis/conftest.py +++ b/tests/test_analysis/conftest.py @@ -1,8 +1,43 @@ import matplotlib.pyplot as plt +import numpy as np +import pandas as pd import pytest +from etna.datasets import TSDataset +from etna.datasets import duplicate_data + @pytest.fixture(autouse=True) def close_plots(): yield plt.close() + + +@pytest.fixture +def exog_and_target_dfs(): + seg = ["a"] * 30 + ["b"] * 30 + time = list(pd.date_range("2020-01-01", "2021-01-01")[:30]) + timestamps = time * 2 + target = np.arange(60) + df = pd.DataFrame({"segment": seg, "timestamp": timestamps, "target": target}) + ts = TSDataset.to_dataset(df) + + cast = ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10 + no_cast = ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10 + none = [1] * 10 + [2] * 10 + [56.1] * 10 + none[10] = None + df = pd.DataFrame( + { + "timestamp": time, + "exog1": np.arange(100, 70, -1), + "exog2": np.sin(np.arange(30) / 10), + "exog3": np.exp(np.arange(30)), + "cast": cast, + "no_cast": no_cast, + "none": none, + } + ) + df["cast"] = df["cast"].astype("category") + df["no_cast"] = df["no_cast"].astype("category") + df_exog = duplicate_data(df, segments=["a", "b"]) + return ts, df_exog diff --git a/tests/test_analysis/test_feature_relevance/test_relevance_table.py b/tests/test_analysis/test_feature_relevance/test_relevance_table.py index d68bd40ea..6b6e9f8fb 100644 --- a/tests/test_analysis/test_feature_relevance/test_relevance_table.py +++ b/tests/test_analysis/test_feature_relevance/test_relevance_table.py @@ -38,36 +38,6 @@ def test_model_relevance_table(simple_df_relevance): assert relevance_table["regressor_1"]["2"] < relevance_table["regressor_2"]["2"] -@pytest.fixture() -def exog_and_target_dfs(): - seg = ["a"] * 30 + ["b"] * 30 - time = list(pd.date_range("2020-01-01", "2021-01-01")[:30]) - timestamps = time * 2 - target = np.arange(60) - df = pd.DataFrame({"segment": seg, "timestamp": timestamps, "target": target}) - ts = TSDataset.to_dataset(df) - - cast = ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10 - no_cast = ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10 - none = [1] * 10 + [2] * 10 + [56.1] * 10 - none[10] = None - df = pd.DataFrame( - { - "timestamp": time, - "exog1": np.arange(100, 70, -1), - "exog2": np.sin(np.arange(30) / 10), - "exog3": np.exp(np.arange(30)), - "cast": cast, - "no_cast": no_cast, - "none": none, - } - ) - df["cast"] = df["cast"].astype("category") - df["no_cast"] = df["no_cast"].astype("category") - df_exog = duplicate_data(df, segments=["a", "b"]) - return ts, df_exog - - @pytest.fixture() def exog_and_target_dfs_with_none(): seg = ["a"] * 30 + ["b"] * 30 diff --git a/tests/test_analysis/test_feature_selection/test_mrmr.py b/tests/test_analysis/test_feature_selection/test_mrmr.py index a45190bec..c2caf7c71 100644 --- a/tests/test_analysis/test_feature_selection/test_mrmr.py +++ b/tests/test_analysis/test_feature_selection/test_mrmr.py @@ -1,4 +1,5 @@ from typing import Dict +from unittest.mock import Mock import numpy as np import pandas as pd @@ -32,6 +33,12 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]: regressor = df_regressors_useless[df_regressors_useless["segment"] == segment]["target"].values df_exog[f"regressor_useless_{i}"] = regressor + # useless categorical regressors + num_cat_useless = 3 + for i in range(num_cat_useless): + df_exog[f"categorical_regressor_useless_{i}"] = i + df_exog[f"categorical_regressor_useless_{i}"] = df_exog[f"categorical_regressor_useless_{i}"].astype("category") + # useful regressors: the same as target but with little noise df_regressors_useful = df.copy() sampler = RandomState(seed=2).normal @@ -174,3 +181,17 @@ def test_fast_redundancy_deprecation_warning(df_with_regressors): relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor()) with pytest.warns(DeprecationWarning, match="Option `fast_redundancy=False` was added for backward compatibility"): mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False) + + +@pytest.mark.parametrize("fast_redundancy", [True, False]) +def test_mrmr_with_castable_categorical_regressor(df_with_regressors, fast_redundancy): + df, regressors = df_with_regressors["df"], df_with_regressors["regressors"] + relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor()) + mrmr(relevance_table=relevance_table, regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy) + + +@pytest.mark.parametrize("fast_redundancy", [True, False]) +def test_mrmr_with_uncastable_categorical_regressor_fails(exog_and_target_dfs, fast_redundancy): + df, regressors = exog_and_target_dfs + with pytest.raises(ValueError, match="Only convertible to float features are allowed!"): + mrmr(relevance_table=Mock(), regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy)