From 0225a516004b14d33f0e437f0832f68d57f23779 Mon Sep 17 00:00:00 2001
From: alex-hse-repository
 <55380696+alex-hse-repository@users.noreply.github.com>
Date: Mon, 10 Jul 2023 18:11:57 +0300
Subject: [PATCH] Fix mrmr working with categoricals (#1311)

---
 CHANGELOG.md                                  |  2 +-
 .../feature_selection/mrmr_selection.py       |  6 ++++
 tests/test_analysis/conftest.py               | 35 +++++++++++++++++++
 .../test_relevance_table.py                   | 30 ----------------
 .../test_feature_selection/test_mrmr.py       | 21 +++++++++++
 5 files changed, 63 insertions(+), 31 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b53ebaae1..cbd644ea9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - 
 - 
 - 
-- 
+- `mrmr` feature selection working with categoricals ([#1311](https://github.com/tinkoff-ai/etna/pull/1311))
 - 
 
 ### Removed
diff --git a/etna/analysis/feature_selection/mrmr_selection.py b/etna/analysis/feature_selection/mrmr_selection.py
index ba1d84eb3..a926b172f 100644
--- a/etna/analysis/feature_selection/mrmr_selection.py
+++ b/etna/analysis/feature_selection/mrmr_selection.py
@@ -72,6 +72,12 @@ def mrmr(
     relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)]
     redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)]
 
+    # can't compute correlation of a categorical column with the others, so cast every regressor to float first
+    try:
+        regressors = regressors.astype(float)
+    except ValueError as e:
+        raise ValueError(f"Only convertible to float features are allowed! Error: {e}") from e
+
     relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0)
 
     all_features = relevance.index.to_list()
diff --git a/tests/test_analysis/conftest.py b/tests/test_analysis/conftest.py
index 280cf50e8..3873300a5 100644
--- a/tests/test_analysis/conftest.py
+++ b/tests/test_analysis/conftest.py
@@ -1,8 +1,43 @@
 import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
 import pytest
 
+from etna.datasets import TSDataset
+from etna.datasets import duplicate_data
+
 
 @pytest.fixture(autouse=True)
 def close_plots():
     yield
     plt.close()
+
+
+@pytest.fixture
+def exog_and_target_dfs():
+    seg = ["a"] * 30 + ["b"] * 30  # two segments, 30 rows each
+    time = list(pd.date_range("2020-01-01", "2021-01-01")[:30])  # first 30 timestamps of the range
+    timestamps = time * 2  # the same timestamps are reused for both segments
+    target = np.arange(60)
+    df = pd.DataFrame({"segment": seg, "timestamp": timestamps, "target": target})
+    ts = TSDataset.to_dataset(df)
+
+    cast = ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10  # numeric strings plus one missing value
+    no_cast = ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10  # contains "two", which cannot be parsed as a float
+    none = [1] * 10 + [2] * 10 + [56.1] * 10  # plain numbers; one entry is replaced by None on the next line
+    none[10] = None
+    df = pd.DataFrame(
+        {
+            "timestamp": time,
+            "exog1": np.arange(100, 70, -1),
+            "exog2": np.sin(np.arange(30) / 10),
+            "exog3": np.exp(np.arange(30)),
+            "cast": cast,
+            "no_cast": no_cast,
+            "none": none,
+        }
+    )
+    df["cast"] = df["cast"].astype("category")  # both string columns are stored with category dtype
+    df["no_cast"] = df["no_cast"].astype("category")
+    df_exog = duplicate_data(df, segments=["a", "b"])
+    return ts, df_exog  # (target dataset built from df, exogenous data for segments "a" and "b")
diff --git a/tests/test_analysis/test_feature_relevance/test_relevance_table.py b/tests/test_analysis/test_feature_relevance/test_relevance_table.py
index d68bd40ea..6b6e9f8fb 100644
--- a/tests/test_analysis/test_feature_relevance/test_relevance_table.py
+++ b/tests/test_analysis/test_feature_relevance/test_relevance_table.py
@@ -38,36 +38,6 @@ def test_model_relevance_table(simple_df_relevance):
     assert relevance_table["regressor_1"]["2"] < relevance_table["regressor_2"]["2"]
 
 
-@pytest.fixture()
-def exog_and_target_dfs():
-    seg = ["a"] * 30 + ["b"] * 30
-    time = list(pd.date_range("2020-01-01", "2021-01-01")[:30])
-    timestamps = time * 2
-    target = np.arange(60)
-    df = pd.DataFrame({"segment": seg, "timestamp": timestamps, "target": target})
-    ts = TSDataset.to_dataset(df)
-
-    cast = ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10
-    no_cast = ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10
-    none = [1] * 10 + [2] * 10 + [56.1] * 10
-    none[10] = None
-    df = pd.DataFrame(
-        {
-            "timestamp": time,
-            "exog1": np.arange(100, 70, -1),
-            "exog2": np.sin(np.arange(30) / 10),
-            "exog3": np.exp(np.arange(30)),
-            "cast": cast,
-            "no_cast": no_cast,
-            "none": none,
-        }
-    )
-    df["cast"] = df["cast"].astype("category")
-    df["no_cast"] = df["no_cast"].astype("category")
-    df_exog = duplicate_data(df, segments=["a", "b"])
-    return ts, df_exog
-
-
 @pytest.fixture()
 def exog_and_target_dfs_with_none():
     seg = ["a"] * 30 + ["b"] * 30
diff --git a/tests/test_analysis/test_feature_selection/test_mrmr.py b/tests/test_analysis/test_feature_selection/test_mrmr.py
index a45190bec..c2caf7c71 100644
--- a/tests/test_analysis/test_feature_selection/test_mrmr.py
+++ b/tests/test_analysis/test_feature_selection/test_mrmr.py
@@ -1,4 +1,5 @@
 from typing import Dict
+from unittest.mock import Mock
 
 import numpy as np
 import pandas as pd
@@ -32,6 +33,12 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]:
         regressor = df_regressors_useless[df_regressors_useless["segment"] == segment]["target"].values
         df_exog[f"regressor_useless_{i}"] = regressor
 
+    # useless categorical regressors: each column holds a single integer constant and has category dtype
+    num_cat_useless = 3
+    for i in range(num_cat_useless):
+        df_exog[f"categorical_regressor_useless_{i}"] = i
+        df_exog[f"categorical_regressor_useless_{i}"] = df_exog[f"categorical_regressor_useless_{i}"].astype("category")
+
     # useful regressors: the same as target but with little noise
     df_regressors_useful = df.copy()
     sampler = RandomState(seed=2).normal
@@ -174,3 +181,17 @@ def test_fast_redundancy_deprecation_warning(df_with_regressors):
     relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
     with pytest.warns(DeprecationWarning, match="Option `fast_redundancy=False` was added for backward compatibility"):
         mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False)
+
+
+@pytest.mark.parametrize("fast_redundancy", [True, False])
+def test_mrmr_with_castable_categorical_regressor(df_with_regressors, fast_redundancy):  # smoke test: must not raise
+    df, regressors = df_with_regressors["df"], df_with_regressors["regressors"]
+    relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
+    mrmr(relevance_table=relevance_table, regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy)
+
+
+@pytest.mark.parametrize("fast_redundancy", [True, False])
+def test_mrmr_with_uncastable_categorical_regressor_fails(exog_and_target_dfs, fast_redundancy):
+    df, regressors = exog_and_target_dfs  # fixture's "no_cast" column has the non-numeric category "two"
+    with pytest.raises(ValueError, match="Only convertible to float features are allowed!"):
+        mrmr(relevance_table=Mock(), regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy)