Skip to content

Fix mrmr working with categoricals #1311

Merged
merged 4 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
-
-
- Fix `mrmr` feature selection failing on categorical regressors ([#1311](https://github.com/tinkoff-ai/etna/pull/1311))
-

### Removed
Expand Down
8 changes: 8 additions & 0 deletions etna/analysis/feature_selection/mrmr_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,14 @@ def mrmr(
redundancy_table = pd.DataFrame(np.inf, index=all_features, columns=all_features)
top_k = min(top_k, len(all_features))

# can't compute correlation of categorical column with the others
cat_cols = regressors.dtypes[regressors.dtypes == "category"].index
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't it be easier to cast all regressors to float?

for cat_col in cat_cols:
try:
regressors[cat_col] = regressors[cat_col].astype(float)
except ValueError:
raise ValueError(f"{cat_col} column cannot be cast to float type! Please, use encoders.")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should test this error.


for i in range(top_k):
score_numerator = relevance.loc[not_selected_features]
score_denominator = pd.Series(1, index=not_selected_features)
Expand Down
13 changes: 13 additions & 0 deletions tests/test_analysis/test_feature_selection/test_mrmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]:
regressor = df_regressors_useless[df_regressors_useless["segment"] == segment]["target"].values
df_exog[f"regressor_useless_{i}"] = regressor

# useless categorical regressor
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

regressor -> regressors?

num_cat_useless = 3
for i in range(num_cat_useless):
df_exog[f"categorical_regressor_useless_{i}"] = i
df_exog[f"categorical_regressor_useless_{i}"] = df_exog[f"categorical_regressor_useless_{i}"].astype("category")

# useful regressors: the same as target but with little noise
df_regressors_useful = df.copy()
sampler = RandomState(seed=2).normal
Expand Down Expand Up @@ -174,3 +180,10 @@ def test_fast_redundancy_deprecation_warning(df_with_regressors):
relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
with pytest.warns(DeprecationWarning, match="Option `fast_redundancy=False` was added for backward compatibility"):
mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False)


@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_with_categorical_regressor(df_with_regressors, fast_redundancy):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_mrmr_with_categorical_regressor -> test_mrmr_with_castable_categorical_regressor?

df, regressors = df_with_regressors["df"], df_with_regressors["regressors"]
relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
mrmr(relevance_table=relevance_table, regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy)