Skip to content

Commit

Permalink
ENH rename collapsed "rest-n" to "other n" in binned strings (#197)
Browse files Browse the repository at this point in the history
  • Loading branch information
lorentzenchr authored Feb 17, 2025
1 parent f8bd180 commit 70e376a
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 35 deletions.
17 changes: 2 additions & 15 deletions docs/examples/regression_on_workers_compensation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -502,23 +502,10 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "70f55852-de16-4dc6-b35e-ad3dfef93b3e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lorentzen/github/model-diagnostics/.hatch/jupyter/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py:303: UserWarning: When `set_output` is configured to be 'pandas', `func` should return a pandas DataFrame to follow the `set_output` API or `feature_names_out` should be defined.\n",
" warnings.warn(warn_msg.format(\"pandas\"))\n",
"/Users/lorentzen/github/model-diagnostics/.hatch/jupyter/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py:303: UserWarning: When `set_output` is configured to be 'pandas', `func` should return a pandas DataFrame to follow the `set_output` API or `feature_names_out` should be defined.\n",
" warnings.warn(warn_msg.format(\"pandas\"))\n",
"/Users/lorentzen/github/model-diagnostics/.hatch/jupyter/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py:303: UserWarning: When `set_output` is configured to be 'pandas', `func` should return a pandas DataFrame to follow the `set_output` API or `feature_names_out` should be defined.\n",
" warnings.warn(warn_msg.format(\"pandas\"))\n"
]
}
],
"outputs": [],
"source": [
"from sklearn.compose import ColumnTransformer, TransformedTargetRegressor\n",
"from sklearn.linear_model import LinearRegression\n",
Expand Down
8 changes: 4 additions & 4 deletions src/model_diagnostics/_utils/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def bin_feature(
For other features, columns are:
- `bin`: The binned version of it, i.e. the many too small values are put
together as `"rest-n"` where `n` is the number of unique values it contains.
together as `"other n"` where `n` is the number of unique values it contains.
"""
is_categorical = False
is_enum = False
Expand Down Expand Up @@ -171,7 +171,7 @@ def bin_feature(
# "d" 1
# with n_bins = 3. As we want the effective number of bins to be at most
# n_bins, we want, in the above case, only "a" and "b" in the final result. All
# the others a put into the second bin and called "rest-2" because it comprises
# the others are put into the second bin and called "other 2" because it comprises
# 2 unique feature values (c, d). Ties are dealt with by sorting.

# value_counts(sort=True) sorts ties by first occurrence, we want
Expand Down Expand Up @@ -201,10 +201,10 @@ def bin_feature(
keep_values[-1] = None
else:
keep_values = value_counts[feature_name].head(n_bins_ef - 1)
# Number of feature values to put into one bin, called "rest-n",
# Number of feature values to put into one bin, called "other n",
# n = n_remaining.
n_remaining = value_counts.shape[0] - (n_bins_ef - 1)
remaining_name = "rest-" + _format_integer(n_remaining)
remaining_name = "other " + _format_integer(n_remaining)
while remaining_name in keep_values:
remaining_name = "_" + remaining_name
return_dtype = feature.dtype
Expand Down
14 changes: 7 additions & 7 deletions src/model_diagnostics/_utils/tests/test_binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ def test_binning_strings_categorical(with_null, feature_type, n_bins):
if with_null:
feature[0] = None
assert feature.has_nulls()
rest_name = "rest-3"
rest_name = "other 3"
f_binned_expected = pl.Series(
name="bin", values=["rest-3", "b", "rest-3"] * 3 + ["rest-3"]
name="bin", values=["other 3", "b", "other 3"] * 3 + ["other 3"]
)
f_binned_expected[0] = None
else:
rest_name = "rest-2"
rest_name = "other 2"
f_binned_expected = pl.Series(
name="bin", values=["a", "b", "rest-2"] * 3 + ["rest-2"]
name="bin", values=["a", "b", "other 2"] * 3 + ["other 2"]
)
if feature_type == "cat":
feature = feature.cast(pl.Categorical)
Expand Down Expand Up @@ -166,11 +166,11 @@ def test_binning_auto():


def test_binning_strings_with_rest():
"""Test what happens if 'rest-n' is already taken."""
"""Test what happens if 'other n' is already taken."""
n_bins = 3
feature = pl.Series(name="my_feature", values=["a", "rest-2"] * 5 + ["c", "d"] * 2)
feature = pl.Series(name="my_feature", values=["a", "other 2"] * 5 + ["c", "d"] * 2)
f_binned_expected = pl.Series(
name="bin", values=["a", "rest-2"] * 5 + ["_rest-2"] * 4
name="bin", values=["a", "other 2"] * 5 + ["_other 2"] * 4
)
n_obs = len(feature)

Expand Down
4 changes: 2 additions & 2 deletions src/model_diagnostics/calibration/identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,10 +853,10 @@ def compute_marginal(
# Add partial dependence.
with_pd = predict_function is not None and feature_name is not None
if with_pd:
# In case we have "rest-n" string/cat/enum, we must exclude it from pd
# In case we have "other n" string/cat/enum, we must exclude it from pd
# because it is an artificial value and not part of the real data.
has_rest_n = (
is_cat_or_string and "rest-" in df.get_column(feature_name)[-1]
is_cat_or_string and "other " in df.get_column(feature_name)[-1]
)
if has_rest_n:
# Note that null, if present, is the first not the last values.
Expand Down
14 changes: 7 additions & 7 deletions src/model_diagnostics/calibration/tests/test_identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,10 +311,10 @@ def test_compute_bias_n_bins_string_like_feature(feature_type):

if feature_type == "enum":
expected_feature = pl.Series(
values=[None, "b", "rest-2"], dtype=pl.Enum(["b", "a", "c", "rest-2"])
values=[None, "b", "other 2"], dtype=pl.Enum(["b", "a", "c", "other 2"])
)
else:
expected_feature = pl.Series(values=[None, "a", "rest-2"], dtype=dtype)
expected_feature = pl.Series(values=[None, "a", "other 2"], dtype=dtype)

df_expected = pl.DataFrame(
{
Expand Down Expand Up @@ -839,11 +839,11 @@ def test_compute_marginal_n_bins_string_like_feature(feature_type):

if feature_type == "enum":
feature_expected = pl.Series(
[None, "b", "rest-2"], dtype=pl.Enum(["b", "a", "c", "rest-2"])
[None, "b", "other 2"], dtype=pl.Enum(["b", "a", "c", "other 2"])
)
y_obs_mean = [0.0, 2, 4 / 3]
else:
feature_expected = pl.Series([None, "a", "rest-2"], dtype=dtype)
feature_expected = pl.Series([None, "a", "other 2"], dtype=dtype)
y_obs_mean = [0.0, 1, 2]

df_expected = pl.DataFrame(
Expand Down Expand Up @@ -1018,7 +1018,7 @@ def predict(X):

@pytest.mark.parametrize("with_null", [False, True])
def test_compute_marginal_with_partial_dependence_on_strings(with_null):
"""Test partial_dependence in compute_marginal with a rest-n value."""
"""Test partial_dependence in compute_marginal with an 'other n' value."""
n_obs = 20
n_bins = 3
X = pl.DataFrame(
Expand All @@ -1028,7 +1028,7 @@ def test_compute_marginal_with_partial_dependence_on_strings(with_null):
}
)

# The Python function ord throws an error if "rest-3" is passed to it.
# The Python function ord throws an error if "other 3" is passed to it.
def predict(X):
a = X.get_column("a")
b = (
Expand All @@ -1053,7 +1053,7 @@ def predict(X):

df_expected = pl.DataFrame(
{
"b": [None, "a", "rest-3"] if with_null else ["a", "b", "rest-3"],
"b": [None, "a", "other 3"] if with_null else ["a", "b", "other 3"],
"y_obs_mean": 0.0,
"partial_dependence": [-10.0, 2, None] if with_null else [2.0, -1, None],
}
Expand Down

0 comments on commit 70e376a

Please sign in to comment.