ENH rename collapsed "rest-n" to "other n" in binned strings (#197)

lorentzenchr · Feb 17, 2025 · 70e376a · 70e376a
1 parent f8bd180
commit 70e376a
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 35 deletions.
diff --git a/docs/examples/regression_on_workers_compensation.ipynb b/docs/examples/regression_on_workers_compensation.ipynb
@@ -502,23 +502,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "70f55852-de16-4dc6-b35e-ad3dfef93b3e",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/lorentzen/github/model-diagnostics/.hatch/jupyter/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py:303: UserWarning: When `set_output` is configured to be 'pandas', `func` should return a pandas DataFrame to follow the `set_output` API  or `feature_names_out` should be defined.\n",
-      "  warnings.warn(warn_msg.format(\"pandas\"))\n",
-      "/Users/lorentzen/github/model-diagnostics/.hatch/jupyter/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py:303: UserWarning: When `set_output` is configured to be 'pandas', `func` should return a pandas DataFrame to follow the `set_output` API  or `feature_names_out` should be defined.\n",
-      "  warnings.warn(warn_msg.format(\"pandas\"))\n",
-      "/Users/lorentzen/github/model-diagnostics/.hatch/jupyter/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py:303: UserWarning: When `set_output` is configured to be 'pandas', `func` should return a pandas DataFrame to follow the `set_output` API  or `feature_names_out` should be defined.\n",
-      "  warnings.warn(warn_msg.format(\"pandas\"))\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sklearn.compose import ColumnTransformer, TransformedTargetRegressor\n",
     "from sklearn.linear_model import LinearRegression\n",

diff --git a/src/model_diagnostics/_utils/binning.py b/src/model_diagnostics/_utils/binning.py
@@ -81,7 +81,7 @@ def bin_feature(
         For other features, columns are:
 
         - `bin`: The binned version of it, i.e. the many too small values are put
-          together as `"rest-n"` where `n` is the number of unique values it contains.
+          together as `"other n"` where `n` is the number of unique values it contains.
     """
     is_categorical = False
     is_enum = False
@@ -171,7 +171,7 @@ def bin_feature(
         #         "d"      1
         # with n_bins = 3. As we want the effective number of bins to be at most
         # n_bins, we want, in the above case, only "a" and "b" in the final result. All
-        # the others a put into the second bin and called "rest-2" because it comprises
+        # the others are put into the second bin and called "other 2" because it
         # comprises 2 unique feature values (c, d). Ties are dealt with by sorting.
 
         # value_counts(sort=True) sorts ties by first occurrence, we want
@@ -201,10 +201,10 @@ def bin_feature(
                     keep_values[-1] = None
             else:
                 keep_values = value_counts[feature_name].head(n_bins_ef - 1)
-            # Number of feature values to put into one bin, called "rest-n",
+            # Number of feature values to put into one bin, called "other n",
             # n = n_remaining.
             n_remaining = value_counts.shape[0] - (n_bins_ef - 1)
-            remaining_name = "rest-" + _format_integer(n_remaining)
+            remaining_name = "other " + _format_integer(n_remaining)
             while remaining_name in keep_values:
                 remaining_name = "_" + remaining_name
             return_dtype = feature.dtype

diff --git a/src/model_diagnostics/_utils/tests/test_binning.py b/src/model_diagnostics/_utils/tests/test_binning.py
@@ -31,15 +31,15 @@ def test_binning_strings_categorical(with_null, feature_type, n_bins):
     if with_null:
         feature[0] = None
         assert feature.has_nulls()
-        rest_name = "rest-3"
+        rest_name = "other 3"
         f_binned_expected = pl.Series(
-            name="bin", values=["rest-3", "b", "rest-3"] * 3 + ["rest-3"]
+            name="bin", values=["other 3", "b", "other 3"] * 3 + ["other 3"]
         )
         f_binned_expected[0] = None
     else:
-        rest_name = "rest-2"
+        rest_name = "other 2"
         f_binned_expected = pl.Series(
-            name="bin", values=["a", "b", "rest-2"] * 3 + ["rest-2"]
+            name="bin", values=["a", "b", "other 2"] * 3 + ["other 2"]
         )
     if feature_type == "cat":
         feature = feature.cast(pl.Categorical)
@@ -166,11 +166,11 @@ def test_binning_auto():
 
 
 def test_binning_strings_with_rest():
-    """Test what happens if 'rest-n' is already taken."""
+    """Test what happens if 'other n' is already taken."""
     n_bins = 3
-    feature = pl.Series(name="my_feature", values=["a", "rest-2"] * 5 + ["c", "d"] * 2)
+    feature = pl.Series(name="my_feature", values=["a", "other 2"] * 5 + ["c", "d"] * 2)
     f_binned_expected = pl.Series(
-        name="bin", values=["a", "rest-2"] * 5 + ["_rest-2"] * 4
+        name="bin", values=["a", "other 2"] * 5 + ["_other 2"] * 4
     )
     n_obs = len(feature)
 

diff --git a/src/model_diagnostics/calibration/identification.py b/src/model_diagnostics/calibration/identification.py
@@ -853,10 +853,10 @@ def compute_marginal(
             # Add partial dependence.
             with_pd = predict_function is not None and feature_name is not None
             if with_pd:
-                # In case we have "rest-n" string/cat/enum, we must exclude it from pd
+                # In case we have "other n" string/cat/enum, we must exclude it from pd
                 # because it is an artificial value and not part of the real data.
                 has_rest_n = (
-                    is_cat_or_string and "rest-" in df.get_column(feature_name)[-1]
+                    is_cat_or_string and "other " in df.get_column(feature_name)[-1]
                 )
                 if has_rest_n:
                     # Note that null, if present, is the first, not the last, value.

diff --git a/src/model_diagnostics/calibration/tests/test_identification.py b/src/model_diagnostics/calibration/tests/test_identification.py
@@ -311,10 +311,10 @@ def test_compute_bias_n_bins_string_like_feature(feature_type):
 
         if feature_type == "enum":
             expected_feature = pl.Series(
-                values=[None, "b", "rest-2"], dtype=pl.Enum(["b", "a", "c", "rest-2"])
+                values=[None, "b", "other 2"], dtype=pl.Enum(["b", "a", "c", "other 2"])
             )
         else:
-            expected_feature = pl.Series(values=[None, "a", "rest-2"], dtype=dtype)
+            expected_feature = pl.Series(values=[None, "a", "other 2"], dtype=dtype)
 
         df_expected = pl.DataFrame(
             {
@@ -839,11 +839,11 @@ def test_compute_marginal_n_bins_string_like_feature(feature_type):
 
         if feature_type == "enum":
             feature_expected = pl.Series(
-                [None, "b", "rest-2"], dtype=pl.Enum(["b", "a", "c", "rest-2"])
+                [None, "b", "other 2"], dtype=pl.Enum(["b", "a", "c", "other 2"])
             )
             y_obs_mean = [0.0, 2, 4 / 3]
         else:
-            feature_expected = pl.Series([None, "a", "rest-2"], dtype=dtype)
+            feature_expected = pl.Series([None, "a", "other 2"], dtype=dtype)
             y_obs_mean = [0.0, 1, 2]
 
         df_expected = pl.DataFrame(
@@ -1018,7 +1018,7 @@ def predict(X):
 
 @pytest.mark.parametrize("with_null", [False, True])
 def test_compute_marginal_with_partial_dependence_on_strings(with_null):
-    """Test partial_dependence in compute_marginal with a rest-n value."""
+    """Test partial_dependence in compute_marginal with an "other n" value."""
     n_obs = 20
     n_bins = 3
     X = pl.DataFrame(
@@ -1028,7 +1028,7 @@ def test_compute_marginal_with_partial_dependence_on_strings(with_null):
         }
     )
 
-    # The Python function ord throws an error if "rest-3" is passed to it.
+    # The Python function ord throws an error if "other 3" is passed to it.
     def predict(X):
         a = X.get_column("a")
         b = (
@@ -1053,7 +1053,7 @@ def predict(X):
 
     df_expected = pl.DataFrame(
         {
-            "b": [None, "a", "rest-3"] if with_null else ["a", "b", "rest-3"],
+            "b": [None, "a", "other 3"] if with_null else ["a", "b", "other 3"],
             "y_obs_mean": 0.0,
             "partial_dependence": [-10.0, 2, None] if with_null else [2.0, -1, None],
         }