fix(api): Return most native type for metrics in EstimatorReport (#1283)
closes #1275 

Make the metric methods of `EstimatorReport` return the most native type
possible: a `float`, a `dict`, or a numpy array.
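
For illustration, a minimal sketch of the intended behaviour after this change, assuming a small toy classification task (the estimator, data, and the values in the comments are placeholders, and the exact shape of the `dict` output is inferred from the description above):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from skore import EstimatorReport

# Toy binary classification problem, purely for demonstration.
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

report = EstimatorReport(
    LogisticRegression(),
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
)

# A single-valued metric now comes back as a plain Python float ...
accuracy = report.metrics.accuracy()  # e.g. 0.92 (hypothetical value)

# ... while a per-class metric such as precision is expected to come back as a
# dict keyed by class label (values below are hypothetical).
precision = report.metrics.precision()  # e.g. {0: 0.91, 1: 0.93}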
glemaitre authored Feb 10, 2025
1 parent 0a2b82a commit 24b2bc9
Showing 8 changed files with 263 additions and 269 deletions.
43 changes: 15 additions & 28 deletions examples/model_evaluation/plot_estimator_report.py
@@ -13,8 +13,9 @@
# Loading our dataset and defining our estimator
# ==============================================
#
# First, we load a dataset from skrub. Our goal is to predict if a company paid a physician. The ultimate goal is to
# detect potential conflict of interest when it comes to the actual problem that we want to solve.
# First, we load a dataset from skrub. Our goal is to predict if a company paid a
# physician. The ultimate goal is to detect potential conflict of interest when it comes
# to the actual problem that we want to solve.

# %%
from skrub.datasets import fetch_open_payments
@@ -79,15 +80,12 @@
report = EstimatorReport(
estimator, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)
report

# %%
#
# Once the report is created, we get some information regarding the available tools
# allowing us to get some insights from our specific model on our specific task.
#
# We can get a similar information if we call the :meth:`~skore.EstimatorReport.help`
# method.
# allowing us to get some insights from our specific model on our specific task by
# calling the :meth:`~skore.EstimatorReport.help` method.
report.help()

# %%
@@ -257,10 +255,7 @@ def operational_decision_cost(y_true, y_pred, amount):
# We can now compute the cost of our operational decision.
start = time.time()
cost = report.metrics.custom_metric(
metric_function=operational_decision_cost,
metric_name="Operational Decision Cost",
response_method="predict",
amount=amount,
metric_function=operational_decision_cost, response_method="predict", amount=amount
)
end = time.time()
cost
@@ -276,10 +271,7 @@ def operational_decision_cost(y_true, y_pred, amount):
# %%
start = time.time()
cost = report.metrics.custom_metric(
metric_function=operational_decision_cost,
metric_name="Operational Decision Cost",
response_method="predict",
amount=amount,
metric_function=operational_decision_cost, response_method="predict", amount=amount
)
end = time.time()
cost
@@ -294,12 +286,9 @@ def operational_decision_cost(y_true, y_pred, amount):
# the predictions.
report.metrics.report_metrics(
scoring=["precision", "recall", operational_decision_cost],
scoring_names=["Precision", "Recall", "Operational Decision Cost"],
pos_label=pos_label,
scoring_kwargs={
"amount": amount,
"response_method": "predict",
"metric_name": "Operational Decision Cost",
},
scoring_kwargs={"amount": amount, "response_method": "predict"},
)

# %%
@@ -310,16 +299,14 @@ def operational_decision_cost(y_true, y_pred, amount):
# function.
from sklearn.metrics import make_scorer, f1_score

f1_scorer = make_scorer(
f1_score, response_method="predict", metric_name="F1 Score", pos_label=pos_label
)
f1_scorer = make_scorer(f1_score, response_method="predict", pos_label=pos_label)
operational_decision_cost_scorer = make_scorer(
operational_decision_cost,
response_method="predict",
metric_name="Operational Decision Cost",
amount=amount,
operational_decision_cost, response_method="predict", amount=amount
)
report.metrics.report_metrics(
scoring=[f1_scorer, operational_decision_cost_scorer],
scoring_names=["F1 Score", "Operational Decision Cost"],
)
report.metrics.report_metrics(scoring=[f1_scorer, operational_decision_cost_scorer])

# %%
#
1 change: 1 addition & 0 deletions skore/src/skore/persistence/view/view.py
@@ -12,6 +12,7 @@ class View:
Examples
--------
>>> from skore.persistence.view import View
>>> View(layout=["a", "b"])
View(...)
"""
105 changes: 55 additions & 50 deletions skore/src/skore/sklearn/_cross_validation/metrics_accessor.py
@@ -1,6 +1,7 @@
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.utils.metaestimators import available_if

from skore.externals._pandas_accessors import DirNamesMixin
@@ -20,17 +21,17 @@ class _MetricsAccessor(_BaseAccessor, DirNamesMixin):
You can access this accessor using the `metrics` attribute.
"""

_SCORE_OR_LOSS_ICONS = {
"accuracy": "(↗︎)",
"precision": "(↗︎)",
"recall": "(↗︎)",
"brier_score": "(↘︎)",
"roc_auc": "(↗︎)",
"log_loss": "(↘︎)",
"r2": "(↗︎)",
"rmse": "(↘︎)",
"report_metrics": "",
"custom_metric": "",
_SCORE_OR_LOSS_INFO = {
"accuracy": {"name": "Accuracy", "icon": "(↗︎)"},
"precision": {"name": "Precision", "icon": "(↗︎)"},
"recall": {"name": "Recall", "icon": "(↗︎)"},
"brier_score": {"name": "Brier score", "icon": "(↘︎)"},
"roc_auc": {"name": "ROC AUC", "icon": "(↗︎)"},
"log_loss": {"name": "Log loss", "icon": "(↘︎)"},
"r2": {"name": "R²", "icon": "(↗︎)"},
"rmse": {"name": "RMSE", "icon": "(↘︎)"},
"custom_metric": {"name": "Custom metric", "icon": ""},
"report_metrics": {"name": "Report metrics", "icon": ""},
}

def __init__(self, parent):
@@ -210,8 +211,8 @@ def accuracy(self, *, data_source="test", aggregate=None):
LogisticRegression Split #0 0.94...
Split #1 0.94...
"""
return self._compute_metric_scores(
report_metric_name="accuracy",
return self.report_metrics(
scoring=["accuracy"],
data_source=data_source,
aggregate=aggregate,
)
@@ -285,16 +286,16 @@ def precision(
>>> report = CrossValidationReport(classifier, X=X, y=y, cv_splitter=2)
>>> report.metrics.precision()
Metric Precision (↗︎)
Class label 0 1
Label / Average 0 1
LogisticRegression Split #0 0.96... 0.93...
Split #1 0.90... 0.96...
"""
return self._compute_metric_scores(
report_metric_name="precision",
return self.report_metrics(
scoring=["precision"],
data_source=data_source,
aggregate=aggregate,
average=average,
pos_label=pos_label,
scoring_kwargs={"average": average},
)

@available_if(
@@ -367,16 +368,16 @@ def recall(
>>> report = CrossValidationReport(classifier, X=X, y=y, cv_splitter=2)
>>> report.metrics.recall()
Metric Recall (↗︎)
Class label 0 1
Label / Average 0 1
LogisticRegression Split #0 0.87... 0.98...
Split #1 0.94... 0.94...
"""
return self._compute_metric_scores(
report_metric_name="recall",
return self.report_metrics(
scoring=["recall"],
data_source=data_source,
aggregate=aggregate,
average=average,
pos_label=pos_label,
scoring_kwargs={"average": average},
)

@available_if(
@@ -414,8 +415,8 @@ def brier_score(self, *, data_source="test", aggregate=None):
LogisticRegression Split #0 0.04...
Split #1 0.04...
"""
return self._compute_metric_scores(
report_metric_name="brier_score",
return self.report_metrics(
scoring=["brier_score"],
data_source=data_source,
aggregate=aggregate,
)
@@ -443,8 +444,7 @@ def roc_auc(
- "test" : use the test set provided when creating the report.
- "train" : use the train set provided when creating the report.
average : {"auto", "macro", "micro", "weighted", "samples"}, \
default=None
average : {"macro", "micro", "weighted", "samples"}, default=None
Average to compute the ROC AUC score in a multiclass setting. By default,
no average is computed. Otherwise, this determines the type of averaging
performed on the data.
@@ -498,12 +498,11 @@
LogisticRegression Split #0 0.99...
Split #1 0.98...
"""
return self._compute_metric_scores(
report_metric_name="roc_auc",
return self.report_metrics(
scoring=["roc_auc"],
data_source=data_source,
aggregate=aggregate,
average=average,
multi_class=multi_class,
scoring_kwargs={"average": average, "multi_class": multi_class},
)

@available_if(
Expand Down Expand Up @@ -543,8 +542,8 @@ def log_loss(self, *, data_source="test", aggregate=None):
LogisticRegression Split #0 0.1...
Split #1 0.1...
"""
return self._compute_metric_scores(
report_metric_name="log_loss",
return self.report_metrics(
scoring=["log_loss"],
data_source=data_source,
aggregate=aggregate,
)
@@ -598,11 +597,11 @@ def r2(
Ridge Split #0 0.36...
Split #1 0.39...
"""
return self._compute_metric_scores(
report_metric_name="r2",
return self.report_metrics(
scoring=["r2"],
data_source=data_source,
aggregate=aggregate,
multioutput=multioutput,
scoring_kwargs={"multioutput": multioutput},
)

@available_if(_check_supported_ml_task(supported_ml_tasks=["regression"]))
Expand Down Expand Up @@ -654,11 +653,11 @@ def rmse(
Ridge Split #0 59.9...
Split #1 61.4...
"""
return self._compute_metric_scores(
report_metric_name="rmse",
return self.report_metrics(
scoring=["rmse"],
data_source=data_source,
aggregate=aggregate,
multioutput=multioutput,
scoring_kwargs={"multioutput": multioutput},
)

def custom_metric(
Expand Down Expand Up @@ -732,15 +731,20 @@ def custom_metric(
Ridge Split #0 50.1...
Split #1 52.6...
"""
return self._compute_metric_scores(
report_metric_name="custom_metric",
data_source=data_source,
aggregate=aggregate,
metric_function=metric_function,
# create a scorer with `greater_is_better=True` to not alter the output of
# `metric_function`
scorer = make_scorer(
metric_function,
greater_is_better=True,
response_method=response_method,
metric_name=metric_name,
**kwargs,
)
return self.report_metrics(
scoring=[scorer],
data_source=data_source,
aggregate=aggregate,
scoring_names=[metric_name],
)

####################################################################################
# Methods related to the help tree
@@ -768,15 +772,16 @@ def _format_method_name(self, name):
"""Override format method for metrics-specific naming."""
method_name = f"{name}(...)"
method_name = method_name.ljust(22)
if name in self._SCORE_OR_LOSS_ICONS and self._SCORE_OR_LOSS_ICONS[name] in (
"(↗︎)",
"(↘︎)",
):
if self._SCORE_OR_LOSS_ICONS[name] == "(↗︎)":
method_name += f"[cyan]{self._SCORE_OR_LOSS_ICONS[name]}[/cyan]"
if name in self._SCORE_OR_LOSS_INFO and self._SCORE_OR_LOSS_INFO[name][
"icon"
] in ("(↗︎)", "(↘︎)"):
if self._SCORE_OR_LOSS_INFO[name]["icon"] == "(↗︎)":
method_name += f"[cyan]{self._SCORE_OR_LOSS_INFO[name]['name']}[/cyan]"
return method_name.ljust(43)
else: # (↘︎)
method_name += f"[orange1]{self._SCORE_OR_LOSS_ICONS[name]}[/orange1]"
method_name += (
f"[orange1]{self._SCORE_OR_LOSS_INFO[name]['name']}[/orange1]"
)
return method_name.ljust(49)
else:
return method_name.ljust(29)