From d4ddd4d54b15df14e44f9e65d387763814eb42c2 Mon Sep 17 00:00:00 2001 From: jegorus Date: Thu, 4 May 2023 18:18:17 +0300 Subject: [PATCH 1/3] adding coverage mvp after pull --- rectools/metrics/__init__.py | 5 ++ rectools/metrics/coverage.py | 109 +++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 rectools/metrics/coverage.py diff --git a/rectools/metrics/__init__.py b/rectools/metrics/__init__.py index 16fbb95e..f4080e85 100644 --- a/rectools/metrics/__init__.py +++ b/rectools/metrics/__init__.py @@ -29,6 +29,8 @@ `metrics.MAP` `metrics.NDCG` `metrics.MRR` +`metrics.ItemCoverage` +`metrics.NumRetrieved` `metrics.MeanInvUserFreq` `metrics.IntraListDiversity` `metrics.Serendipity` @@ -52,6 +54,7 @@ from .ranking import MAP, MRR, NDCG from .scoring import calc_metrics from .serendipity import Serendipity +from .coverage import ItemCoverage, NumRetrieved __all__ = ( "Precision", @@ -65,6 +68,8 @@ "MeanInvUserFreq", "IntraListDiversity", "Serendipity", + "ItemCoverage", + "NumRetrieved", "calc_metrics", "PairwiseDistanceCalculator", "PairwiseHammingDistanceCalculator", diff --git a/rectools/metrics/coverage.py b/rectools/metrics/coverage.py new file mode 100644 index 00000000..a6fc38fd --- /dev/null +++ b/rectools/metrics/coverage.py @@ -0,0 +1,109 @@ +import attr +import pandas as pd + +from rectools import Columns + +from .base import Catalog, MetricAtK + + +@attr.s +class ItemCoverage(MetricAtK): + """ + Item space coverage (also referred as catalog coverage) is a metric that shows + what part of the items is covered by first k recommendations + ItemCoverage = #recommended_items / num_items + + Parameters + ---------- + k : int + Number of items in top of recommendations list that will be used to calculate metric. 
+ + """ + + def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float: + """ + Calculate item space coverage for all users + + Parameters + ---------- + reco : pd.DataFrame + Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`. + catalog : collection + Collection of unique item ids that could be used for recommendations. + + Returns + ------- + float + Value of metric. + """ + reco_k_first_ranks = reco[reco[Columns.Rank] <= self.k] + return len(reco_k_first_ranks[Columns.Item].unique()) / len(catalog) + + def calc_per_user(self, reco: pd.DataFrame, catalog: Catalog) -> pd.Series: + """ + Calculate item space coverage per user + + Parameters + ---------- + reco : pd.DataFrame + Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`. + catalog : collection + Collection of unique item ids that could be used for recommendations. + + Returns + ------- + pd.Series + Values of metric (index - user id, values - metric value for every user). + """ + reco_k_first_ranks = reco[reco[Columns.Rank] <= self.k] + return reco_k_first_ranks.groupby(Columns.User)[Columns.Item].nunique() / len(catalog) + + +@attr.s +class NumRetrieved(MetricAtK): + """ + Number of recommendations retrieved is a metric that shows + how much items retrieved by first k recommendations (less or equal k) + See more: https://elliot.readthedocs.io/en/latest/guide/metrics/coverage.html + + Parameters + ---------- + k : int + Number of items in top of recommendations list that will be used to calculate metric. + + """ + + def calc(self, reco: pd.DataFrame) -> float: + """ + Calculate average num retrieved for all users. + If num retrieved equals k, it means that k items were recommended to every user + + Parameters + ---------- + reco : pd.DataFrame + Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`. + + Returns + ------- + float + Value of metric (average between users). 
+ """ + per_user = self.calc_per_user(reco) + return per_user.mean() + + def calc_per_user(self, reco: pd.DataFrame) -> pd.Series: + """ + Calculate num retrieved per user. + + Parameters + ---------- + reco : pd.DataFrame + Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`. + + Returns + ------- + pd.Series + Values of metric (index - user id, values - metric value for every user). + """ + reco_k_first_ranks = reco[reco[Columns.Rank] <= self.k] + return reco_k_first_ranks.groupby(Columns.User)[Columns.Item].count() From 77dbb7bf2369a284242865d354732f70b8c5d66e Mon Sep 17 00:00:00 2001 From: jegorus Date: Thu, 4 May 2023 20:01:43 +0300 Subject: [PATCH 2/3] adding tests for item coverage and num retrieved --- rectools/metrics/__init__.py | 2 +- rectools/metrics/coverage.py | 4 ++-- tests/metrics/test_coverage.py | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 tests/metrics/test_coverage.py diff --git a/rectools/metrics/__init__.py b/rectools/metrics/__init__.py index f4080e85..8e3b7893 100644 --- a/rectools/metrics/__init__.py +++ b/rectools/metrics/__init__.py @@ -44,6 +44,7 @@ """ from .classification import MCC, Accuracy, F1Beta, Precision, Recall +from .coverage import ItemCoverage, NumRetrieved from .distances import ( PairwiseDistanceCalculator, PairwiseHammingDistanceCalculator, @@ -54,7 +55,6 @@ from .ranking import MAP, MRR, NDCG from .scoring import calc_metrics from .serendipity import Serendipity -from .coverage import ItemCoverage, NumRetrieved __all__ = ( "Precision", diff --git a/rectools/metrics/coverage.py b/rectools/metrics/coverage.py index a6fc38fd..0c8bc16d 100644 --- a/rectools/metrics/coverage.py +++ b/rectools/metrics/coverage.py @@ -56,7 +56,7 @@ def calc_per_user(self, reco: pd.DataFrame, catalog: Catalog) -> pd.Series: Values of metric (index - user id, values - metric value for every user). 
""" reco_k_first_ranks = reco[reco[Columns.Rank] <= self.k] - return reco_k_first_ranks.groupby(Columns.User)[Columns.Item].nunique() / len(catalog) + return reco_k_first_ranks.groupby(Columns.User)[Columns.Item].nunique().rename(None) / len(catalog) @attr.s @@ -106,4 +106,4 @@ def calc_per_user(self, reco: pd.DataFrame) -> pd.Series: Values of metric (index - user id, values - metric value for every user). """ reco_k_first_ranks = reco[reco[Columns.Rank] <= self.k] - return reco_k_first_ranks.groupby(Columns.User)[Columns.Item].count() + return reco_k_first_ranks.groupby(Columns.User)[Columns.Item].count().rename(None) diff --git a/tests/metrics/test_coverage.py b/tests/metrics/test_coverage.py new file mode 100644 index 00000000..7b7661cc --- /dev/null +++ b/tests/metrics/test_coverage.py @@ -0,0 +1,42 @@ +# pylint: disable=attribute-defined-outside-init + +import pandas as pd + +from rectools import Columns +from rectools.metrics import ItemCoverage, NumRetrieved + +RECO = pd.DataFrame( + { + Columns.User: [1, 1, 1, 1, 2, 2, 3, 4, 4], + Columns.Item: [1, 2, 3, 4, 1, 2, 1, 1, 5], + Columns.Rank: [1, 2, 3, 4, 1, 2, 1, 1, 2], + } +) + +CATALOG = list(range(10)) + + +class TestItemCoverage: + def setup(self) -> None: + self.metric = ItemCoverage(k=3) + + def test_calc(self) -> None: + expected_metric_per_user = pd.Series( + [0.3, 0.2, 0.1, 0.2], + index=pd.Series([1, 2, 3, 4], name=Columns.User), + ) + pd.testing.assert_series_equal(self.metric.calc_per_user(RECO, CATALOG), expected_metric_per_user) + assert self.metric.calc(RECO, CATALOG) == 0.4 + + +class TestNumRetrieved: + def setup(self) -> None: + self.metric = NumRetrieved(k=3) + + def test_calc(self) -> None: + expected_metric_per_user = pd.Series( + [3, 2, 1, 2], + index=pd.Series([1, 2, 3, 4], name=Columns.User), + ) + pd.testing.assert_series_equal(self.metric.calc_per_user(RECO), expected_metric_per_user) + assert self.metric.calc(RECO) == expected_metric_per_user.mean() From 
3ffc09cd1cbb07a646d1e8109a537c71ba68c358 Mon Sep 17 00:00:00 2001
From: jegorus
Date: Wed, 10 May 2023 10:18:49 +0300
Subject: [PATCH 3/3] recommit for ci

---
 rectools/metrics/coverage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rectools/metrics/coverage.py b/rectools/metrics/coverage.py
index 0c8bc16d..932dea91 100644
--- a/rectools/metrics/coverage.py
+++ b/rectools/metrics/coverage.py
@@ -63,7 +63,7 @@ def calc_per_user(self, reco: pd.DataFrame, catalog: Catalog) -> pd.Series:
 class NumRetrieved(MetricAtK):
     """
     Number of recommendations retrieved is a metric that shows
-    how much items retrieved by first k recommendations (less or equal k)
+    how many items were recommended to users by first k recommendations (less or equal k)
     See more: https://elliot.readthedocs.io/en/latest/guide/metrics/coverage.html
 
     Parameters