-
Notifications
You must be signed in to change notification settings - Fork 47
Coverage metrics #38
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Coverage metrics #38
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
import attr | ||
import pandas as pd | ||
|
||
from rectools import Columns | ||
|
||
from .base import Catalog, MetricAtK | ||
|
||
|
||
@attr.s
class ItemCoverage(MetricAtK):
    """
    Item space coverage (also referred to as catalog coverage).

    Shows what share of the catalog is covered by the first `k` recommendations:

        ItemCoverage = n_recommended_items / n_catalog_items

    The catalog is used only for its size: recommended items are not
    intersected with it, so the result is a proper share only when `catalog`
    holds unique ids and contains every recommended item.

    Parameters
    ----------
    k : int
        Number of items in top of recommendations list that will be used to calculate metric.

    """

    def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float:
        """
        Calculate item space coverage for all users.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        catalog : collection
            Collection of unique item ids that could be used for recommendations.

        Returns
        -------
        float
            Value of metric.

        Raises
        ------
        ZeroDivisionError
            If `catalog` is empty.
        """
        # Only the item column is needed, so select it together with the rank
        # filter instead of copying every column of the filtered frame.
        top_k_items = reco.loc[reco[Columns.Rank] <= self.k, Columns.Item]
        n_recommended_items = len(top_k_items.unique())
        return n_recommended_items / len(catalog)

    def calc_per_user(self, reco: pd.DataFrame, catalog: Catalog) -> pd.Series:
        """
        Calculate item space coverage per user.

        Users that have no rows with rank <= `k` do not appear in the result.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        catalog : collection
            Collection of unique item ids that could be used for recommendations.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        # Keep only the two columns the aggregation needs.
        top_k = reco.loc[reco[Columns.Rank] <= self.k, [Columns.User, Columns.Item]]
        # `rename(None)` drops the item column name so the result is an unnamed series.
        n_items_per_user = top_k.groupby(Columns.User)[Columns.Item].nunique().rename(None)
        return n_items_per_user / len(catalog)
|
||
|
||
@attr.s
class NumRetrieved(MetricAtK):
    """
    Number of recommendations retrieved.

    Shows how many items were recommended to users within the first `k`
    recommendations (the value is less than or equal to `k` when ranks are
    unique per user).

    See more: https://elliot.readthedocs.io/en/latest/guide/metrics/coverage.html

    Parameters
    ----------
    k : int
        Number of items in top of recommendations list that will be used to calculate metric.

    """

    def calc(self, reco: pd.DataFrame) -> float:
        """
        Calculate average num retrieved for all users.

        If num retrieved equals `k`, it means that `k` items were recommended to every user.
        The average is taken over the users returned by `calc_per_user`, i.e. users
        with at least one row of rank <= `k`.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.

        Returns
        -------
        float
            Value of metric (average between users).
        """
        per_user = self.calc_per_user(reco)
        return per_user.mean()

    def calc_per_user(self, reco: pd.DataFrame) -> pd.Series:
        """
        Calculate num retrieved per user.

        Users that have no rows with rank <= `k` do not appear in the result.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        # Keep only the two columns the aggregation needs instead of copying
        # every column of the filtered frame.
        top_k = reco.loc[reco[Columns.Rank] <= self.k, [Columns.User, Columns.Item]]
        n_items_per_user = top_k.groupby(Columns.User)[Columns.Item].count()
        # `rename(None)` drops the item column name so the result is an unnamed series.
        return n_items_per_user.rename(None)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's better to store the results of complex expressions in separate variables. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# pylint: disable=attribute-defined-outside-init | ||
|
||
import pandas as pd | ||
|
||
from rectools import Columns | ||
from rectools.metrics import ItemCoverage, NumRetrieved | ||
|
||
RECO = pd.DataFrame( | ||
{ | ||
Columns.User: [1, 1, 1, 1, 2, 2, 3, 4, 4], | ||
Columns.Item: [1, 2, 3, 4, 1, 2, 1, 1, 5], | ||
Columns.Rank: [1, 2, 3, 4, 1, 2, 1, 1, 2], | ||
} | ||
) | ||
|
||
CATALOG = list(range(10)) | ||
|
||
|
||
class TestItemCoverage:
    """Checks `ItemCoverage` with k=3 on the module-level `RECO` and `CATALOG` fixtures."""

    def setup_method(self) -> None:
        # `setup_method` is pytest's xunit-style per-test hook; the nose-style
        # `setup` name is no longer called by pytest >= 8.
        self.metric = ItemCoverage(k=3)

    def test_calc(self) -> None:
        # Distinct items with rank <= 3 per user, divided by the catalog size (10).
        expected_metric_per_user = pd.Series(
            [0.3, 0.2, 0.1, 0.2],
            index=pd.Series([1, 2, 3, 4], name=Columns.User),
        )
        actual_metric_per_user = self.metric.calc_per_user(RECO, CATALOG)
        pd.testing.assert_series_equal(actual_metric_per_user, expected_metric_per_user)
        # Items 1, 2, 3 and 5 appear within the top 3 -> 4 / 10.
        assert self.metric.calc(RECO, CATALOG) == 0.4
|
||
|
||
class TestNumRetrieved:
    """Checks `NumRetrieved` with k=3 on the module-level `RECO` fixture."""

    def setup_method(self) -> None:
        # `setup_method` is pytest's xunit-style per-test hook; the nose-style
        # `setup` name is no longer called by pytest >= 8.
        self.metric = NumRetrieved(k=3)

    def test_calc(self) -> None:
        # Number of rows with rank <= 3 for each user.
        expected_metric_per_user = pd.Series(
            [3, 2, 1, 2],
            index=pd.Series([1, 2, 3, 4], name=Columns.User),
        )
        actual_metric_per_user = self.metric.calc_per_user(RECO)
        pd.testing.assert_series_equal(actual_metric_per_user, expected_metric_per_user)
        # The aggregate metric is the mean of the per-user values.
        assert self.metric.calc(RECO) == expected_metric_per_user.mean()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As we need only the item column, let's select just that column.
This will be more memory-efficient and faster as well.