Coverage metrics #38

Open · wants to merge 4 commits into base: main
5 changes: 5 additions & 0 deletions rectools/metrics/__init__.py
@@ -29,6 +29,8 @@
`metrics.MAP`
`metrics.NDCG`
`metrics.MRR`
+`metrics.ItemCoverage`
+`metrics.NumRetrieved`
`metrics.MeanInvUserFreq`
`metrics.IntraListDiversity`
`metrics.Serendipity`
@@ -42,6 +44,7 @@
"""

from .classification import MCC, Accuracy, F1Beta, Precision, Recall
+from .coverage import ItemCoverage, NumRetrieved
from .distances import (
PairwiseDistanceCalculator,
PairwiseHammingDistanceCalculator,
@@ -65,6 +68,8 @@
"MeanInvUserFreq",
"IntraListDiversity",
"Serendipity",
+"ItemCoverage",
+"NumRetrieved",
"calc_metrics",
"PairwiseDistanceCalculator",
"PairwiseHammingDistanceCalculator",
109 changes: 109 additions & 0 deletions rectools/metrics/coverage.py
@@ -0,0 +1,109 @@
import attr
import pandas as pd

from rectools import Columns

from .base import Catalog, MetricAtK


@attr.s
class ItemCoverage(MetricAtK):
"""
Item space coverage (also referred to as catalog coverage) is a metric that shows
what part of the items is covered by the first k recommendations
ItemCoverage = #recommended_items / num_items

Parameters
----------
k : int
Number of items in top of recommendations list that will be used to calculate metric.

"""

def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float:
"""
Calculate item space coverage for all users

Parameters
----------
reco : pd.DataFrame
Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
catalog : collection
Collection of unique item ids that could be used for recommendations.

Returns
-------
float
Value of metric.
"""
reco_k_first_ranks = reco[reco[Columns.Rank] <= self.k]
Collaborator:
As we need only the items column, let's take only it.

This will be more memory efficient and faster as well

items = reco.loc[reco[Columns.Rank] <= self.k, Columns.Item]

return len(reco_k_first_ranks[Columns.Item].unique()) / len(catalog)
Collaborator:
There is a nunique method, no need to use len.
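
Spelled out, calc with both suggestions applied (take only the item column, then use nunique) might look roughly like this; a sketch of the reviewers' proposal, not the merged code:

def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float:
    # Keep only the item column of the top-k rows: cheaper than slicing the whole frame
    items = reco.loc[reco[Columns.Rank] <= self.k, Columns.Item]
    # nunique counts the distinct recommended items directly
    return items.nunique() / len(catalog)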


def calc_per_user(self, reco: pd.DataFrame, catalog: Catalog) -> pd.Series:
Collaborator:
Maybe for this metric the calc_per_user method is meaningless, since recommendations for a user are unique and coverage then depends only on the k that we set here.

"""
Calculate item space coverage per user

Parameters
----------
reco : pd.DataFrame
Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
catalog : collection
Collection of unique item ids that could be used for recommendations.

Returns
-------
pd.Series
Values of metric (index - user id, values - metric value for every user).
"""
reco_k_first_ranks = reco[reco[Columns.Rank] <= self.k]
return reco_k_first_ranks.groupby(Columns.User)[Columns.Item].nunique().rename(None) / len(catalog)


@attr.s
class NumRetrieved(MetricAtK):
"""
Number of recommendations retrieved is a metric that shows
how much items were recommended to users by first k recommendations (less or equal k)
Collaborator:
much -> many

See more: https://elliot.readthedocs.io/en/latest/guide/metrics/coverage.html

Parameters
----------
k : int
Number of items in top of recommendations list that will be used to calculate metric.

"""

def calc(self, reco: pd.DataFrame) -> float:
"""
Calculate average num retrieved for all users.
If num retrieved equals k, it means that k items were recommended to every user

Parameters
----------
reco : pd.DataFrame
Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.

Returns
-------
float
Value of metric (average between users).
"""
per_user = self.calc_per_user(reco)
return per_user.mean()

def calc_per_user(self, reco: pd.DataFrame) -> pd.Series:
"""
Calculate num retrieved per user.

Parameters
----------
reco : pd.DataFrame
Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.

Returns
-------
pd.Series
Values of metric (index - user id, values - metric value for every user).
"""
reco_k_first_ranks = reco[reco[Columns.Rank] <= self.k]
Collaborator:
Same here

return reco_k_first_ranks.groupby(Columns.User)[Columns.Item].count().rename(None)
Collaborator:
It's better to store the results of complex expressions in separate variables.
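
Applied to calc_per_user, that suggestion (together with the item-column selection proposed above) could look like this; again only a sketch of the proposal, not the merged code:

def calc_per_user(self, reco: pd.DataFrame) -> pd.Series:
    # Keep only the user and item columns of the rows within the top-k ranks
    top_k = reco.loc[reco[Columns.Rank] <= self.k, [Columns.User, Columns.Item]]
    # Number of retrieved recommendations per user (always <= k)
    num_retrieved = top_k.groupby(Columns.User)[Columns.Item].count().rename(None)
    return num_retrieved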

42 changes: 42 additions & 0 deletions tests/metrics/test_coverage.py
@@ -0,0 +1,42 @@
# pylint: disable=attribute-defined-outside-init

import pandas as pd

from rectools import Columns
from rectools.metrics import ItemCoverage, NumRetrieved

RECO = pd.DataFrame(
{
Columns.User: [1, 1, 1, 1, 2, 2, 3, 4, 4],
Columns.Item: [1, 2, 3, 4, 1, 2, 1, 1, 5],
Columns.Rank: [1, 2, 3, 4, 1, 2, 1, 1, 2],
}
)

CATALOG = list(range(10))


class TestItemCoverage:
def setup(self) -> None:
self.metric = ItemCoverage(k=3)

def test_calc(self) -> None:
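# Top-3 recommendations per user are {1, 2, 3}, {1, 2}, {1} and {1, 5}: 3, 2, 1 and 2 unique items
# out of the 10-item catalog; together they cover {1, 2, 3, 5}, i.e. 4 / 10 = 0.4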
expected_metric_per_user = pd.Series(
[0.3, 0.2, 0.1, 0.2],
index=pd.Series([1, 2, 3, 4], name=Columns.User),
)
pd.testing.assert_series_equal(self.metric.calc_per_user(RECO, CATALOG), expected_metric_per_user)
assert self.metric.calc(RECO, CATALOG) == 0.4


class TestNumRetrieved:
def setup(self) -> None:
self.metric = NumRetrieved(k=3)

def test_calc(self) -> None:
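# Within the top 3 ranks, user 1 receives 3 recommendations, users 2 and 4 receive 2, and user 3 receives 1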
expected_metric_per_user = pd.Series(
[3, 2, 1, 2],
index=pd.Series([1, 2, 3, 4], name=Columns.User),
)
pd.testing.assert_series_equal(self.metric.calc_per_user(RECO), expected_metric_per_user)
assert self.metric.calc(RECO) == expected_metric_per_user.mean()
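
For reference, a minimal usage sketch of the two metrics as exposed by this PR; the data below is made up purely for illustration:

import pandas as pd

from rectools import Columns
from rectools.metrics import ItemCoverage, NumRetrieved

reco = pd.DataFrame(
    {
        Columns.User: [1, 1, 2],
        Columns.Item: [10, 20, 10],
        Columns.Rank: [1, 2, 1],
    }
)
catalog = range(100)  # all item ids that could have been recommended

print(ItemCoverage(k=10).calc(reco, catalog))  # 2 distinct recommended items / 100 -> 0.02
print(NumRetrieved(k=10).calc(reco))  # average list length: (2 + 1) / 2 -> 1.5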