Skip to content

Commit

Permalink
feat(api): make topk() and value_counts() more flexible
Browse files Browse the repository at this point in the history
  • Loading branch information
NickCrews committed Mar 3, 2025
1 parent b50bb48 commit 75204d0
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 18 deletions.
60 changes: 42 additions & 18 deletions ibis/expr/types/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2114,12 +2114,13 @@ def nunique(self, *, where: ir.BooleanValue | None = None) -> ir.IntegerScalar:
).to_expr()

def topk(
self, k: int, by: ir.Value | None = None, *, name: str | None = None
self,
k: int | None = None,
by: ir.Value | None = None,
*,
name: str | None = None,
) -> ir.Table:
"""Return a "top k" expression.
Computes a Table containing the top `k` values by a certain metric
(defaults to count).
"""Computes a Table of the top values by a metric (defaults to count).
::: {.callout-note title="Changed in version 9.5.0"}
Added `name` parameter.
Expand All @@ -2129,6 +2130,7 @@ def topk(
----------
k
The number of rows to return.
If `None`, all values are returned in descending order.
by
The metric to compute "top" by. Defaults to `count`.
name
Expand All @@ -2140,6 +2142,12 @@ def topk(
Table
The top `k` values.
See Also
--------
[`Column.value_counts`](./expression-generic.qmd#ibis.expr.types.generic.Column.value_counts)
[`Table.topk`](./expression-tables.qmd#ibis.expr.types.relations.Table.topk)
[`Table.value_counts`](./expression-tables.qmd#ibis.expr.types.relations.Table.value_counts)
Examples
--------
>>> import ibis
Expand All @@ -2149,15 +2157,15 @@ def topk(
Compute the top 3 diamond colors by frequency:
>>> t.color.topk(3)
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━
┃ color ┃ CountStar(diamonds)
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━
│ string │ int64
├────────┼─────────────────────
│ G │ 11292 │
│ E │ 9797 │
│ F │ 9542 │
└────────┴─────────────────────
┏━━━━━━━━┳━━━━━━━━━━━━━┓
┃ color ┃ color_count
┡━━━━━━━━╇━━━━━━━━━━━━━┩
│ string │ int64 │
├────────┼─────────────┤
│ G │ 11292 │
│ E │ 9797 │
│ F │ 9542 │
└────────┴─────────────┘
Compute the top 3 diamond colors by mean price:
Expand All @@ -2172,16 +2180,21 @@ def topk(
│ H │ 4486.669196 │
└────────┴─────────────┘
Compute the top 2 diamond colors by max carat:
Rank all the colors by max carat:
>>> t.color.topk(2, by=t.carat.max(), name="max_carat")
>>> t.color.topk(by=t.carat.max(), name="max_carat")
┏━━━━━━━━┳━━━━━━━━━━━┓
┃ color ┃ max_carat ┃
┡━━━━━━━━╇━━━━━━━━━━━┩
│ string │ float64 │
├────────┼───────────┤
│ J │ 5.01 │
│ H │ 4.13 │
│ I │ 4.01 │
│ D │ 3.40 │
│ E │ 3.05 │
│ F │ 3.01 │
│ G │ 3.01 │
└────────┴───────────┘
"""
from ibis.expr.types.relations import bind
Expand All @@ -2193,15 +2206,20 @@ def topk(

table = table.to_expr()

if by is None and name is None:
# if `by` is something more complex, the _count doesn't make sense.
name = f"{self.get_name()}_count"
if by is None:
by = lambda t: t.count()

(metric,) = bind(table, by)

if name is not None:
metric = metric.name(name)

return table.aggregate(metric, by=[self]).order_by(metric.desc()).limit(k)
in_desc = table.aggregate(metric, by=[self]).order_by(metric.desc())
if k is not None:
in_desc = in_desc.limit(k)
return in_desc

def arbitrary(self, *, where: ir.BooleanValue | None = None) -> Scalar:
"""Select an arbitrary value in a column.
Expand Down Expand Up @@ -2295,6 +2313,12 @@ def value_counts(self, *, name: str | None = None) -> ir.Table:
Table
The frequency table.
See Also
--------
[`Column.topk`](./expression-generic.qmd#ibis.expr.types.generic.Column.topk)
[`Table.value_counts`](./expression-tables.qmd#ibis.expr.types.relations.Table.value_counts)
[`Table.topk`](./expression-tables.qmd#ibis.expr.types.relations.Table.topk)
Examples
--------
>>> import ibis
Expand Down
87 changes: 87 additions & 0 deletions ibis/expr/types/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4718,6 +4718,12 @@ def value_counts(self, *, name: str | None = None) -> ir.Table:
Table
Frequency table of this table's values.
See Also
--------
[`Table.topk`](./expression-tables.qmd#ibis.expr.types.relations.Table.topk)
[`Column.value_counts`](./expression-generic.qmd#ibis.expr.types.generic.Column.value_counts)
[`Column.topk`](./expression-generic.qmd#ibis.expr.types.generic.Column.topk)
Examples
--------
>>> from ibis import examples
Expand Down Expand Up @@ -4767,6 +4773,87 @@ def value_counts(self, *, name: str | None = None) -> ir.Table:
name = "_".join(columns) + "_count"
return self.group_by(columns).agg(lambda t: t.count().name(name))

def topk(self, k: int | None = None, *, name: str | None = None) -> ir.Table:
    """Count the distinct rows of this table, most frequent first.

    The table is grouped by all of its columns, each group is counted, and
    the result is sorted by that count in descending order.

    Parameters
    ----------
    k
        Maximum number of rows to keep from the top of the ranking.
        If `None`, the full ranking is returned.
    name
        Name of the count column. If `None`, the table's column names are
        joined with `_` and suffixed with `_count`.

    Returns
    -------
    Table
        One row per distinct combination of values, together with its
        count, ordered by descending count.

    See Also
    --------
    [`Table.value_counts`](./expression-tables.qmd#ibis.expr.types.relations.Table.value_counts)
    [`Column.topk`](./expression-generic.qmd#ibis.expr.types.generic.Column.topk)
    [`Column.value_counts`](./expression-generic.qmd#ibis.expr.types.generic.Column.value_counts)

    Examples
    --------
    >>> import ibis
    >>> from ibis import examples
    >>> ibis.options.interactive = True
    >>> t = examples.penguins.fetch().select("species", "island", "sex", "year")
    >>> t.head()
    ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
    ┃ species ┃ island    ┃ sex    ┃ year  ┃
    ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
    │ string  │ string    │ string │ int64 │
    ├─────────┼───────────┼────────┼───────┤
    │ Adelie  │ Torgersen │ male   │  2007 │
    │ Adelie  │ Torgersen │ female │  2007 │
    │ Adelie  │ Torgersen │ female │  2007 │
    │ Adelie  │ Torgersen │ NULL   │  2007 │
    │ Adelie  │ Torgersen │ female │  2007 │
    └─────────┴───────────┴────────┴───────┘
    >>> t.topk()
    ┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
    ┃ species   ┃ island ┃ sex    ┃ year  ┃ species_island_sex_year_count ┃
    ┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
    │ string    │ string │ string │ int64 │ int64                         │
    ├───────────┼────────┼────────┼───────┼───────────────────────────────┤
    │ Gentoo    │ Biscoe │ male   │  2008 │                            23 │
    │ Gentoo    │ Biscoe │ female │  2008 │                            22 │
    │ Gentoo    │ Biscoe │ male   │  2009 │                            21 │
    │ Gentoo    │ Biscoe │ female │  2009 │                            20 │
    │ Gentoo    │ Biscoe │ male   │  2007 │                            17 │
    │ Gentoo    │ Biscoe │ female │  2007 │                            16 │
    │ Chinstrap │ Dream  │ female │  2007 │                            13 │
    │ Chinstrap │ Dream  │ male   │  2007 │                            13 │
    │ Chinstrap │ Dream  │ male   │  2009 │                            12 │
    │ Chinstrap │ Dream  │ female │  2009 │                            12 │
    │ …         │ …      │ …      │     … │                             … │
    └───────────┴────────┴────────┴───────┴───────────────────────────────┘
    >>> t.topk(3, name="n")
    ┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┓
    ┃ species ┃ island ┃ sex    ┃ year  ┃ n     ┃
    ┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━┩
    │ string  │ string │ string │ int64 │ int64 │
    ├─────────┼────────┼────────┼───────┼───────┤
    │ Gentoo  │ Biscoe │ male   │  2008 │    23 │
    │ Gentoo  │ Biscoe │ female │  2008 │    22 │
    │ Gentoo  │ Biscoe │ male   │  2009 │    21 │
    └─────────┴────────┴────────┴───────┴───────┘
    """
    group_keys = self.columns
    # Fall back to the same "<col>_<col>_..._count" naming scheme that
    # Table.value_counts uses when the caller does not supply a name.
    count_name = ("_".join(group_keys) + "_count") if name is None else name

    counts = self.group_by(group_keys).agg(lambda t: t.count().name(count_name))
    ranked = counts.order_by(ibis.desc(count_name))

    # Only truncate when a limit was requested; k=None means "rank everything".
    return ranked if k is None else ranked.limit(k)

def unnest(
self, column, /, *, offset: str | None = None, keep_empty: bool = False
) -> Table:
Expand Down

0 comments on commit 75204d0

Please sign in to comment.