Skip to content

Commit

Permalink
Fix groupby agg/apply behaviour when no key columns are provided (#6945)
Browse files Browse the repository at this point in the history
More Pandas-like behaviour for groupby when no keys are passed.

Possibly fixes #6927.

Authors:
  - Ashwin Srinath <shwina@users.noreply.github.com>

Approvers:
  - Keith Kraus

URL: #6945
  • Loading branch information
shwina authored Dec 11, 2020
1 parent ea9c689 commit b136469
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

- PR #6922 Fix N/A detection for empty fields in CSV reader
- PR #6912 Fix rmm_mode=managed parameter for gtests
- PR #6945 Fix groupby agg/apply behaviour when no key columns are provided
- PR #6942 Fix cudf::merge gtest for dictionary columns


Expand Down
30 changes: 20 additions & 10 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ def agg(self, func):
"""
normalized_aggs = self._normalize_aggs(func)

# Note: When there are no key columns, the below produces
# a Float64Index, while Pandas returns an Int64Index
# (GH: 6945)
result = self._groupby.aggregate(self.obj, normalized_aggs)

result = cudf.DataFrame._from_table(result)
Expand All @@ -190,7 +193,8 @@ def agg(self, func):
raise

# set index names to be group key names
result.index.names = self.grouping.names
if len(result):
result.index.names = self.grouping.names

# copy categorical information from keys to the result index:
result.index._postprocess_columns(self.grouping.keys)
Expand Down Expand Up @@ -417,16 +421,15 @@ def mult(df):
]
chunk_results = [function(chk) for chk in chunks]

if len(chunk_results) > 0 and cudf.utils.dtypes.is_scalar(
chunk_results[0]
):
if not len(chunk_results):
return self.obj.__class__()

if cudf.utils.dtypes.is_scalar(chunk_results[0]):
result = cudf.Series(
chunk_results, index=self.grouping.keys[offsets[:-1]]
)
result.index.names = self.grouping.names
elif len(chunk_results) > 0 and isinstance(
chunk_results[0], cudf.Series
):
elif isinstance(chunk_results[0], cudf.Series):
result = cudf.concat(chunk_results, axis=1).T
result.index.names = self.grouping.names
else:
Expand Down Expand Up @@ -775,8 +778,9 @@ def agg(self, func):
result = super().agg(func)

# downcast the result to a Series:
if result.shape[1] == 1 and not pd.api.types.is_list_like(func):
return result.iloc[:, 0]
if len(result._data):
if result.shape[1] == 1 and not pd.api.types.is_list_like(func):
return result.iloc[:, 0]

# drop the first level if we have a multiindex
if (
Expand Down Expand Up @@ -809,6 +813,9 @@ def __init__(self, obj, by=None, level=None):
self._named_columns = []
self._handle_by_or_level(by, level)

if len(obj) and not len(self._key_columns):
raise ValueError("No group keys passed")

def _handle_by_or_level(self, by=None, level=None):
if level is not None:
if by is not None:
Expand Down Expand Up @@ -839,7 +846,10 @@ def _handle_by_or_level(self, by=None, level=None):
@property
def keys(self):
nkeys = len(self._key_columns)
if nkeys > 1:

if nkeys == 0:
return cudf.core.index.as_index([], name=None)
elif nkeys > 1:
return cudf.MultiIndex(
source_data=cudf.DataFrame(
dict(zip(range(nkeys), self._key_columns))
Expand Down
37 changes: 37 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,3 +1345,40 @@ def test_groupby_apply_return_series_dataframe(cust_func):
actual = gdf.groupby(["key"]).apply(cust_func)

assert_eq(expected, actual)


@pytest.mark.parametrize(
    "pdf", [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([])]
)
def test_groupby_no_keys(pdf):
    """Grouping an empty object by an empty key list should match Pandas."""
    gdf = cudf.from_pandas(pdf)
    expected = pdf.groupby([]).max()
    actual = gdf.groupby([]).max()
    # cudf yields a Float64Index where Pandas yields an Int64Index
    # (GH #6945), so index type and dtype are excluded from comparison.
    assert_eq(
        expected,
        actual,
        check_dtype=False,
        check_index_type=False,
    )


@pytest.mark.parametrize(
    "pdf", [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([])]
)
def test_groupby_apply_no_keys(pdf):
    """groupby([]).apply on empty input should behave like Pandas."""
    gdf = cudf.from_pandas(pdf)
    expected = pdf.groupby([]).apply(lambda x: x.max())
    actual = gdf.groupby([]).apply(lambda x: x.max())
    assert_eq(expected, actual)


@pytest.mark.parametrize(
    "pdf",
    [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [1, 2], "b": [2, 3]})],
)
def test_groupby_nonempty_no_keys(pdf):
    """Grouping non-empty data by no keys must raise, matching Pandas."""
    gdf = cudf.from_pandas(pdf)
    # Both libraries should error; the exact message may differ, so only
    # the exception types are compared.
    assert_exceptions_equal(
        lambda: pdf.groupby([]),
        lambda: gdf.groupby([]),
        compare_error_message=False,
    )
4 changes: 1 addition & 3 deletions python/cudf/cudf/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
from pandas import testing as tm

import cudf
from cudf.core.column.datetime import _numpy_to_pandas_conversion
from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf.core.column.datetime import _numpy_to_pandas_conversion
from cudf.utils import dtypes as dtypeutils

supported_numpy_dtypes = [
Expand Down Expand Up @@ -74,8 +74,6 @@ def assert_eq(left, right, **kwargs):
without switching between assert_frame_equal/assert_series_equal/...
functions.
"""
__tracebackhide__ = True

if hasattr(left, "to_pandas"):
left = left.to_pandas()
if hasattr(right, "to_pandas"):
Expand Down

0 comments on commit b136469

Please sign in to comment.