Skip to content

Commit

Permalink
Fix groupby agg/apply behaviour when no key columns are provided (#6945)
Browse files Browse the repository at this point in the history
More Pandas-like behaviour for groupby when no keys are passed.

Possibly fixes #6927.

Authors:
  - Ashwin Srinath <shwina@users.noreply.github.com>

Approvers:
  - Keith Kraus

URL: #6945
  • Loading branch information
shwina authored Dec 11, 2020
1 parent ea9c689 commit b136469
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

- PR #6922 Fix N/A detection for empty fields in CSV reader
- PR #6912 Fix rmm_mode=managed parameter for gtests
- PR #6945 Fix groupby agg/apply behaviour when no key columns are provided
- PR #6942 Fix cudf::merge gtest for dictionary columns


Expand Down
30 changes: 20 additions & 10 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ def agg(self, func):
"""
normalized_aggs = self._normalize_aggs(func)

# Note: When there are no key columns, the below produces
# a Float64Index, while Pandas returns an Int64Index
# (GH: 6945)
result = self._groupby.aggregate(self.obj, normalized_aggs)

result = cudf.DataFrame._from_table(result)
Expand All @@ -190,7 +193,8 @@ def agg(self, func):
raise

# set index names to be group key names
result.index.names = self.grouping.names
if len(result):
result.index.names = self.grouping.names

# copy categorical information from keys to the result index:
result.index._postprocess_columns(self.grouping.keys)
Expand Down Expand Up @@ -417,16 +421,15 @@ def mult(df):
]
chunk_results = [function(chk) for chk in chunks]

if len(chunk_results) > 0 and cudf.utils.dtypes.is_scalar(
chunk_results[0]
):
if not len(chunk_results):
return self.obj.__class__()

if cudf.utils.dtypes.is_scalar(chunk_results[0]):
result = cudf.Series(
chunk_results, index=self.grouping.keys[offsets[:-1]]
)
result.index.names = self.grouping.names
elif len(chunk_results) > 0 and isinstance(
chunk_results[0], cudf.Series
):
elif isinstance(chunk_results[0], cudf.Series):
result = cudf.concat(chunk_results, axis=1).T
result.index.names = self.grouping.names
else:
Expand Down Expand Up @@ -775,8 +778,9 @@ def agg(self, func):
result = super().agg(func)

# downcast the result to a Series:
if result.shape[1] == 1 and not pd.api.types.is_list_like(func):
return result.iloc[:, 0]
if len(result._data):
if result.shape[1] == 1 and not pd.api.types.is_list_like(func):
return result.iloc[:, 0]

# drop the first level if we have a multiindex
if (
Expand Down Expand Up @@ -809,6 +813,9 @@ def __init__(self, obj, by=None, level=None):
self._named_columns = []
self._handle_by_or_level(by, level)

if len(obj) and not len(self._key_columns):
raise ValueError("No group keys passed")

def _handle_by_or_level(self, by=None, level=None):
if level is not None:
if by is not None:
Expand Down Expand Up @@ -839,7 +846,10 @@ def _handle_by_or_level(self, by=None, level=None):
@property
def keys(self):
nkeys = len(self._key_columns)
if nkeys > 1:

if nkeys == 0:
return cudf.core.index.as_index([], name=None)
elif nkeys > 1:
return cudf.MultiIndex(
source_data=cudf.DataFrame(
dict(zip(range(nkeys), self._key_columns))
Expand Down
37 changes: 37 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,3 +1345,40 @@ def test_groupby_apply_return_series_dataframe(cust_func):
actual = gdf.groupby(["key"]).apply(cust_func)

assert_eq(expected, actual)


@pytest.mark.parametrize(
    "pdf", [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([])]
)
def test_groupby_no_keys(pdf):
    """Grouping an empty object by an empty key list should match Pandas."""
    gdf = cudf.from_pandas(pdf)
    expected = pdf.groupby([]).max()
    actual = gdf.groupby([]).max()
    # cudf yields a Float64Index where Pandas yields an Int64Index
    # (GH #6945), so index type and dtype are excluded from comparison.
    assert_eq(
        expected,
        actual,
        check_dtype=False,
        check_index_type=False,
    )


@pytest.mark.parametrize(
    "pdf", [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([])]
)
def test_groupby_apply_no_keys(pdf):
    """groupby([]).apply on empty input should behave like Pandas."""
    gdf = cudf.from_pandas(pdf)
    expected = pdf.groupby([]).apply(lambda x: x.max())
    actual = gdf.groupby([]).apply(lambda x: x.max())
    assert_eq(expected, actual)


@pytest.mark.parametrize(
    "pdf",
    [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [1, 2], "b": [2, 3]})],
)
def test_groupby_nonempty_no_keys(pdf):
    """Grouping non-empty data by no keys must raise, matching Pandas."""
    gdf = cudf.from_pandas(pdf)
    # Both libraries should error; the exact message may differ, so only
    # the exception types are compared.
    assert_exceptions_equal(
        lambda: pdf.groupby([]),
        lambda: gdf.groupby([]),
        compare_error_message=False,
    )
4 changes: 1 addition & 3 deletions python/cudf/cudf/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
from pandas import testing as tm

import cudf
from cudf.core.column.datetime import _numpy_to_pandas_conversion
from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf.core.column.datetime import _numpy_to_pandas_conversion
from cudf.utils import dtypes as dtypeutils

supported_numpy_dtypes = [
Expand Down Expand Up @@ -74,8 +74,6 @@ def assert_eq(left, right, **kwargs):
without switching between assert_frame_equal/assert_series_equal/...
functions.
"""
__tracebackhide__ = True

if hasattr(left, "to_pandas"):
left = left.to_pandas()
if hasattr(right, "to_pandas"):
Expand Down

0 comments on commit b136469

Please sign in to comment.