From 4fd5a1562f122a0afd4cdd12339881831dd53972 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:27:35 -0500 Subject: [PATCH] DEPR: Default of observed=False in DataFrame.pivot_table (#56237) * DEPR: Default of observed=False in DataFrame.pivot_table * Finish up * fixup * Convert to code-block * Kickoff builds --- doc/source/user_guide/categorical.rst | 2 +- doc/source/whatsnew/v0.23.0.rst | 31 ++++++++++++++++---- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/frame.py | 7 ++++- pandas/core/reshape/pivot.py | 20 +++++++++++-- pandas/tests/reshape/test_pivot.py | 41 +++++++++++++++++++-------- 6 files changed, 80 insertions(+), 22 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 34d04745ccdb5..8fb991dca02db 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -647,7 +647,7 @@ Pivot tables: raw_cat = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) df = pd.DataFrame({"A": raw_cat, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]}) - pd.pivot_table(df, values="values", index=["A", "B"]) + pd.pivot_table(df, values="values", index=["A", "B"], observed=False) Data munging ------------ diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index cdffc6968a170..808741ccf4475 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -286,12 +286,33 @@ For pivoting operations, this behavior is *already* controlled by the ``dropna`` df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) df -.. ipython:: python - pd.pivot_table(df, values='values', index=['A', 'B'], - dropna=True) - pd.pivot_table(df, values='values', index=['A', 'B'], - dropna=False) +.. 
code-block:: ipython + + In [1]: pd.pivot_table(df, values='values', index=['A', 'B'], dropna=True) + + Out[1]: + values + A B + a c 1.0 + d 2.0 + b c 3.0 + d 4.0 + + In [2]: pd.pivot_table(df, values='values', index=['A', 'B'], dropna=False) + + Out[2]: + values + A B + a c 1.0 + d 2.0 + y NaN + b c 3.0 + d 4.0 + y NaN + z c NaN + d NaN + y NaN .. _whatsnew_0230.enhancements.window_raw: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8bd9ac1aa366c..5ee2bb1778cb1 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -435,6 +435,7 @@ Other Deprecations - Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`) - Deprecated the ``unit`` keyword in :class:`TimedeltaIndex` construction, use :func:`to_timedelta` instead (:issue:`55499`) - Deprecated the behavior of :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype; in a future version these will not perform dtype inference on the resulting :class:`Index`, do ``result.index = result.index.infer_objects()`` to retain the old behavior (:issue:`56161`) +- Deprecated the default value ``observed=False`` in :meth:`DataFrame.pivot_table`; the default will change to ``True`` in a future version (:issue:`56236`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c9da273b99ce9..3edfea4480e47 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9296,6 +9296,11 @@ def pivot( If True: only show observed values for 
categorical groupers. If False: show all values for categorical groupers. + .. deprecated:: 2.2.0 + + The default value of ``False`` is deprecated and will change to + ``True`` in a future version of pandas. + sort : bool, default True Specifies if the result should be sorted. @@ -9406,7 +9411,7 @@ def pivot_table( margins: bool = False, dropna: bool = True, margins_name: Level = "All", - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, sort: bool = True, ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index eba4f72b5eb8f..82718d4c43a65 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -10,6 +10,7 @@ Literal, cast, ) +import warnings import numpy as np @@ -18,6 +19,7 @@ Appender, Substitution, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( @@ -68,7 +70,7 @@ def pivot_table( margins: bool = False, dropna: bool = True, margins_name: Hashable = "All", - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, sort: bool = True, ) -> DataFrame: index = _convert_by(index) @@ -123,7 +125,7 @@ def __internal_pivot_table( margins: bool, dropna: bool, margins_name: Hashable, - observed: bool, + observed: bool | lib.NoDefault, sort: bool, ) -> DataFrame: """ @@ -166,7 +168,18 @@ def __internal_pivot_table( pass values = list(values) - grouped = data.groupby(keys, observed=observed, sort=sort, dropna=dropna) + observed_bool = False if observed is lib.no_default else observed + grouped = data.groupby(keys, observed=observed_bool, sort=sort, dropna=dropna) + if observed is lib.no_default and any( + ping._passed_categorical for ping in grouped.grouper.groupings + ): + warnings.warn( + "The default value of observed=False is deprecated and will change " + "to observed=True in a future version of pandas. 
Specify " + "observed=False to silence this warning and retain the current behavior", + category=FutureWarning, + stacklevel=find_stack_level(), + ) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): @@ -719,6 +732,7 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, + observed=False, **kwargs, # type: ignore[arg-type] ) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 4a852daaadf98..dab2b034d3fd4 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -201,7 +201,9 @@ def test_pivot_table_categorical(self): ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True ) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - result = pivot_table(df, values="values", index=["A", "B"], dropna=True) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pivot_table(df, values="values", index=["A", "B"], dropna=True) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) expected = DataFrame({"values": [1.0, 2.0, 3.0, 4.0]}, index=exp_index) @@ -220,7 +222,9 @@ def test_pivot_table_dropna_categoricals(self, dropna): ) df["A"] = df["A"].astype(CategoricalDtype(categories, ordered=False)) - result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) expected_columns = Series(["a", "b", "c"], name="A") expected_columns = expected_columns.astype( CategoricalDtype(categories, ordered=False) @@ -250,7 +254,9 @@ def test_pivot_with_non_observable_dropna(self, dropna): } ) - result = df.pivot_table(index="A", values="B", dropna=dropna) + msg = "The default value of observed=False is deprecated" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(index="A", values="B", dropna=dropna) if dropna: values = [2.0, 3.0] codes = [0, 1] @@ -283,7 +289,9 @@ def test_pivot_with_non_observable_dropna_multi_cat(self, dropna): } ) - result = df.pivot_table(index="A", values="B", dropna=dropna) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(index="A", values="B", dropna=dropna) expected = DataFrame( {"B": [2.0, 3.0, 0.0]}, index=Index( @@ -301,7 +309,10 @@ def test_pivot_with_non_observable_dropna_multi_cat(self, dropna): def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 df = DataFrame({"A": interval_values, "B": 1}) - result = df.pivot_table(index="A", values="B", dropna=dropna) + + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(index="A", values="B", dropna=dropna) expected = DataFrame( {"B": 1.0}, index=Index(interval_values.unique(), name="A") ) @@ -322,9 +333,11 @@ def test_pivot_with_interval_index_margins(self): } ) - pivot_tab = pivot_table( - df, index="C", columns="B", values="A", aggfunc="sum", margins=True - ) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + pivot_tab = pivot_table( + df, index="C", columns="B", values="A", aggfunc="sum", margins=True + ) result = pivot_tab["All"] expected = Series( @@ -1827,7 +1840,9 @@ def test_categorical_margins_category(self, observed): df.y = df.y.astype("category") df.z = df.z.astype("category") - table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) def 
test_margins_casted_to_float(self): @@ -1889,9 +1904,11 @@ def test_categorical_aggfunc(self, observed): {"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]} ) df["C1"] = df["C1"].astype("category") - result = df.pivot_table( - "V", index="C1", columns="C2", dropna=observed, aggfunc="count" - ) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table( + "V", index="C1", columns="C2", dropna=observed, aggfunc="count" + ) expected_index = pd.CategoricalIndex( ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"