BUG: Fixes issue pandas-dev#3334: brittle margin in pivot_table.

guyrt · guyrt · commit 1feaf7b75fae · 2013-08-01T14:39:59.000-04:00
Adds support for margin computation when all columns are used in rows and cols
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -102,6 +102,7 @@ pandas 0.13
     set _ref_locs (:issue:`4403`)
   - Fixed an issue where hist subplots were being overwritten when they were
     called using the top level matplotlib API (:issue:`4408`)
+  - Fixed (:issue:`3334`). Margins did not compute if values is the index.
 
 pandas 0.12
 ===========
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -11,35 +11,12 @@ API changes
 
   - ``read_excel`` now supports an integer in its ``sheetname`` argument giving
     the index of the sheet to read in (:issue:`4301`).
-  - Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf",
-    "iNf", etc.) as infinity. (:issue:`4220`, :issue:`4219`), affecting
-    ``read_table``, ``read_csv``, etc.
-  - ``pandas`` now is Python 2/3 compatible without the need for 2to3 thanks to
-    @jtratner. As a result, pandas now uses iterators more extensively. This
-    also led to the introduction of substantive parts of the Benjamin
-    Peterson's ``six`` library into compat. (:issue:`4384`, :issue:`4375`,
-    :issue:`4372`)
-  - ``pandas.util.compat`` and ``pandas.util.py3compat`` have been merged into
-    ``pandas.compat``. ``pandas.compat`` now includes many functions allowing
-    2/3 compatibility. It contains both list and iterator versions of range,
-    filter, map and zip, plus other necessary elements for Python 3
-    compatibility. ``lmap``, ``lzip``, ``lrange`` and ``lfilter`` all produce
-    lists instead of iterators, for compatibility with ``numpy``, subscripting
-    and ``pandas`` constructors.(:issue:`4384`, :issue:`4375`, :issue:`4372`)
-  - deprecated ``iterkv``, which will be removed in a future release (was just
-    an alias of iteritems used to get around ``2to3``'s changes).
-    (:issue:`4384`, :issue:`4375`, :issue:`4372`)
-  - ``Series.get`` with negative indexers now returns the same as ``[]`` (:issue:`4390`)
 
 Enhancements
 ~~~~~~~~~~~~
 
   - ``read_html`` now raises a ``URLError`` instead of catching and raising a
     ``ValueError`` (:issue:`4303`, :issue:`4305`)
-  - Added a test for ``read_clipboard()`` and ``to_clipboard()`` (:issue:`4282`)
-  - Clipboard functionality now works with PySide (:issue:`4282`)
-  - Added a more informative error message when plot arguments contain
-    overlapping color and style arguments (:issue:`4402`)
 
 Bug Fixes
 ~~~~~~~~~
@@ -52,22 +29,9 @@ Bug Fixes
 
   - Fixed bug in ``PeriodIndex.map`` where using ``str`` would return the str
     representation of the index (:issue:`4136`)
+    
+  - Fixed (:issue:`3334`). Margins did not compute if values is the index.
 
-  - Fixed test failure ``test_time_series_plot_color_with_empty_kwargs`` when
-    using custom matplotlib default colors (:issue:`4345`)
-
-  - Fix running of stata IO tests. Now uses temporary files to write
-    (:issue:`4353`)
-
-  - Fixed an issue where ``DataFrame.sum`` was slower than ``DataFrame.mean``
-    for integer valued frames (:issue:`4365`)
-
-  - ``read_html`` tests now work with Python 2.6 (:issue:`4351`)
-
-  - Fixed bug where ``network`` testing was throwing ``NameError`` because a
-    local variable was undefined (:issue:`4381`)
-
-  - Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`)
 
 See the :ref:`full release notes
 <release>` or issue tracker
diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py
@@ -2,11 +2,8 @@
 
 from pandas import Series, DataFrame
 from pandas.core.index import MultiIndex
-from pandas.core.reshape import _unstack_multiple
 from pandas.tools.merge import concat
 from pandas.tools.util import cartesian_product
-from pandas.compat import range, lrange, zip
-from pandas import compat
 import pandas.core.common as com
 import numpy as np
 
@@ -149,17 +146,64 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
 DataFrame.pivot_table = pivot_table
 
 
-def _add_margins(table, data, values, rows=None, cols=None, aggfunc=np.mean):
-    grand_margin = {}
-    for k, v in compat.iteritems(data[values]):
-        try:
-            if isinstance(aggfunc, compat.string_types):
-                grand_margin[k] = getattr(v, aggfunc)()
-            else:
-                grand_margin[k] = aggfunc(v)
-        except TypeError:
-            pass
+def _add_margins(table, data, values, rows, cols, aggfunc):
+
+    grand_margin = _compute_grand_margin(data, values, aggfunc)
+
+    if not values and isinstance(table, Series):
+        # If there are no values and the table is a series, then there is only
+        # one column in the data. Compute grand margin and return it.
+        row_key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All'
+        return table.append(Series({row_key: grand_margin['All']}))
+
+    if values:
+        marginal_result_set = _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin)
+        if not isinstance(marginal_result_set, tuple):
+            return marginal_result_set
+        result, margin_keys, row_margin = marginal_result_set
+    else:
+        marginal_result_set = _generate_marginal_results_without_values(table, data, rows, cols, aggfunc)
+        if not isinstance(marginal_result_set, tuple):
+            return marginal_result_set
+        result, margin_keys, row_margin = marginal_result_set
+
+    key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All'
+
+    row_margin = row_margin.reindex(result.columns)
+    # populate grand margin
+    for k in margin_keys:
+        if isinstance(k, basestring):
+            row_margin[k] = grand_margin[k]
+        else:
+            row_margin[k] = grand_margin[k[0]]
 
+    margin_dummy = DataFrame(row_margin, columns=[key]).T
+
+    row_names = result.index.names
+    result = result.append(margin_dummy)
+    result.index.names = row_names
+
+    return result
+
+
+def _compute_grand_margin(data, values, aggfunc):
+
+    if values:
+        grand_margin = {}
+        for k, v in data[values].iteritems():
+            try:
+                if isinstance(aggfunc, basestring):
+                    grand_margin[k] = getattr(v, aggfunc)()
+                else:
+                    grand_margin[k] = aggfunc(v)
+            except TypeError:
+                pass
+        return grand_margin
+    else:
+        return {'All': aggfunc(data.index)}
+
+
+def _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin):
     if len(cols) > 0:
         # need to "interleave" the margins
         table_pieces = []
@@ -198,28 +242,48 @@ def _all_key(key):
         row_margin = row_margin.stack()
 
         # slight hack
-        new_order = [len(cols)] + lrange(len(cols))
+        new_order = [len(cols)] + range(len(cols))
         row_margin.index = row_margin.index.reorder_levels(new_order)
     else:
         row_margin = Series(np.nan, index=result.columns)
 
-    key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All'
+    return result, margin_keys, row_margin
 
-    row_margin = row_margin.reindex(result.columns)
-    # populate grand margin
-    for k in margin_keys:
-        if len(cols) > 0:
-            row_margin[k] = grand_margin[k[0]]
-        else:
-            row_margin[k] = grand_margin[k]
 
-    margin_dummy = DataFrame(row_margin, columns=[key]).T
+def _generate_marginal_results_without_values(table, data, rows, cols, aggfunc):
+    if len(cols) > 0:
+        # need to "interleave" the margins
+        margin_keys = []
 
-    row_names = result.index.names
-    result = result.append(margin_dummy)
-    result.index.names = row_names
+        def _all_key():
+            if len(cols) == 1:
+                return 'All'
+            return ('All', ) + ('', ) * (len(cols) - 1)
 
-    return result
+        if len(rows) > 0:
+            margin = data[rows].groupby(rows).apply(aggfunc)
+            all_key = _all_key()
+            table[all_key] = margin
+            result = table
+            margin_keys.append(all_key)
+
+        else:
+            margin = data.groupby(level=0, axis=0).apply(aggfunc)
+            all_key = _all_key()
+            table[all_key] = margin
+            result = table
+            margin_keys.append(all_key)
+            return result
+    else:
+        result = table
+        margin_keys = table.columns
+
+    if len(cols):
+        row_margin = data[cols].groupby(cols).apply(aggfunc)
+    else:
+        row_margin = Series(np.nan, index=result.columns)
+
+    return result, margin_keys, row_margin
 
 
 def _convert_by(by):
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
@@ -1,14 +1,11 @@
-import datetime
 import unittest
 
 import numpy as np
 from numpy.testing import assert_equal
 
-import pandas
 from pandas import DataFrame, Series, Index, MultiIndex
 from pandas.tools.merge import concat
 from pandas.tools.pivot import pivot_table, crosstab
-from pandas.compat import range, u, product
 import pandas.util.testing as tm
 
 
@@ -75,18 +72,9 @@ def test_pivot_table_dropna(self):
         pv_col = df.pivot_table('quantity', 'month', ['customer', 'product'], dropna=False)
         pv_ind = df.pivot_table('quantity', ['customer', 'product'], 'month', dropna=False)
 
-        m = MultiIndex.from_tuples([(u('A'), u('a')),
-                                    (u('A'), u('b')),
-                                    (u('A'), u('c')),
-                                    (u('A'), u('d')),
-                                    (u('B'), u('a')),
-                                    (u('B'), u('b')),
-                                    (u('B'), u('c')),
-                                    (u('B'), u('d')),
-                                    (u('C'), u('a')),
-                                    (u('C'), u('b')),
-                                    (u('C'), u('c')),
-                                    (u('C'), u('d'))])
+        m = MultiIndex.from_tuples([(u'A', u'a'), (u'A', u'b'), (u'A', u'c'), (u'A', u'd'), 
+                                   (u'B', u'a'), (u'B', u'b'), (u'B', u'c'), (u'B', u'd'),
+                                   (u'C', u'a'), (u'C', u'b'), (u'C', u'c'), (u'C', u'd')])
 
         assert_equal(pv_col.columns.values, m.values)
         assert_equal(pv_ind.index.values, m.values)
@@ -211,17 +199,20 @@ def _check_output(res, col, rows=['A', 'B'], cols=['C']):
         # no rows
         rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True,
                                        aggfunc=np.mean)
-        tm.assert_isinstance(rtable, Series)
+        self.assert_(isinstance(rtable, Series))
         for item in ['DD', 'EE', 'FF']:
             gmarg = table[item]['All', '']
             self.assertEqual(gmarg, self.data[item].mean())
 
     def test_pivot_integer_columns(self):
         # caused by upstream bug in unstack
+        from pandas.util.compat import product
+        import datetime
+        import pandas
 
         d = datetime.date.min
         data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
-                            [d + datetime.timedelta(i) for i in range(20)], [1.0]))
+                            [d + datetime.timedelta(i) for i in xrange(20)], [1.0]))
         df = pandas.DataFrame(data)
         table = df.pivot_table(values=4, rows=[0, 1, 3], cols=[2])
 
@@ -245,6 +236,9 @@ def test_pivot_no_level_overlap(self):
         tm.assert_frame_equal(table, expected)
 
     def test_pivot_columns_lexsorted(self):
+        import datetime
+        import numpy as np
+        import pandas
 
         n = 10000
 
@@ -296,6 +290,28 @@ def test_pivot_complex_aggfunc(self):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_margins_no_values_no_cols(self):
+        # Regression test on pivot table: no values or cols passed.
+        result = self.data[['A', 'B']].pivot_table(rows=['A', 'B'], aggfunc=len, margins=True)
+        result_list = result.tolist()
+        self.assertEqual(sum(result_list[:-1]), result_list[-1])
+
+    def test_margins_no_values_two_rows(self):
+        # Regression test on pivot table: no values passed but rows are a multi-index
+        result = self.data[['A', 'B', 'C']].pivot_table(rows=['A', 'B'], cols='C', aggfunc=len, margins=True)
+        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])
+
+    def test_margins_no_values_one_row_one_col(self):
+        # Regression test on pivot table: no values passed but row and col defined
+        result = self.data[['A', 'B']].pivot_table(rows='A', cols='B', aggfunc=len, margins=True)
+        self.assertEqual(result.All.tolist(), [4.0, 7.0, 11.0])
+
+    def test_margins_no_values_two_row_two_cols(self):
+        # Regression test on pivot table: no values passed but rows and cols are multi-indexed
+        self.data['D'] = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']
+        result = self.data[['A', 'B', 'C', 'D']].pivot_table(rows=['A', 'B'], cols=['C', 'D'], aggfunc=len, margins=True)
+        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])
+
 
 class TestCrosstab(unittest.TestCase):