diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index bead5a5996d1a..678403d837805 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -127,6 +127,10 @@ def setup(self): 'value1': np.random.randn(N), 'value2': np.random.randn(N), 'value3': np.random.randn(N)}) + self.df2 = DataFrame({'col1': list('abcde'), 'col2': list('fghij'), + 'col3': [1, 2, 3, 4, 5]}) + self.df2.col1 = self.df2.col1.astype('category') + self.df2.col2 = self.df2.col2.astype('category') def time_pivot_table(self): self.df.pivot_table(index='key1', columns=['key2', 'key3']) @@ -139,6 +143,14 @@ def time_pivot_table_margins(self): self.df.pivot_table(index='key1', columns=['key2', 'key3'], margins=True) + def time_pivot_table_categorical(self): + self.df2.pivot_table(index='col1', values='col3', columns='col2', + aggfunc=np.sum, fill_value=0) + + def time_pivot_table_categorical_observed(self): + self.df2.pivot_table(index='col1', values='col3', columns='col2', + aggfunc=np.sum, fill_value=0, observed=True) + class Crosstab: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 9afcf3ddcdbb1..fb36a083ec290 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -28,6 +28,7 @@ Other Enhancements - Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`) - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) +- :meth:`DataFrame.pivot_table` now accepts an ``observed`` parameter which is passed to underlying calls to :meth:`DataFrame.groupby` to speed up grouping categorical data. 
(:issue:`24923`) - ``Series.str`` has gained :meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`) - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2008c444fad5e..b127a1d28e22b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5701,6 +5701,12 @@ def pivot(self, index=None, columns=None, values=None): margins_name : string, default 'All' Name of the row / column that will contain the totals when margins is True. + observed : boolean, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. 
versionadded:: 0.25.0 Returns ------- @@ -5791,12 +5797,12 @@ def pivot(self, index=None, columns=None, values=None): @Appender(_shared_docs['pivot_table']) def pivot_table(self, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, - dropna=True, margins_name='All'): + dropna=True, margins_name='All', observed=False): from pandas.core.reshape.pivot import pivot_table return pivot_table(self, values=values, index=index, columns=columns, aggfunc=aggfunc, fill_value=fill_value, margins=margins, dropna=dropna, - margins_name=margins_name) + margins_name=margins_name, observed=observed) def stack(self, level=-1, dropna=True): """ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 3aaae3b59a0d4..be0d74b460850 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -22,7 +22,7 @@ @Appender(_shared_docs['pivot_table'], indents=1) def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, - margins_name='All'): + margins_name='All', observed=False): index = _convert_by(index) columns = _convert_by(columns) @@ -34,7 +34,8 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, dropna=dropna, - margins_name=margins_name) + margins_name=margins_name, + observed=observed) pieces.append(table) keys.append(getattr(func, '__name__', func)) @@ -77,7 +78,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - grouped = data.groupby(keys, observed=False) + grouped = data.groupby(keys, observed=observed) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how='all') diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 5b757ac156078..64374cd9646eb 100644 --- 
a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -37,18 +37,18 @@ def setup_method(self, method): 'E': np.random.randn(11), 'F': np.random.randn(11)}) - def test_pivot_table(self): + def test_pivot_table(self, observed): index = ['A', 'B'] columns = 'C' table = pivot_table(self.data, values='D', - index=index, columns=columns) + index=index, columns=columns, observed=observed) table2 = self.data.pivot_table( - values='D', index=index, columns=columns) + values='D', index=index, columns=columns, observed=observed) tm.assert_frame_equal(table, table2) # this works - pivot_table(self.data, values='D', index=index) + pivot_table(self.data, values='D', index=index, observed=observed) if len(index) > 1: assert table.index.names == tuple(index) @@ -64,6 +64,28 @@ def test_pivot_table(self): index + [columns])['D'].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) + def test_pivot_table_categorical_observed_equal(self, observed): + # issue #24923 + df = pd.DataFrame({'col1': list('abcde'), + 'col2': list('fghij'), + 'col3': [1, 2, 3, 4, 5]}) + + expected = df.pivot_table(index='col1', values='col3', + columns='col2', aggfunc=np.sum, + fill_value=0) + + expected.index = expected.index.astype('category') + expected.columns = expected.columns.astype('category') + + df.col1 = df.col1.astype('category') + df.col2 = df.col2.astype('category') + + result = df.pivot_table(index='col1', values='col3', + columns='col2', aggfunc=np.sum, + fill_value=0, observed=observed) + + tm.assert_frame_equal(result, expected) + def test_pivot_table_nocols(self): df = DataFrame({'rows': ['a', 'b', 'c'], 'cols': ['x', 'y', 'z'],