From ec577d7fefbc5dee0f295398433500b0c331762c Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 11 Jul 2020 21:29:00 +0100 Subject: [PATCH 1/7] set self.observed=True for .sum() and .count() so that reindex can be called with 0 --- pandas/core/groupby/generic.py | 18 +++++++++- pandas/core/groupby/groupby.py | 26 +++++++++++++-- pandas/tests/groupby/test_categorical.py | 42 ++++++------------------ 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1f49ee2b0b665..2b656e3035d0e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1820,7 +1820,23 @@ def count(self): ) blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] - return self._wrap_agged_blocks(blocks, items=data.items) + # If self.observed=False then self._reindex_output will fill the missing + # categories (if the grouper was grouped on pd.Categorical variables) with + # np.NaN. For .count() we want these values filled in with zero. So we set + # self.observed=True for the call to self._agg_general, and then set + # it back to its orignal value. We then call self._reindex_output with + # fill_value=0. If the original self.observed is False, then we will get + # our result with 0 for the missing categories. + observed_orig = self.observed + self.observed = True + try: + result = self._wrap_agged_blocks(blocks, items=data.items) + except Exception as e: + raise e + finally: + self.observed = observed_orig + + return self._reindex_output(result, fill_value=0) def nunique(self, dropna: bool = True): """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d039b715b3c08..c7600327f46ef 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1531,9 +1531,29 @@ def size(self) -> FrameOrSeriesUnion: @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum(self, numeric_only: bool = True, min_count: int = 0): - return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum - ) + + # If self.observed=False then self._reindex_output will fill the missing + # categories (if the grouper was grouped on pd.Categorical variables) with + # np.NaN. For .sum() we want these values filled in with zero. So we set + # self.observed=True for the call to self._agg_general, and then set + # it back to its orignal value. We then call self._reindex_output with + # fill_value=0. If the original self.observed is False, then we will get + # our result with 0 for the missing categories. + observed_orig = self.observed + self.observed = True + try: + result = self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="add", + npfunc=np.sum, + ) + except Exception as e: + raise e + finally: + self.observed = observed_orig + + return self._reindex_output(result, fill_value=0) @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) def prod(self, numeric_only: bool = True, min_count: int = 0): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 118d928ac02f4..d96efe1c2c732 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -19,7 +19,7 @@ import pandas._testing as tm -def cartesian_product_for_groupers(result, args, names): +def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN): """ Reindex to a cartesian production for the groupers, preserving the nature (Categorical) of each grouper """ @@ -33,7 +33,7 @@ def f(a): return a index = MultiIndex.from_product(map(f, args), names=names) - return result.reindex(index).sort_index() + return result.reindex(index, fill_value=fill_value).sort_index() _results_for_groupbys_with_missing_categories = dict( @@ -309,7 +309,7 @@ def test_observed(observed): result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, [cat1, cat2, ["foo", "bar"]], list("ABC") + expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0 ) tm.assert_frame_equal(result, expected) @@ -319,7 +319,9 @@ def test_observed(observed): expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) result = gb.sum() if not observed: - expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB")) + expected = cartesian_product_for_groupers( + expected, [cat1, cat2], list("AB"), fill_value=0 + ) tm.assert_frame_equal(result, expected) @@ -1189,8 +1191,11 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): ).sortlevel() expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") + if operation == "agg": + expected = expected.fillna(0, downcast="infer") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] result = getattr(grouped, operation)(sum) + tm.assert_series_equal(result, expected) @@ -1340,15 +1345,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( ) request.node.add_marker(mark) - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." - ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1370,7 +1366,7 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) # If we expect unobserved values to be zero, we also expect the dtype to be int - if zero_or_nan == 0: + if zero_or_nan == 0 and reduction_func != "sum": assert np.issubdtype(result.dtype, np.integer) @@ -1412,24 +1408,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - if reduction_func == "count": # GH 35028 - mark = pytest.mark.xfail( - reason=( - "DataFrameGroupBy.count returns np.NaN for missing " - "categories, when it should return 0. See GH 35028" - ) - ) - request.node.add_marker(mark) - - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." - ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), From 586b1bd386cdd7b07ddb72e89959360209ed5b88 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 11 Jul 2020 22:39:35 +0100 Subject: [PATCH 2/7] wrote test for .sum() and .count() exception handling --- pandas/tests/groupby/test_categorical.py | 28 +++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d96efe1c2c732..b04e8953360cb 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1365,7 +1365,10 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( val = result.loc[idx] assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) - # If we expect unobserved values to be zero, we also expect the dtype to be int + # If we expect unobserved values to be zero, we also expect the dtype to be int. + # Except for .sum(). If the observed categories sum to dtype=float (i.e. their + # sums have decimals), then the zeros for the missing categories should also be + # floats. if zero_or_nan == 0 and reduction_func != "sum": assert np.issubdtype(result.dtype, np.integer) @@ -1430,6 +1433,29 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( assert (res.loc[unobserved_cats] == expected).all().all() +@pytest.mark.parametrize('func', ['sum', 'count']) +def test_sum_and_count_exception_handling(func: str, observed: bool, monkeypatch): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("1111"), categories=list("12")), + "value": [0.1, 0.1, 0.1, 0.1], + } + ) + df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) + + def _mock_method(*args, **kwargs): + raise ZeroDivisionError + + to_patch = {'count' : '_wrap_agged_blocks', 'sum' : '_agg_general'} + monkeypatch.setattr(df_grp, to_patch[func], _mock_method) + + with pytest.raises(ZeroDivisionError): + getattr(df_grp, func)() + + assert df_grp.observed is observed + + def test_series_groupby_categorical_aggregation_getitem(): # GH 8870 d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} From ec4bd286ac9501a46eea4c4365fe10c1019c5d97 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 11 Jul 2020 22:40:12 +0100 Subject: [PATCH 3/7] black --- pandas/tests/groupby/test_categorical.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b04e8953360cb..4c83ecbc53c3d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1433,7 +1433,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( assert (res.loc[unobserved_cats] == expected).all().all() -@pytest.mark.parametrize('func', ['sum', 'count']) +@pytest.mark.parametrize("func", ["sum", "count"]) def test_sum_and_count_exception_handling(func: str, observed: bool, monkeypatch): df = pd.DataFrame( { @@ -1443,17 +1443,17 @@ def test_sum_and_count_exception_handling(func: str, observed: bool, monkeypatch } ) df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) - + def _mock_method(*args, **kwargs): raise ZeroDivisionError - - to_patch = {'count' : '_wrap_agged_blocks', 'sum' : '_agg_general'} + + to_patch = {"count": "_wrap_agged_blocks", "sum": "_agg_general"} monkeypatch.setattr(df_grp, to_patch[func], _mock_method) - + with pytest.raises(ZeroDivisionError): getattr(df_grp, func)() - assert df_grp.observed is observed + assert df_grp.observed is observed def test_series_groupby_categorical_aggregation_getitem(): From eed48cb021bdc157185162f0345c56db00099d17 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 11 Jul 2020 22:49:53 +0100 Subject: [PATCH 4/7] updated pivot tests --- pandas/tests/reshape/test_pivot.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index c07a5673fe503..67b3151b0ff9c 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed): ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" ) expected_columns = pd.Index(["a", "b"], name="C2") - expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]]) + expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64) expected = pd.DataFrame( expected_data, index=expected_index, columns=expected_columns ) @@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed): values="Sales", index="Month", columns="Year", - dropna=observed, + observed=observed, aggfunc="sum", ) expected_columns = pd.Int64Index([2013, 2014], name="Year") expected_index = pd.CategoricalIndex( - ["January"], categories=months, ordered=False, name="Month" + months, categories=months, ordered=False, name="Month" ) + expected_data = [[320, 120]] + [[0, 0]] * 11 expected = pd.DataFrame( - [[320, 120]], index=expected_index, columns=expected_columns + expected_data, index=expected_index, columns=expected_columns ) - if not observed: - result = result.dropna().astype(np.int64) + if observed: + expected = expected.loc[["January"]] tm.assert_frame_equal(result, expected) From 88d8a148fef8efccaeef631cc57f426fd869a4d1 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 11 Jul 2020 23:07:07 +0100 Subject: [PATCH 5/7] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f93e08d51baa..70dcc580a2006 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1091,6 +1091,9 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) +- Bug in :meth:`DataFrameGroupBy.count` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) +- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` was reutrning ``NaN`` for missing categories when grouped on multiple ``Categorials``. Now returning ``0`` (:issue:`31422`) + Reshaping ^^^^^^^^^ @@ -1124,6 +1127,8 @@ Reshaping - Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`) - Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements whth ``S`` dtype (:issue:`34529`) - Bug in :meth:`DataFrame.append` leading to sorting columns even when ``sort=False`` is specified (:issue:`35092`) +- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'``, was returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`35028`) +- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='sum'``, was reutrning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) Sparse ^^^^^^ From 940b32ed13306dc29c465d8b6a0915cfd5e5d561 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 11 Jul 2020 23:29:37 +0100 Subject: [PATCH 6/7] comments --- pandas/core/groupby/generic.py | 11 ++++------- pandas/core/groupby/groupby.py | 11 ++++------- pandas/tests/groupby/test_categorical.py | 8 ++++++++ 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2b656e3035d0e..119bb3acf51c9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1820,13 +1820,10 @@ def count(self): ) blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] - # If self.observed=False then self._reindex_output will fill the missing - # categories (if the grouper was grouped on pd.Categorical variables) with - # np.NaN. For .count() we want these values filled in with zero. So we set - # self.observed=True for the call to self._agg_general, and then set - # it back to its orignal value. We then call self._reindex_output with - # fill_value=0. If the original self.observed is False, then we will get - # our result with 0 for the missing categories. + # GH 35028: We want .count() to return 0 for missing categories + # rather than NaN. So we set self.observed=True to turn off the + # reindexing within self._wrap_agged_blocks, then reindex below with + # fill_value=0 observed_orig = self.observed self.observed = True try: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c7600327f46ef..801123ea4bc53 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1532,13 +1532,10 @@ def size(self) -> FrameOrSeriesUnion: @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum(self, numeric_only: bool = True, min_count: int = 0): - # If self.observed=False then self._reindex_output will fill the missing - # categories (if the grouper was grouped on pd.Categorical variables) with - # np.NaN. For .sum() we want these values filled in with zero. So we set - # self.observed=True for the call to self._agg_general, and then set - # it back to its orignal value. We then call self._reindex_output with - # fill_value=0. If the original self.observed is False, then we will get - # our result with 0 for the missing categories. + # GH 31422: We want .sum() to return 0 for missing categories + # rather than NaN. So we set self.observed=True to turn off the + # reindexing within self._agg_general, then reindex below with + # fill_value=0 observed_orig = self.observed self.observed = True try: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 4c83ecbc53c3d..595c6df8241ba 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1435,6 +1435,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( @pytest.mark.parametrize("func", ["sum", "count"]) def test_sum_and_count_exception_handling(func: str, observed: bool, monkeypatch): + # GH 31422 + # GH 35028 + # In order to return 0 instead of NaN for missing categories in + # GroupBy.count() and GroupBy.sum(), both methods overwrite the value of + # self.observed and then use a try-except-finally block. This test ensures + # that: + # a) An exception from a internal method is still raised + # b) self.observed is set back to its original value df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), From 3e39f1aa3ff9f89a2ce9c5c0fc1171ab4a396542 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 11 Jul 2020 23:31:10 +0100 Subject: [PATCH 7/7] black --- pandas/core/groupby/generic.py | 6 +++--- pandas/core/groupby/groupby.py | 6 +++--- pandas/tests/groupby/test_categorical.py | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 119bb3acf51c9..bdfc29ac563dc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1820,9 +1820,9 @@ def count(self): ) blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] - # GH 35028: We want .count() to return 0 for missing categories - # rather than NaN. So we set self.observed=True to turn off the - # reindexing within self._wrap_agged_blocks, then reindex below with + # GH 35028: We want .count() to return 0 for missing categories + # rather than NaN. So we set self.observed=True to turn off the + # reindexing within self._wrap_agged_blocks, then reindex below with # fill_value=0 observed_orig = self.observed self.observed = True diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 801123ea4bc53..992cbaa6af860 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1532,9 +1532,9 @@ def size(self) -> FrameOrSeriesUnion: @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum(self, numeric_only: bool = True, min_count: int = 0): - # GH 31422: We want .sum() to return 0 for missing categories - # rather than NaN. So we set self.observed=True to turn off the - # reindexing within self._agg_general, then reindex below with + # GH 31422: We want .sum() to return 0 for missing categories + # rather than NaN. So we set self.observed=True to turn off the + # reindexing within self._agg_general, then reindex below with # fill_value=0 observed_orig = self.observed self.observed = True diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 595c6df8241ba..d5897b8ba00cb 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1437,11 +1437,11 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( def test_sum_and_count_exception_handling(func: str, observed: bool, monkeypatch): # GH 31422 # GH 35028 - # In order to return 0 instead of NaN for missing categories in - # GroupBy.count() and GroupBy.sum(), both methods overwrite the value of - # self.observed and then use a try-except-finally block. This test ensures + # In order to return 0 instead of NaN for missing categories in + # GroupBy.count() and GroupBy.sum(), both methods overwrite the value of + # self.observed and then use a try-except-finally block. This test ensures # that: - # a) An exception from a internal method is still raised + # a) An exception from a internal method is still raised # b) self.observed is set back to its original value df = pd.DataFrame( {