Skip to content

BUG: GroupBy.count() and GroupBy.sum() incorrectly return NaN instead of 0 for missing categories (Version 2) #35241

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1091,6 +1091,9 @@ Groupby/resample/rolling
- Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`)
- Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`)
- Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`)
- Bug in :meth:`DataFrameGroupBy.count` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`31422`)


Reshaping
^^^^^^^^^
Expand Down Expand Up @@ -1124,6 +1127,8 @@ Reshaping
- Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`)
- Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements with ``S`` dtype (:issue:`34529`)
- Bug in :meth:`DataFrame.append` leading to sorting columns even when ``sort=False`` is specified (:issue:`35092`)
- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'``, was returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`35028`)
- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='sum'``, was returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)

Sparse
^^^^^^
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1820,7 +1820,20 @@ def count(self):
)
blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)]

return self._wrap_agged_blocks(blocks, items=data.items)
# GH 35028: We want .count() to return 0 for missing categories
# rather than NaN. So we set self.observed=True to turn off the
# reindexing within self._wrap_agged_blocks, then reindex below with
# fill_value=0
observed_orig = self.observed
self.observed = True
try:
result = self._wrap_agged_blocks(blocks, items=data.items)
except Exception as e:
raise e
finally:
self.observed = observed_orig

return self._reindex_output(result, fill_value=0)

def nunique(self, dropna: bool = True):
"""
Expand Down
23 changes: 20 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1531,9 +1531,26 @@ def size(self) -> FrameOrSeriesUnion:

@doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
def sum(self, numeric_only: bool = True, min_count: int = 0):
    # NOTE(review): no inline docstring on purpose -- the ``@doc`` decorator
    # above appears to render it from ``_groupby_agg_method_template``;
    # confirm before adding one here.

    # GH 31422: We want .sum() to return 0 for missing categories
    # rather than NaN. So we set self.observed=True to turn off the
    # reindexing within self._agg_general, then reindex below with
    # fill_value=0
    observed_orig = self.observed
    self.observed = True
    try:
        result = self._agg_general(
            numeric_only=numeric_only,
            min_count=min_count,
            alias="add",
            npfunc=np.sum,
        )
    finally:
        # Runs on success and on error alike, so the temporary override never
        # leaks out of this call; any exception from _agg_general propagates
        # unchanged without needing an explicit ``except ...: raise``.
        self.observed = observed_orig

    return self._reindex_output(result, fill_value=0)

@doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
def prod(self, numeric_only: bool = True, min_count: int = 0):
Expand Down
78 changes: 45 additions & 33 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import pandas._testing as tm


def cartesian_product_for_groupers(result, args, names):
def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
""" Reindex to a cartesian production for the groupers,
preserving the nature (Categorical) of each grouper
"""
Expand All @@ -33,7 +33,7 @@ def f(a):
return a

index = MultiIndex.from_product(map(f, args), names=names)
return result.reindex(index).sort_index()
return result.reindex(index, fill_value=fill_value).sort_index()


_results_for_groupbys_with_missing_categories = dict(
Expand Down Expand Up @@ -309,7 +309,7 @@ def test_observed(observed):
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(
expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
)

tm.assert_frame_equal(result, expected)
Expand All @@ -319,7 +319,9 @@ def test_observed(observed):
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))
expected = cartesian_product_for_groupers(
expected, [cat1, cat2], list("AB"), fill_value=0
)

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1189,8 +1191,11 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
).sortlevel()

expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
if operation == "agg":
expected = expected.fillna(0, downcast="infer")
grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
result = getattr(grouped, operation)(sum)

tm.assert_series_equal(result, expected)


Expand Down Expand Up @@ -1340,15 +1345,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
)
request.node.add_marker(mark)

if reduction_func == "sum": # GH 31422
mark = pytest.mark.xfail(
reason=(
"sum should return 0 but currently returns NaN. "
"This is a known bug. See GH 31422."
)
)
request.node.add_marker(mark)

df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
Expand All @@ -1369,8 +1365,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
val = result.loc[idx]
assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)

# If we expect unobserved values to be zero, we also expect the dtype to be int
if zero_or_nan == 0:
# If we expect unobserved values to be zero, we also expect the dtype to be int.
# Except for .sum(). If the observed categories sum to dtype=float (i.e. their
# sums have decimals), then the zeros for the missing categories should also be
# floats.
if zero_or_nan == 0 and reduction_func != "sum":
assert np.issubdtype(result.dtype, np.integer)


Expand Down Expand Up @@ -1412,24 +1411,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
if reduction_func == "ngroup":
pytest.skip("ngroup does not return the Categories on the index")

if reduction_func == "count": # GH 35028
mark = pytest.mark.xfail(
reason=(
"DataFrameGroupBy.count returns np.NaN for missing "
"categories, when it should return 0. See GH 35028"
)
)
request.node.add_marker(mark)

if reduction_func == "sum": # GH 31422
mark = pytest.mark.xfail(
reason=(
"sum should return 0 but currently returns NaN. "
"This is a known bug. See GH 31422."
)
)
request.node.add_marker(mark)

df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
Expand All @@ -1452,6 +1433,37 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
assert (res.loc[unobserved_cats] == expected).all().all()


@pytest.mark.parametrize("func", ["sum", "count"])
def test_sum_and_count_exception_handling(func: str, observed: bool, monkeypatch):
    # GH 31422
    # GH 35028
    # GroupBy.count() and GroupBy.sum() temporarily overwrite self.observed so
    # that missing categories come back as 0 instead of NaN, and restore it in
    # a ``finally`` clause. This test checks that:
    #   a) an exception raised by an internal method still reaches the caller
    #   b) self.observed is set back to its original value afterwards

    # Internal method that each public reduction delegates to.
    internal_method = {"count": "_wrap_agged_blocks", "sum": "_agg_general"}[func]

    def _always_raise(*args, **kwargs):
        raise ZeroDivisionError

    frame = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    grouped = frame.groupby(["cat_1", "cat_2"], observed=observed)

    # Make the internal method blow up so the cleanup path is exercised.
    monkeypatch.setattr(grouped, internal_method, _always_raise)

    with pytest.raises(ZeroDivisionError):
        getattr(grouped, func)()

    assert grouped.observed is observed


def test_series_groupby_categorical_aggregation_getitem():
# GH 8870
d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}
Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed):
["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
)
expected_columns = pd.Index(["a", "b"], name="C2")
expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]])
expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
expected = pd.DataFrame(
expected_data, index=expected_index, columns=expected_columns
)
Expand Down Expand Up @@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed):
values="Sales",
index="Month",
columns="Year",
dropna=observed,
observed=observed,
aggfunc="sum",
)
expected_columns = pd.Int64Index([2013, 2014], name="Year")
expected_index = pd.CategoricalIndex(
["January"], categories=months, ordered=False, name="Month"
months, categories=months, ordered=False, name="Month"
)
expected_data = [[320, 120]] + [[0, 0]] * 11
expected = pd.DataFrame(
[[320, 120]], index=expected_index, columns=expected_columns
expected_data, index=expected_index, columns=expected_columns
)
if not observed:
result = result.dropna().astype(np.int64)
if observed:
expected = expected.loc[["January"]]

tm.assert_frame_equal(result, expected)

Expand Down