diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index fc7019c486d9a..62069dfcb2262 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -850,6 +850,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). - Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) - :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`). +- Bug in :meth:`DataFrame.groupby` where aggregating on ``ExtensionArray`` data did not return the actual ``ExtensionArray`` dtype (:issue:`23227`). .. _whatsnew_0240.api.incompatibilities: @@ -1084,6 +1085,7 @@ Categorical - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). - Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`). +- Bug in :meth:`DataFrame.resample` where aggregating on categorical data lost the categorical dtype (:issue:`23227`). Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e31929434b5d6..ea7507799fa9a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -24,7 +24,8 @@ class providing the base-class of operations. 
from pandas.util._validators import validate_kwargs from pandas.core.dtypes.cast import maybe_downcast_to_dtype -from pandas.core.dtypes.common import ensure_float, is_numeric_dtype, is_scalar +from pandas.core.dtypes.common import ( + ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algorithms @@ -754,7 +755,18 @@ def _try_cast(self, result, obj, numeric_only=False): dtype = obj.dtype if not is_scalar(result): - if numeric_only and is_numeric_dtype(dtype) or not numeric_only: + if is_extension_array_dtype(dtype): + # The function can return something of any type, so check + # if the type is compatible with the calling EA. + try: + result = obj.values._from_sequence(result) + except Exception: + # https://github.com/pandas-dev/pandas/issues/22850 + # pandas has no control over what 3rd-party ExtensionArrays + # do in _from_sequence. We still want ops to work + # though, so we catch any regular Exception. 
+ pass + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) return result diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 41ec2d3026499..24bc8ffe2e5a5 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -650,9 +650,10 @@ def test_preserve_dtypes(op): # groupby result = getattr(df.groupby("A"), op)() + expected = pd.DataFrame({ "B": np.array([1.0, 3.0]), - "C": np.array([1, 3], dtype="int64") + "C": integer_array([1, 3], dtype="Int64") }, index=pd.Index(['a', 'b'], name='A')) tm.assert_frame_equal(result, expected) @@ -673,9 +674,10 @@ def test_reduce_to_float(op): # groupby result = getattr(df.groupby("A"), op)() + expected = pd.DataFrame({ "B": np.array([1.0, 3.0]), - "C": np.array([1, 3], dtype="float64") + "C": integer_array([1, 3], dtype="Int64") }, index=pd.Index(['a', 'b'], name='A')) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/sparse/test_groupby.py b/pandas/tests/sparse/test_groupby.py index 1d2129312fb1b..d0ff2a02c4046 100644 --- a/pandas/tests/sparse/test_groupby.py +++ b/pandas/tests/sparse/test_groupby.py @@ -24,27 +24,39 @@ def test_first_last_nth(self): sparse_grouped = self.sparse.groupby('A') dense_grouped = self.dense.groupby('A') + sparse_grouped_first = sparse_grouped.first() + sparse_grouped_last = sparse_grouped.last() + sparse_grouped_nth = sparse_grouped.nth(1) + + dense_grouped_first = dense_grouped.first().to_sparse() + dense_grouped_last = dense_grouped.last().to_sparse() + dense_grouped_nth = dense_grouped.nth(1).to_sparse() + # TODO: shouldn't these all be spares or not? 
- tm.assert_frame_equal(sparse_grouped.first(), - dense_grouped.first()) - tm.assert_frame_equal(sparse_grouped.last(), - dense_grouped.last()) - tm.assert_frame_equal(sparse_grouped.nth(1), - dense_grouped.nth(1).to_sparse()) + tm.assert_frame_equal(sparse_grouped_first, + dense_grouped_first) + tm.assert_frame_equal(sparse_grouped_last, + dense_grouped_last) + tm.assert_frame_equal(sparse_grouped_nth, + dense_grouped_nth) def test_aggfuncs(self): sparse_grouped = self.sparse.groupby('A') dense_grouped = self.dense.groupby('A') - tm.assert_frame_equal(sparse_grouped.mean(), - dense_grouped.mean()) + result = sparse_grouped.mean().to_sparse() + expected = dense_grouped.mean().to_sparse() + + tm.assert_frame_equal(result, expected) # ToDo: sparse sum includes str column # tm.assert_frame_equal(sparse_grouped.sum(), # dense_grouped.sum()) - tm.assert_frame_equal(sparse_grouped.count(), - dense_grouped.count()) + result = sparse_grouped.count().to_sparse() + expected = dense_grouped.count().to_sparse() + + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("fill_value", [0, np.nan]) @@ -54,6 +66,5 @@ def test_groupby_includes_fill_value(fill_value): 'b': [fill_value, 1, fill_value, fill_value]}) sdf = df.to_sparse(fill_value=fill_value) result = sdf.groupby('a').sum() - expected = df.groupby('a').sum() - tm.assert_frame_equal(result, expected, - check_index_type=False) + expected = df.groupby('a').sum().to_sparse(fill_value=fill_value) + tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 69a0613c95475..ed29e20fd5ca5 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -1576,6 +1576,7 @@ def test_resample_categorical_data_with_timedeltaindex(self): 'Group': ['A', 'A']}, index=pd.to_timedelta([0, 10], unit='s')) expected = expected.reindex(['Group_obj', 'Group'], axis=1) + expected['Group'] = expected['Group_obj'].astype('category') 
tm.assert_frame_equal(result, expected) def test_resample_daily_anchored(self):