diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 3545dd8a89159..621baa01fbded 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -224,6 +224,7 @@ Indexing Missing ^^^^^^^ +- Bug in :class:`Grouper` did not correctly propagate the ``dropna`` argument; :meth:`DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True`` (:issue:`35612`) - - diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 07ffb881495fa..16b00735cf694 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -553,7 +553,6 @@ def _transform_general(self, func, *args, **kwargs): result = maybe_downcast_numeric(result, self._selected_obj.dtype) result.name = self._selected_obj.name - result.index = self._selected_obj.index return result def _transform_fast(self, result) -> Series: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 947f18901775b..cebbfac16019e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -729,14 +729,28 @@ def _set_result_index_ordered( # set the result index on the passed values object and # return the new object, xref 8046 - # the values/counts are repeated according to the group index - # shortcut if we have an already ordered grouper - if not self.grouper.is_monotonic: - index = Index(np.concatenate(self._get_indices(self.grouper.result_index))) - result.set_axis(index, axis=self.axis, inplace=True) - result = result.sort_index(axis=self.axis) - - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + if self.grouper.is_monotonic: + # shortcut if we have an already ordered grouper + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + return result + + # row order is scrambled => sort the rows by position in original index + original_positions = Index( + np.concatenate(self._get_indices(self.grouper.result_index)) + ) + 
result.set_axis(original_positions, axis=self.axis, inplace=True) + result = result.sort_index(axis=self.axis) + + dropped_rows = len(result.index) < len(self.obj.index) + + if dropped_rows: + # get index by slicing original index according to original positions + # slice drops attrs => use set_axis when no rows were dropped + sorted_indexer = result.index + result.index = self._selected_obj.index[sorted_indexer] + else: + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + return result @final diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index e38fa5e8de87e..ab568e24ff029 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -171,36 +171,53 @@ def test_grouper_dropna_propagation(dropna): @pytest.mark.parametrize( - "dropna,df_expected,s_expected", + "dropna,input_index,expected_data,expected_index", [ - pytest.param( + (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)), + (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")), + ( True, - pd.DataFrame({"B": [2, 2, 1]}), - pd.Series(data=[2, 2, 1], name="B"), - marks=pytest.mark.xfail(raises=ValueError), + pd.MultiIndex.from_tuples( + [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] + ), + {"B": [2, 2, 1]}, + pd.MultiIndex.from_tuples( + [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"] + ), ), + (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)), + (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")), ( False, - pd.DataFrame({"B": [2, 2, 1, 1]}), - pd.Series(data=[2, 2, 1, 1], name="B"), + pd.MultiIndex.from_tuples( + [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] + ), + {"B": [2, 2, 1, 1]}, + pd.MultiIndex.from_tuples( + [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] + ), ), ], ) -def test_slice_groupby_then_transform(dropna, df_expected, s_expected): - # GH35014 +def 
test_groupby_dataframe_slice_then_transform( + dropna, input_index, expected_data, expected_index +): + # GH35014 & GH35612 - df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index) gb = df.groupby("A", dropna=dropna) - res = gb.transform(len) - tm.assert_frame_equal(res, df_expected) + result = gb.transform(len) + expected = pd.DataFrame(expected_data, index=expected_index) + tm.assert_frame_equal(result, expected) - gb_slice = gb[["B"]] - res = gb_slice.transform(len) - tm.assert_frame_equal(res, df_expected) + result = gb[["B"]].transform(len) + expected = pd.DataFrame(expected_data, index=expected_index) + tm.assert_frame_equal(result, expected) - res = gb["B"].transform(len) - tm.assert_series_equal(res, s_expected) + result = gb["B"].transform(len) + expected = pd.Series(expected_data["B"], index=expected_index, name="B") + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 1d2208592a06d..5205ca3777fc0 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -626,7 +626,7 @@ def test_list_grouper_with_nat(self): [ ( "transform", - Series(name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)), + Series(name=2, dtype=np.float64, index=Index([])), ), ( "agg",