-
-
Notifications
You must be signed in to change notification settings - Fork 19.5k
BUG: Groupby ops on empty objects loses index, columns, dtypes #39940
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
84a0347
17c2396
bb30001
f124da1
2fc70ff
d3e52aa
af55c7d
4ec2eca
782caba
bd51562
c4e1c0d
d00d5bc
379e12a
8266af6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -435,6 +435,7 @@ Groupby/resample/rolling | |
| - Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) | ||
| - Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) | ||
| - Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median` and :meth:`DataFrame.pivot_table` not propagating metadata (:issue:`28283`) | ||
| - Bug where various groupby operations on an empty ``Series`` or ``DataFrame`` would lose the index, columns, and data types (:issue:`26411`) | ||
|
||
| - | ||
|
|
||
| Reshaping | ||
|
|
@@ -450,6 +451,7 @@ Reshaping | |
| - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`) | ||
| - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) | ||
| - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`) | ||
| - Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on an empty ``DataFrame`` (:issue:`13483`) | ||
|
|
||
| Sparse | ||
| ^^^^^^ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -450,13 +450,19 @@ def _wrap_transformed_output( | |
| return result | ||
|
|
||
| def _wrap_applied_output( | ||
| self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False | ||
| self, | ||
| data: Series, | ||
| keys: Index, | ||
| values: Optional[List[Any]], | ||
| not_indexed_same: bool = False, | ||
| ) -> FrameOrSeriesUnion: | ||
| """ | ||
| Wrap the output of SeriesGroupBy.apply into the expected result. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| data : Series | ||
| Input data for groupby operation. | ||
| keys : Index | ||
| Keys of groups that Series was grouped by. | ||
| values : Optional[List[Any]] | ||
|
|
@@ -471,7 +477,10 @@ def _wrap_applied_output( | |
| if len(keys) == 0: | ||
| # GH #6265 | ||
| return self.obj._constructor( | ||
| [], name=self._selection_name, index=keys, dtype=np.float64 | ||
| [], | ||
| name=self._selection_name, | ||
| index=self.grouper.result_index, | ||
| dtype=data.dtype, | ||
| ) | ||
| assert values is not None | ||
|
|
||
|
|
@@ -1229,9 +1238,13 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: | |
|
|
||
| return self.obj._constructor(result, columns=result_columns) | ||
|
|
||
| def _wrap_applied_output(self, keys, values, not_indexed_same=False): | ||
| def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): | ||
| if len(keys) == 0: | ||
| return self.obj._constructor(index=keys) | ||
| result = self.obj._constructor( | ||
| index=self.grouper.result_index, columns=data.columns | ||
| ) | ||
| result = result.astype(data.dtypes.to_dict()) | ||
|
||
| return result | ||
|
|
||
| # GH12824 | ||
| first_not_none = next(com.not_none(*values), None) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1716,15 +1716,40 @@ def test_pivot_table_values_key_error(): | |
| ) | ||
|
|
||
|
|
||
| def test_empty_dataframe_groupby(): | ||
| # GH8093 | ||
| @pytest.mark.parametrize("columns", ["C", ["C"]]) | ||
| @pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) | ||
| @pytest.mark.parametrize( | ||
| "dtypes", | ||
| [ | ||
| "object", | ||
| "int", | ||
| "float", | ||
| {"A": "object", "B": "int", "C": "float"}, | ||
| {"A": "int", "B": "float", "C": "object"}, | ||
| ], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add some other data types to make sure they are preserved (categorical, datetime, datetime w/tz, Int)? If some still don't work, just xfail them and create an issue. |
||
| ) | ||
| @pytest.mark.parametrize( | ||
| "op, args", | ||
| [ | ||
| ["sum", ()], | ||
| ["agg", ("sum",)], | ||
| ["apply", ("sum",)], | ||
| ["transform", ("sum",)], | ||
| ], | ||
| ) | ||
| def test_empty_dataframe_groupby(columns, keys, dtypes, op, args): | ||
| # GH8093 & GH26411 | ||
| df = DataFrame(columns=["A", "B", "C"]) | ||
|
|
||
| result = df.groupby("A").sum() | ||
| expected = DataFrame(columns=["B", "C"], dtype=np.float64) | ||
| expected.index.name = "A" | ||
|
|
||
| tm.assert_frame_equal(result, expected) | ||
| df = df.astype(dtypes) | ||
|
|
||
| result = getattr(df.groupby(keys)[columns], op)(*args) | ||
| if op == "transform": | ||
| expected = df[columns] | ||
| else: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OT: it might be worthwhile to split up this file, as it is getting kind of long.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense - TBH I've never fully understood what tests were meant to be in here. I've always thought of it as tests of the *GroupBy attributes themselves, rather than the computation methods (e.g. sum, apply, etc). If that's the case, then maybe just move any tests that rely on calling computation methods out?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right, test_groupby.py is basically for tests that we correctly construct a groupby object, and the other tests are about actually executing it. Over the years these have slowly been separated out. I think it's time to rename this and be clear about it. |
||
| expected = df.set_index(keys)[columns] | ||
| if len(keys) == 1: | ||
| expected.index.name = keys[0] | ||
| tm.assert_equal(result, expected) | ||
|
|
||
|
|
||
| def test_tuple_as_grouping(): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you be more specific here (which groupby ops)