From f6c6408127119b666951c75c41a515dda1078ab6 Mon Sep 17 00:00:00 2001 From: Kei Date: Sat, 27 Apr 2024 23:28:37 +0800 Subject: [PATCH 01/10] Update compute_dict_like to get all columns --- pandas/core/apply.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 832beeddcef3c..3f7fe1d4c0156 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -471,8 +471,22 @@ def compute_dict_like( keys += [key] * len(key_data) results += key_data - else: + elif is_groupby: # key used for column selection and output + + df = obj.obj + results, keys = [], [] + for key, how in func.items(): + for index in range(df.shape[1]): + col = df.iloc[:, index] + if col.name != key: + continue + + series = obj._gotitem(key, ndim=1, subset=col) + result = getattr(series, op_name)(how, **kwargs) + results.append(result) + keys.append(key) + else: results = [ getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs) for key, how in func.items() @@ -496,11 +510,14 @@ def wrap_results_dict_like( is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data] if all(is_ndframe): - results = dict(zip(result_index, result_data)) + results = [result for result in result_data if not result.empty] keys_to_use: Iterable[Hashable] - keys_to_use = [k for k in result_index if not results[k].empty] + keys_to_use = [k for k, v in zip(result_index, result_data) if not v.empty] # Have to check, if at least one DataFrame is not empty. - keys_to_use = keys_to_use if keys_to_use != [] else result_index + if keys_to_use == []: + keys_to_use = result_index + results = result_data + if selected_obj.ndim == 2: # keys are columns, so we can preserve names ktu = Index(keys_to_use) @@ -509,7 +526,7 @@ def wrap_results_dict_like( axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 result = concat( - {k: results[k] for k in keys_to_use}, + results, axis=axis, keys=keys_to_use, ) From 84ef941f01193910256524e3bff042165ec63c4d Mon Sep 17 00:00:00 2001 From: Kei Date: Sun, 28 Apr 2024 22:31:06 +0800 Subject: [PATCH 02/10] Add tests --- .../tests/groupby/aggregate/test_aggregate.py | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2b9df1b7079da..5fc7e98c91c11 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1663,3 +1663,89 @@ def func(x): msg = "length must not be 0" with pytest.raises(ValueError, match=msg): df.groupby("A", observed=False).agg(func) + + +def test_groupby_aggregation_duplicate_columns_single_dict_value(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": "sum"}) + + expected = DataFrame( + [[7, 9], [5, 6]], columns=["c", "c"], index=Index([1, 2], name="a") + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_multiple_dict_values(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": ["sum", "min", "max", "min"]}) + + expected = DataFrame( + [[7, 3, 4, 3, 9, 4, 5, 4], [5, 5, 5, 5, 6, 6, 6, 6]], + columns=MultiIndex( + levels=[["c"], ["sum", "min", "max"]], + codes=[[0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 1, 0, 1, 2, 1]], + ), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_some_empty_result(): + # GH#55041 + df = DataFrame( + [ + [1, 9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, -546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=["a", "b", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": [], "c": ["var"]}) + + expected = DataFrame( + [[1.509268e11, 30944844.5], [2.178000e03, 0.0]], + columns=MultiIndex(levels=[["c"], ["var"]], codes=[[0, 0], [0, 0]]), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): "min"}) + + expected = DataFrame( + [[-9843, 9], [244, -33]], + columns=MultiIndex(levels=[["level1.1"], ["level2.2"]], codes=[[0, 0], [0, 0]]), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) From e8f31720ca5376683dede157602ca5033bbf38f9 Mon Sep 17 00:00:00 2001 From: Kei Date: Sun, 28 Apr 2024 22:36:26 +0800 Subject: [PATCH 03/10] Update rst --- doc/source/whatsnew/v3.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c77348b365370..b2cca04e5dd7f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -441,6 +441,8 @@ Groupby/resample/rolling - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) +- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. +(:issue:`55041`) Reshaping From 79a8ea649d3e4e1a36512fe0353ce0172e5632ac Mon Sep 17 00:00:00 2001 From: Kei Date: Sun, 28 Apr 2024 23:36:11 +0800 Subject: [PATCH 04/10] Remove newline from rst --- doc/source/whatsnew/v3.0.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b2cca04e5dd7f..7e49b532c66b5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -440,9 +440,8 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) +- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) -- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. -(:issue:`55041`) Reshaping From c8ca7f734df0bffbe853984337bb4644a219aba0 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 2 May 2024 17:15:32 +0800 Subject: [PATCH 05/10] Project the columns before converting to series group by --- pandas/core/apply.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 3f7fe1d4c0156..d84c44dfe2d09 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -474,18 +474,19 @@ def compute_dict_like( elif is_groupby: # key used for column selection and output - df = obj.obj + df = selected_obj results, keys = [], [] for key, how in func.items(): - for index in range(df.shape[1]): - col = df.iloc[:, index] - if col.name != key: - continue + cols = df[key] + + for index in range(cols.shape[1]): + col = cols.iloc[:, index] series = obj._gotitem(key, ndim=1, subset=col) result = getattr(series, op_name)(how, **kwargs) results.append(result) keys.append(key) + else: results = [ getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs) @@ -525,11 +526,18 @@ def wrap_results_dict_like( keys_to_use = ktu axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 - result = concat( - results, - axis=axis, - keys=keys_to_use, - ) + if len(keys_to_use) == 0: + result = concat( + results, + axis=axis, + ) + else: + result = concat( + results, + axis=axis, + keys=keys_to_use, + ) + elif any(is_ndframe): # There is a mix of NDFrames and scalars raise ValueError( From 5dd4fbcc195c997dd16ce951ad4833a335ea57c7 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 2 May 2024 17:24:41 +0800 Subject: [PATCH 06/10] retrigger doc build From a3eac473b595aea023cc892806b0d9f66bebf45c Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 2 May 2024 18:08:14 +0800 Subject: [PATCH 07/10] Account for 1d/series projection result --- pandas/core/apply.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d84c44dfe2d09..9dffc5d928f81 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -479,10 +479,16 @@ def compute_dict_like( for key, how in func.items(): cols = df[key] - for index in range(cols.shape[1]): - col = cols.iloc[:, index] + if cols.ndim == 1: + series_list = [obj._gotitem(key, ndim=1, subset=cols)] + else: + for index in range(cols.shape[1]): + col = cols.iloc[:, index] + + series = obj._gotitem(key, ndim=1, subset=col) + series_list.append(series) - series = obj._gotitem(key, ndim=1, subset=col) + for series in series_list: result = getattr(series, op_name)(how, **kwargs) results.append(result) keys.append(key) From 7f565cadabaa5ed2df345c5c7ea27f56ffa0afba Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 2 May 2024 21:21:31 +0800 Subject: [PATCH 08/10] Declare var before assignment --- pandas/core/apply.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9dffc5d928f81..db384290323cd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -482,6 +482,7 @@ def compute_dict_like( if cols.ndim == 1: series_list = [obj._gotitem(key, ndim=1, subset=cols)] else: + series_list = [] for index in range(cols.shape[1]): col = cols.iloc[:, index] From de5948d07ecd188997274ff9e6aef52e71407f1e Mon Sep 17 00:00:00 2001 From: Kei Date: Fri, 3 May 2024 13:53:31 +0800 Subject: [PATCH 09/10] Remove if condition --- pandas/core/apply.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index db384290323cd..cfd8dceeae9e6 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -533,18 +533,11 @@ def wrap_results_dict_like( keys_to_use = ktu axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 - if len(keys_to_use) == 0: - result = concat( - results, - axis=axis, - ) - else: - result = concat( - results, - axis=axis, - keys=keys_to_use, - ) - + result = concat( + results, + axis=axis, + keys=keys_to_use, + ) elif any(is_ndframe): # There is a mix of NDFrames and scalars raise ValueError( From dc1b44928f6ad88015964a82c82368fed28e56a5 Mon Sep 17 00:00:00 2001 From: ellaella12 Date: Sun, 12 May 2024 16:37:52 +0800 Subject: [PATCH 10/10] Add test to test agg list funcs --- .../tests/groupby/aggregate/test_aggregate.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 5fc7e98c91c11..729bfc338691f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1749,3 +1749,35 @@ def test_groupby_aggregation_multi_index_duplicate_columns(): index=Index(["level1.1", "level1.2"]), ) tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): ["min", "max"]}) + + expected = DataFrame( + [[-9843, 940, 9, 546], [244, 244, -33, -33]], + columns=MultiIndex( + levels=[["level1.1"], ["level2.2"], ["min", "max"]], + codes=[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1]], + ), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected)