From 92966d69b43c7673f4ae286d2357344d3dd1a2db Mon Sep 17 00:00:00 2001 From: luke <2736230899@qq.com> Date: Wed, 22 Mar 2023 18:42:51 +0800 Subject: [PATCH 1/7] BUG: Agg in non-unique col --- pandas/core/apply.py | 21 ++++++++++++++++++--- pandas/tests/apply/test_frame_apply.py | 12 ++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 08618d5a6aa16..34119b1894709 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -416,9 +416,14 @@ def agg_dict_like(self) -> DataFrame | Series: results = {key: colg.agg(how) for key, how in arg.items()} else: # key used for column selection and output - results = { - key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() - } + results = {} + for key, how in arg.items(): + indices = [i for i, col in enumerate(obj.columns) if col == key] + if len(indices) == 1: # for unique columns + results[key] = obj._gotitem(key, ndim=1).agg(how) + else: # for non-unique columns + col_results = [obj.iloc[:, i].agg(how) for i in indices] + results[key] = col_results # set the final keys keys = list(arg.keys()) @@ -426,6 +431,8 @@ def agg_dict_like(self) -> DataFrame | Series: # Avoid making two isinstance calls in all and any below is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] + is_list = [isinstance(v, list) for v in results.values()] + # combine results if all(is_ndframe): keys_to_use: Iterable[Hashable] @@ -451,6 +458,14 @@ def agg_dict_like(self) -> DataFrame | Series: "and transformation operations " "simultaneously" ) + elif any(is_list): + # GH#51099 + # convert list-like values in results to Series with corresponding keys + from pandas import Series + + values = [val for sublist in results.values() for val in sublist] + keys = [key for key, sublist in results.items() for _ in sublist] + result = Series(values, index=keys) else: from pandas import Series diff --git a/pandas/tests/apply/test_frame_apply.py 
b/pandas/tests/apply/test_frame_apply.py index 6ed3f6140d361..bf79b5efa3acc 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1642,3 +1642,15 @@ def foo2(x, b=2, c=0): columns=MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]), ) tm.assert_frame_equal(result, expected) + + +def test_agg_dist_like_and_nonunique_columns(): + # GH#51099 + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + df.columns = ["A", "A", "C"] + + result = df.agg({"A": "count"}) # same with 'apply' instead of 'agg' + expected = df["A"].count() + tm.assert_series_equal(result, expected) From d830964aca0e1ec35edad3d06c578384c1de474f Mon Sep 17 00:00:00 2001 From: luke <2736230899@qq.com> Date: Wed, 22 Mar 2023 18:58:50 +0800 Subject: [PATCH 2/7] what is new --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ec5d08e75f0e4..f691c9b9be179 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1365,6 +1365,7 @@ Reshaping - Bug in :meth:`DataFrame.explode` raising ``ValueError`` on multiple columns with ``NaN`` values or empty lists (:issue:`46084`) - Bug in :meth:`DataFrame.transpose` with ``IntervalDtype`` column with ``timedelta64[ns]`` endpoints (:issue:`44917`) - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` would ignore arguments when passed a list of functions (:issue:`50863`) +- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` would return incorrect type when dist-like argument passed in (:issue:`51099`) Sparse ^^^^^^ From 9c21e639646aae5251a23890f3e20e2d0414f7bf Mon Sep 17 00:00:00 2001 From: luke <2736230899@qq.com> Date: Tue, 28 Mar 2023 11:13:20 +0800 Subject: [PATCH 3/7] Fix bug but add more codes --- pandas/core/apply.py | 56 +++++++++++++++----------- pandas/tests/apply/test_frame_apply.py | 2 +- 2 files changed, 34 insertions(+), 24 deletions(-) 
diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 34119b1894709..9415dfa6a1446 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -409,30 +409,38 @@ def agg_dict_like(self) -> DataFrame | Series: context_manager = com.temp_setattr(obj, "as_index", True) else: context_manager = nullcontext() + + if isinstance(selected_obj, ABCDataFrame): + is_non_unique_col = selected_obj.columns.duplicated() + else: + is_non_unique_col = [False] + with context_manager: if selected_obj.ndim == 1: # key only used for output - colg = obj._gotitem(selection, ndim=1) - results = {key: colg.agg(how) for key, how in arg.items()} - else: - # key used for column selection and output + key_res = obj._gotitem(selection, ndim=1) + results = {key: key_res.agg(how) for key, how in arg.items()} + elif any(is_non_unique_col): + # GH#51099 + # results is a dict of lists results = {} for key, how in arg.items(): - indices = [i for i, col in enumerate(obj.columns) if col == key] - if len(indices) == 1: # for unique columns - results[key] = obj._gotitem(key, ndim=1).agg(how) - else: # for non-unique columns - col_results = [obj.iloc[:, i].agg(how) for i in indices] - results[key] = col_results - + key_res = [] + for col_idx in selected_obj.columns.get_indexer_for([key]): + col = selected_obj.iloc[:, col_idx] + key_res.append(col.agg(how)) + results[key] = key_res + else: + # key used for column selection and output + results = { + key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() + } # set the final keys keys = list(arg.keys()) # Avoid making two isinstance calls in all and any below is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] - is_list = [isinstance(v, list) for v in results.values()] - # combine results if all(is_ndframe): keys_to_use: Iterable[Hashable] @@ -458,18 +466,10 @@ def agg_dict_like(self) -> DataFrame | Series: "and transformation operations " "simultaneously" ) - elif any(is_list): - # GH#51099 - # convert list-like 
values in results to Series with corresponding keys - from pandas import Series - - values = [val for sublist in results.values() for val in sublist] - keys = [key for key, sublist in results.items() for _ in sublist] - result = Series(values, index=keys) else: from pandas import Series - # we have a dict of scalars + # we have a dict of scalars or a list of scalars # GH 36212 use name only if obj is a series if obj.ndim == 1: obj = cast("Series", obj) @@ -477,7 +477,17 @@ def agg_dict_like(self) -> DataFrame | Series: else: name = None - result = Series(results, name=name) + if any(is_non_unique_col): + # Expand the scalar list and construct a series. + series_list = [] + for key, value in results.items(): + assert isinstance(value, list) + series_list.append(Series(value, index=[key] * len(value))) + + result = concat(series_list, axis=0) + result.name = name + else: + result = Series(results, name=name) return result diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index f17aa2ebb6e32..f6907c0cea09b 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1674,6 +1674,6 @@ def test_agg_dist_like_and_nonunique_columns(): ) df.columns = ["A", "A", "C"] - result = df.agg({"A": "count"}) # same with 'apply' instead of 'agg' + result = df.agg({"A": "count"}) expected = df["A"].count() tm.assert_series_equal(result, expected) From f03122b518509e65628d55d48ad83f3809207d73 Mon Sep 17 00:00:00 2001 From: luke <2736230899@qq.com> Date: Tue, 28 Mar 2023 16:03:40 +0800 Subject: [PATCH 4/7] Fix mypy and improve what's new --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/apply.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7dce93730bb69..87a41a01ec81f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1366,7 +1366,7 @@ Reshaping - Bug in 
:meth:`DataFrame.explode` raising ``ValueError`` on multiple columns with ``NaN`` values or empty lists (:issue:`46084`) - Bug in :meth:`DataFrame.transpose` with ``IntervalDtype`` column with ``timedelta64[ns]`` endpoints (:issue:`44917`) - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` would ignore arguments when passed a list of functions (:issue:`50863`) -- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` would return incorrect type when dist-like argument passed in (:issue:`51099`) +- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when dist-like argument passed in (:issue:`51099`) Sparse ^^^^^^ diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9415dfa6a1446..be346515b77b5 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -411,7 +411,7 @@ def agg_dict_like(self) -> DataFrame | Series: context_manager = nullcontext() if isinstance(selected_obj, ABCDataFrame): - is_non_unique_col = selected_obj.columns.duplicated() + is_non_unique_col = selected_obj.columns.duplicated().tolist() else: is_non_unique_col = [False] From 14ef7e98e6c42bacc2dd8eb2c034b75480f54600 Mon Sep 17 00:00:00 2001 From: luke <2736230899@qq.com> Date: Sat, 1 Apr 2023 17:01:39 +0800 Subject: [PATCH 5/7] Improve preformance --- pandas/core/apply.py | 51 +++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index cbe1d77d4ff0b..c397cf28ce3b4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -411,26 +411,33 @@ def agg_dict_like(self) -> DataFrame | Series: else: context_manager = nullcontext() - if isinstance(selected_obj, ABCDataFrame): - is_non_unique_col = selected_obj.columns.duplicated().tolist() - else: - is_non_unique_col = [False] + is_non_unique_col = ( + selected_obj.ndim == 2 + and selected_obj.columns.nunique() < len(selected_obj.columns) + ) with context_manager: if selected_obj.ndim == 1: 
# key only used for output - key_res = obj._gotitem(selection, ndim=1) - results = {key: key_res.agg(how) for key, how in arg.items()} - elif any(is_non_unique_col): + colg = obj._gotitem(selection, ndim=1) + results = {key: colg.agg(how) for key, how in arg.items()} + elif is_non_unique_col: # GH#51099 - # results is a dict of lists - results = {} + result_data = [] + result_index = [] for key, how in arg.items(): - key_res = [] - for col_idx in selected_obj.columns.get_indexer_for([key]): - col = selected_obj.iloc[:, col_idx] - key_res.append(col.agg(how)) - results[key] = key_res + indices = selected_obj.columns.get_indexer_for([key]) + labels = selected_obj.columns.take(indices) + label_to_indices = defaultdict(list) + for index, label in zip(indices, labels): + label_to_indices[label].append(index) + + for indices in label_to_indices.values(): + for indice in indices: + result_index.append(key) + result_data.append( + selected_obj._ixs(indice, axis=1).agg(how) + ) else: # key used for column selection and output results = { @@ -440,7 +447,10 @@ def agg_dict_like(self) -> DataFrame | Series: keys = list(arg.keys()) # Avoid making two isinstance calls in all and any below - is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] + if is_non_unique_col: + is_ndframe = [False] + else: + is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] # combine results if all(is_ndframe): @@ -478,15 +488,8 @@ def agg_dict_like(self) -> DataFrame | Series: else: name = None - if any(is_non_unique_col): - # Expand the scalar list and construct a series. 
- series_list = [] - for key, value in results.items(): - assert isinstance(value, list) - series_list.append(Series(value, index=[key] * len(value))) - - result = concat(series_list, axis=0) - result.name = name + if is_non_unique_col: + result = Series(result_data, index=result_index, name=name) else: result = Series(results, name=name) From e533f430486637d803c1bf6f1368c7a74f061871 Mon Sep 17 00:00:00 2001 From: luke <2736230899@qq.com> Date: Mon, 3 Apr 2023 10:14:43 +0800 Subject: [PATCH 6/7] Improve preformance --- pandas/core/apply.py | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c397cf28ce3b4..01e018b2eaaeb 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -420,8 +420,10 @@ def agg_dict_like(self) -> DataFrame | Series: if selected_obj.ndim == 1: # key only used for output colg = obj._gotitem(selection, ndim=1) - results = {key: colg.agg(how) for key, how in arg.items()} + result_data = [colg.agg(how) for _, how in arg.items()] + result_index = list(arg.keys()) elif is_non_unique_col: + # key used for column selection and output # GH#51099 result_data = [] result_index = [] @@ -432,32 +434,31 @@ def agg_dict_like(self) -> DataFrame | Series: for index, label in zip(indices, labels): label_to_indices[label].append(index) - for indices in label_to_indices.values(): - for indice in indices: - result_index.append(key) - result_data.append( - selected_obj._ixs(indice, axis=1).agg(how) - ) + key_data = [ + selected_obj._ixs(indice, axis=1).agg(how) + for label, indices in label_to_indices.items() + for indice in indices + ] + + result_index += [key] * len(key_data) + result_data += key_data else: # key used for column selection and output - results = { - key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() - } - # set the final keys - keys = list(arg.keys()) + result_data = [ + obj._gotitem(key, ndim=1).agg(how) for key, 
how in arg.items() + ] + result_index = list(arg.keys()) # Avoid making two isinstance calls in all and any below - if is_non_unique_col: - is_ndframe = [False] - else: - is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] + is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data] # combine results if all(is_ndframe): + results = dict(zip(result_index, result_data)) keys_to_use: Iterable[Hashable] - keys_to_use = [k for k in keys if not results[k].empty] + keys_to_use = [k for k in result_index if not results[k].empty] # Have to check, if at least one DataFrame is not empty. - keys_to_use = keys_to_use if keys_to_use != [] else keys + keys_to_use = keys_to_use if keys_to_use != [] else result_index if selected_obj.ndim == 2: # keys are columns, so we can preserve names ktu = Index(keys_to_use) @@ -488,10 +489,7 @@ def agg_dict_like(self) -> DataFrame | Series: else: name = None - if is_non_unique_col: - result = Series(result_data, index=result_index, name=name) - else: - result = Series(results, name=name) + result = Series(result_data, index=result_index, name=name) return result From b2c32b97a96fd9ecd02d375651ac4b47505712e7 Mon Sep 17 00:00:00 2001 From: luke <2736230899@qq.com> Date: Tue, 11 Apr 2023 10:59:14 +0800 Subject: [PATCH 7/7] Improve what'new and comment --- doc/source/whatsnew/v2.0.0.rst | 1 - doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/apply.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e401d89f157c2..2ee6ecc4e6cd4 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1372,7 +1372,6 @@ Reshaping - Bug in :meth:`DataFrame.explode` raising ``ValueError`` on multiple columns with ``NaN`` values or empty lists (:issue:`46084`) - Bug in :meth:`DataFrame.transpose` with ``IntervalDtype`` column with ``timedelta64[ns]`` endpoints (:issue:`44917`) - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` would 
ignore arguments when passed a list of functions (:issue:`50863`) -- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when dist-like argument passed in (:issue:`51099`) Sparse ^^^^^^ diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9f5d6011a7780..8df644b0ccb41 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -308,6 +308,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) +- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return an incorrect type when a dict-like argument is passed in (:issue:`51099`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) - diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4c6611286ad5b..c8e189eeadebd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -483,7 +483,7 @@ def agg_dict_like(self) -> DataFrame | Series: else: from pandas import Series - # we have a dict of scalars or a list of scalars + # we have a list of scalars # GH 36212 use name only if obj is a series if obj.ndim == 1: obj = cast("Series", obj)