diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index afe361da1114d..4020660f2fd49 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -364,6 +364,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) +- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns returning an incorrect type when a dict-like argument is passed in (:issue:`51099`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) - diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 18a0fed915384..31a61bd01131e 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -412,29 +412,55 @@ def agg_dict_like(self) -> DataFrame | Series: context_manager = com.temp_setattr(obj, "as_index", True) else: context_manager = nullcontext() + + is_non_unique_col = ( + selected_obj.ndim == 2 + and selected_obj.columns.nunique() < len(selected_obj.columns) + ) + with context_manager: if selected_obj.ndim == 1: # key only used for output colg = obj._gotitem(selection, ndim=1) - results = {key: colg.agg(how) for key, how in arg.items()} + result_data = [colg.agg(how) for _, how in arg.items()] + result_index = list(arg.keys()) + elif is_non_unique_col: + # key used for column selection and output + # GH#51099 + result_data = [] + result_index = [] + for key, how in arg.items(): + indices = selected_obj.columns.get_indexer_for([key]) + labels = selected_obj.columns.take(indices) + label_to_indices = defaultdict(list) + for index, label in zip(indices, labels): + label_to_indices[label].append(index) + + key_data = [ + selected_obj._ixs(indice, axis=1).agg(how) + for label, indices in label_to_indices.items() + for indice 
in indices + ] + + result_index += [key] * len(key_data) + result_data += key_data else: # key used for column selection and output - results = { - key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() - } - - # set the final keys - keys = list(arg.keys()) + result_data = [ + obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() + ] + result_index = list(arg.keys()) # Avoid making two isinstance calls in all and any below - is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] + is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data] # combine results if all(is_ndframe): + results = dict(zip(result_index, result_data)) keys_to_use: Iterable[Hashable] - keys_to_use = [k for k in keys if not results[k].empty] + keys_to_use = [k for k in result_index if not results[k].empty] # Have to check, if at least one DataFrame is not empty. - keys_to_use = keys_to_use if keys_to_use != [] else keys + keys_to_use = keys_to_use if keys_to_use != [] else result_index if selected_obj.ndim == 2: # keys are columns, so we can preserve names ktu = Index(keys_to_use) @@ -457,7 +483,7 @@ def agg_dict_like(self) -> DataFrame | Series: else: from pandas import Series - # we have a dict of scalars + # we have a list of scalars # GH 36212 use name only if obj is a series if obj.ndim == 1: obj = cast("Series", obj) @@ -465,7 +491,7 @@ def agg_dict_like(self) -> DataFrame | Series: else: name = None - result = Series(results, name=name) + result = Series(result_data, index=result_index, name=name) return result diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index f08baff1c5d65..0397f8cae3ac7 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1496,3 +1496,15 @@ def test_agg_std(): result = df.agg([np.std]) expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"]) tm.assert_frame_equal(result, expected) + + +def test_agg_dist_like_and_nonunique_columns(): + # 
GH#51099 + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + df.columns = ["A", "A", "C"] + + result = df.agg({"A": "count"}) + expected = df["A"].count() + tm.assert_series_equal(result, expected)