diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 7555fb50f16af..b954600016d4b 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -21,7 +21,10 @@ import numpy as np -from pandas._config import option_context +from pandas._config import ( + get_option, + option_context, +) from pandas._libs import lib from pandas._typing import ( @@ -82,6 +85,7 @@ def frame_apply( result_type: str | None = None, args=None, kwargs=None, + renamer=None, ) -> FrameApply: """construct and return a row or column based frame apply object""" axis = obj._get_axis_number(axis) @@ -98,6 +102,7 @@ def frame_apply( result_type=result_type, args=args, kwargs=kwargs, + renamer=renamer, ) @@ -112,6 +117,7 @@ def __init__( result_type: str | None, args, kwargs, + renamer=None, ): self.obj = obj self.raw = raw @@ -141,6 +147,7 @@ def f(x): self.orig_f: AggFuncType = func self.f: AggFuncType = f + self.renamer = renamer @abc.abstractmethod def apply(self) -> DataFrame | Series: @@ -164,10 +171,16 @@ def agg(self) -> DataFrame | Series | None: return self.apply_str() if is_dict_like(arg): - return self.agg_dict_like() + if get_option("new_udf_methods"): + return self.new_dict_like("agg") + else: + return self.agg_dict_like() elif is_list_like(arg): # we require a list, but not a 'str' - return self.agg_list_like() + if get_option("new_udf_methods"): + return self.new_list_like("agg") + else: + return self.agg_list_like() if callable(arg): f = com.get_cython_func(arg) @@ -408,6 +421,70 @@ def agg_list_like(self) -> DataFrame | Series: ) return concatenated.reindex(full_ordered_index, copy=False) + def new_list_like(self, method: str) -> DataFrame | Series: + """ + Compute aggregation in the case of a list-like argument. + + Returns + ------- + Result of aggregation. 
+ """ + from pandas.core.reshape.concat import concat + + obj = self.obj + arg = cast(List[AggFuncTypeBase], self.f) + + results = [] + keys = [] + result_dim = None + + for a in arg: + name = None + try: + if isinstance(a, (tuple, list)): + # Handle (name, value) pairs + name, a = a + new_res = getattr(obj, method)(a) + if result_dim is None: + result_dim = getattr(new_res, "ndim", 0) + elif getattr(new_res, "ndim", 0) != result_dim: + raise ValueError( + "cannot combine transform and aggregation operations" + ) + except TypeError: + pass + else: + results.append(new_res) + + # make sure we find a good name + if name is None: + name = com.get_callable_name(a) or a + keys.append(name) + + # if we are empty + if not len(results): + raise ValueError("no results") + + try: + concatenated = concat(results, keys=keys, axis=1, sort=False) + except TypeError: + # we are concatting non-NDFrame objects, + # e.g. a list of scalars + from pandas import Series + + result = Series(results, index=keys, name=obj.name) + return result + else: + # Concat uses the first index to determine the final indexing order. + # The union of a shorter first index with the other indices causes + # the index sorting to be different from the order of the aggregating + # functions. Reindex if this is the case. + index_size = concatenated.index.size + full_ordered_index = next( + result.index for result in results if result.index.size == index_size + ) + return concatenated.reindex(full_ordered_index, copy=False) + def agg_dict_like(self) -> DataFrame | Series: """ Compute aggregation in the case of a dict-like argument. @@ -486,6 +563,86 @@ def agg_dict_like(self) -> DataFrame | Series: return result + def new_dict_like(self, method: str) -> DataFrame | Series: + """ + Compute aggregation in the case of a dict-like argument. + + Returns + ------- + Result of aggregation. 
+ """ + from pandas import Index + from pandas.core.reshape.concat import concat + + obj = self.obj + arg = cast(AggFuncTypeDict, self.f) + + if not isinstance(obj, SelectionMixin): + # i.e. obj is Series or DataFrame + selected_obj = obj + selection = None + else: + selected_obj = obj._selected_obj + selection = obj._selection + + arg = self.normalize_dictlike_arg("agg", selected_obj, arg) + + if selected_obj.ndim == 1: + # key only used for output + colg = obj._gotitem(selection, ndim=1) + results = {key: getattr(colg, method)(how) for key, how in arg.items()} + + else: + # key used for column selection and output + results = { + key: getattr(obj._gotitem(key, ndim=1), method)(how) + for key, how in arg.items() + } + if self.renamer is not None: + for key, columns in self.renamer.items(): + results[key].columns = columns + + # Avoid making two isinstance calls in all and any below + if isinstance(results, dict): + is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] + else: + is_ndframe = [isinstance(r, ABCNDFrame) for r in results] + + # combine results + result: DataFrame | Series + if all(is_ndframe): + keys_to_use: Iterable[Hashable] + keys_to_use = [k for k in arg.keys() if not results[k].empty] + keys_to_use = keys_to_use if keys_to_use != [] else arg.keys() + if selected_obj.ndim == 2: + # keys are columns, so we can preserve names + ktu = Index(keys_to_use) + ktu._set_names(selected_obj.columns.names) + keys_to_use = ktu + keys = None if selected_obj.ndim == 1 else keys_to_use + result = concat({k: results[k] for k in keys_to_use}, keys=keys, axis=1) + elif any(is_ndframe): + # There is a mix of NDFrames and scalars + raise ValueError( + "cannot perform both aggregation " + "and transformation operations " + "simultaneously" + ) + else: + from pandas import Series + + # we have a dict of scalars + # GH 36212 use name only if obj is a series + if obj.ndim == 1: + obj = cast("Series", obj) + name = obj.name + else: + name = None + + result = 
Series(results, index=arg.keys(), name=name) + + return result + def apply_str(self) -> DataFrame | Series: """ Compute apply in case of a string. @@ -522,6 +679,35 @@ def apply_multiple(self) -> DataFrame | Series: """ return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) + def new_apply_multiple(self) -> DataFrame | Series: + """ + Compute apply in case of a list-like or dict-like. + + Returns + ------- + result: Series, DataFrame, or None + Result when self.f is a list-like or dict-like, None otherwise. + """ + obj = self.obj + axis = self.axis + + self.obj = obj if axis == 0 else obj.T + self.axis = 0 + + try: + if is_dict_like(self.f): + result = self.new_dict_like("apply") + else: + result = self.new_list_like("apply") + finally: + self.obj = obj + self.axis = axis + + if axis == 1: + result = result.T if result is not None else result + + return result + def normalize_dictlike_arg( self, how: str, obj: DataFrame | Series, func: AggFuncTypeDict ) -> AggFuncTypeDict: @@ -661,7 +847,10 @@ def apply(self) -> DataFrame | Series: """compute the results""" # dispatch to agg if is_list_like(self.f): - return self.apply_multiple() + if get_option("new_udf_methods"): + return self.new_apply_multiple() + else: + return self.apply_multiple() # all empty if len(self.columns) == 0 and len(self.index) == 0: @@ -1039,7 +1228,10 @@ def apply(self) -> DataFrame | Series: # dispatch to agg if is_list_like(self.f): - return self.apply_multiple() + if get_option("new_udf_methods"): + return self.new_apply_multiple() + else: + return self.apply_multiple() if isinstance(self.f, str): # if we are a string, try to dispatch @@ -1172,7 +1364,13 @@ def transform(self): def reconstruct_func( func: AggFuncType | None, **kwargs -) -> tuple[bool, AggFuncType | None, list[str] | None, list[int] | None]: +) -> tuple[ + bool, + AggFuncType | None, + list[str] | None, + list[int] | None, + dict[str, list[str]] | None, +]: """ This is the internal function to reconstruct func 
given if there is relabeling or not and also normalize the keyword to get new order of columns. @@ -1204,14 +1402,16 @@ def reconstruct_func( Examples -------- >>> reconstruct_func(None, **{"foo": ("col", "min")}) - (True, defaultdict(, {'col': ['min']}), ('foo',), array([0])) + (True, defaultdict(, {'col': ['min']}), ('foo',), array([0]), + defaultdict(, {'col': ['foo']})) >>> reconstruct_func("min") - (False, 'min', None, None) + (False, 'min', None, None, None) """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) columns: list[str] | None = None order: list[int] | None = None + renamer: dict[str, list[str]] | None = None if not relabeling: if isinstance(func, list) and len(func) > len(set(func)): @@ -1227,9 +1427,9 @@ def reconstruct_func( raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") if relabeling: - func, columns, order = normalize_keyword_aggregation(kwargs) + func, columns, order, renamer = normalize_keyword_aggregation(kwargs) - return relabeling, func, columns, order + return relabeling, func, columns, order, renamer def is_multi_agg_with_relabel(**kwargs) -> bool: @@ -1258,7 +1458,9 @@ def is_multi_agg_with_relabel(**kwargs) -> bool: ) -def normalize_keyword_aggregation(kwargs: dict) -> tuple[dict, list[str], list[int]]: +def normalize_keyword_aggregation( + kwargs: dict, +) -> tuple[dict, list[str], list[int], dict[str, list]]: """ Normalize user-provided "named aggregation" kwargs. 
Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs @@ -1280,7 +1482,8 @@ def normalize_keyword_aggregation(kwargs: dict) -> tuple[dict, list[str], list[i Examples -------- >>> normalize_keyword_aggregation({"output": ("input", "sum")}) - (defaultdict(, {'input': ['sum']}), ('output',), array([0])) + (defaultdict(, {'input': ['sum']}), ('output',), array([0]), + defaultdict(, {'input': ['output']})) """ from pandas.core.indexes.base import Index @@ -1290,11 +1493,13 @@ def normalize_keyword_aggregation(kwargs: dict) -> tuple[dict, list[str], list[i # May be hitting https://github.com/python/mypy/issues/5958 # saying it doesn't have an attribute __name__ aggspec: DefaultDict = defaultdict(list) + renamer: DefaultDict = defaultdict(list) order = [] columns, pairs = list(zip(*kwargs.items())) - for column, aggfunc in pairs: + for name, (column, aggfunc) in zip(kwargs, pairs): aggspec[column].append(aggfunc) + renamer[column].append(name) order.append((column, com.get_callable_name(aggfunc) or aggfunc)) # uniquify aggfunc name if duplicated in order list @@ -1314,7 +1519,7 @@ def normalize_keyword_aggregation(kwargs: dict) -> tuple[dict, list[str], list[i col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) # error: Incompatible return value type (got "Tuple[defaultdict[Any, Any], # Any, ndarray]", expected "Tuple[Dict[Any, Any], List[str], List[int]]") - return aggspec, columns, col_idx_order # type: ignore[return-value] + return aggspec, columns, col_idx_order, renamer # type: ignore[return-value] def _make_unique_kwarg_list( diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index cf41bcff3d0c8..2df98a59cb184 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -511,6 +511,23 @@ def use_inf_as_na_cb(key): validator=is_one_of_factory(["block", "array"]), ) +new_udf_methods = """ +: boolean + Whether to use the new UDF method implementations. Currently experimental. + Defaults to False. 
+""" + + +with cf.config_prefix("mode"): + cf.register_option( + "new_udf_methods", + # Get the default from an environment variable, if set, otherwise defaults + # to False. This environment variable can be set for testing. + os.environ.get("PANDAS_NEW_UDF_METHODS", "false").lower() == "true", + new_udf_methods, + validator=is_bool, + ) + # user warnings chained_assignment = """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bfdfeabbd389c..853f7009113d5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8609,9 +8609,11 @@ def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): axis = self._get_axis_number(axis) - relabeling, func, columns, order = reconstruct_func(func, **kwargs) + relabeling, func, columns, order, renamer = reconstruct_func(func, **kwargs) - op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) + op = frame_apply( + self, func=func, axis=axis, args=args, kwargs=kwargs, renamer=renamer + ) result = op.agg() if relabeling: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 38f1d41494fd2..833d011e4745d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -26,6 +26,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import reduction as libreduction from pandas._typing import ( ArrayLike, @@ -883,7 +885,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) index = self.grouper.result_index return self.obj._constructor(result, index=index, columns=data.columns) - relabeling, func, columns, order = reconstruct_func(func, **kwargs) + relabeling, func, columns, order, _ = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) op = GroupByApply(self, func, args, kwargs) @@ -897,49 +899,65 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result.columns = columns if result is None: - - # grouper specific aggregations - if
self.grouper.nkeys > 1: - # test_groupby_as_index_series_scalar gets here with 'not self.as_index' - return self._python_agg_general(func, *args, **kwargs) - elif args or kwargs: - # test_pass_args_kwargs gets here (with and without as_index) - # can't return early - result = self._aggregate_frame(func, *args, **kwargs) - - elif self.axis == 1: - # _aggregate_multiple_funcs does not allow self.axis == 1 - # Note: axis == 1 precludes 'not self.as_index', see __init__ - result = self._aggregate_frame(func) - return result - + if get_option("new_udf_methods"): + if args or kwargs: + # test_pass_args_kwargs gets here (with and without as_index) + # can't return early + result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1 and self.grouper.nkeys == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + # Note: axis == 1 precludes 'not self.as_index', see __init__ + result = self._aggregate_frame(func) + return result + else: + # test_groupby_as_index_series_scalar gets here + # with 'not self.as_index' + return self._python_agg_general(func, *args, **kwargs) else: - - # try to treat as if we are passing a list - gba = GroupByApply(self, [func], args=(), kwargs={}) - try: - result = gba.agg() - - except ValueError as err: - if "no results" not in str(err): - # raised directly by _aggregate_multiple_funcs - raise + # grouper specific aggregations + if self.grouper.nkeys > 1: + # test_groupby_as_index_series_scalar gets here with + # 'not self.as_index' + return self._python_agg_general(func, *args, **kwargs) + elif args or kwargs: + # test_pass_args_kwargs gets here (with and without as_index) + # can't return early + result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + # Note: axis == 1 precludes 'not self.as_index', see __init__ result = self._aggregate_frame(func) + return result else: - sobj = self._selected_obj + # try to treat as if we are 
passing a list + gba = GroupByApply(self, [func], args=(), kwargs={}) + try: + result = gba.agg() + + except ValueError as err: + if "no results" not in str(err): + # raised directly by _aggregate_multiple_funcs + raise + result = self._aggregate_frame(func) - if isinstance(sobj, Series): - # GH#35246 test_groupby_as_index_select_column_sum_empty_df - result.columns = self._obj_with_exclusions.columns.copy() else: - # Retain our column names - result.columns._set_names( - sobj.columns.names, level=list(range(sobj.columns.nlevels)) - ) - # select everything except for the last level, which is the one - # containing the name of the function(s), see GH#32040 - result.columns = result.columns.droplevel(-1) + sobj = self._selected_obj + + if isinstance(sobj, Series): + # GH#35246 test_groupby_as_index_select_column_sum_empty_df + result.columns = self._obj_with_exclusions.columns.copy() + else: + # Retain our column names + result.columns._set_names( + sobj.columns.names, + level=list(range(sobj.columns.nlevels)), + ) + # select everything except for the last level, which is the + # one containing the name of the function(s), see GH#32040 + result.columns = result.columns.droplevel(-1) if not self.as_index: self._insert_inaxis_grouper_inplace(result) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 62983b5327a26..d79317e48bd5b 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -13,6 +13,7 @@ Series, Timestamp, date_range, + get_option, ) import pandas._testing as tm from pandas.tests.frame.common import zip_frames @@ -639,6 +640,8 @@ def test_apply_dup_names_multi_agg(): # GH 21063 df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + if get_option("mode.new_udf_methods"): + expected = expected.T result = df.agg(["min"]) tm.assert_frame_equal(result, expected) @@ -1010,25 +1013,46 @@ def 
test_agg_transform(axis, float_frame): # list-like result = float_frame.apply([np.sqrt], axis=axis) expected = f_sqrt.copy() - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]]) + if get_option("mode.new_udf_methods"): + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [["sqrt"], float_frame.columns] + ) + else: + expected.index = MultiIndex.from_product([["sqrt"], float_frame.index]) else: - expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["sqrt"]] + ) + else: + expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) tm.assert_frame_equal(result, expected) # multiple items in list # these are in the order as if we are applying both # functions per series and then concatting result = float_frame.apply([np.abs, np.sqrt], axis=axis) - expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product( - [float_frame.columns, ["absolute", "sqrt"]] - ) + if get_option("mode.new_udf_methods"): + expected = pd.concat([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [["absolute", "sqrt"], float_frame.columns] + ) + else: + expected.index = MultiIndex.from_product( + [["absolute", "sqrt"], float_frame.index] + ) else: - expected.index = MultiIndex.from_product( - [float_frame.index, ["absolute", "sqrt"]] - ) + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) tm.assert_frame_equal(result, expected) @@ -1040,6 +1064,8 @@ def test_demo(): expected = DataFrame( {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", 
"max"] ) + if get_option("mode.new_udf_methods"): + expected = expected.T tm.assert_frame_equal(result, expected) result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) @@ -1086,18 +1112,29 @@ def test_agg_multiple_mixed_no_warning(): }, index=["min", "sum"], ) + klass, match = None, None + if get_option("mode.new_udf_methods"): + expected = expected.T + klass, match = FutureWarning, "Dropping of nuisance columns" # sorted index - with tm.assert_produces_warning(None): + with tm.assert_produces_warning(klass, match=match, check_stacklevel=False): result = mdf.agg(["min", "sum"]) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(None): + klass, match = None, None + if get_option("mode.new_udf_methods"): + klass, match = FutureWarning, "Dropping of nuisance columns" + + with tm.assert_produces_warning(klass, match=match, check_stacklevel=False): result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) # GH40420: the result of .agg should have an index that is sorted # according to the arguments provided to agg. 
- expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) + if get_option("mode.new_udf_methods"): + expected = expected.loc[["D", "C", "B", "A"], ["sum", "min"]] + else: + expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) tm.assert_frame_equal(result, expected) @@ -1116,6 +1153,8 @@ def test_agg_reduce(axis, float_frame): ) expected.columns = ["mean", "max", "sum"] expected = expected.T if axis in {0, "index"} else expected + if get_option("mode.new_udf_methods"): + expected = expected.T result = float_frame.agg(["mean", "max", "sum"], axis=axis) tm.assert_frame_equal(result, expected) @@ -1192,6 +1231,8 @@ def test_nuiscance_columns(): index=["min"], columns=df.columns, ) + if get_option("mode.new_udf_methods"): + expected = expected.T tm.assert_frame_equal(result, expected) with tm.assert_produces_warning( @@ -1205,6 +1246,8 @@ def test_nuiscance_columns(): expected = DataFrame( [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] ) + if get_option("mode.new_udf_methods"): + expected = expected.T tm.assert_frame_equal(result, expected) @@ -1244,8 +1287,12 @@ def test_non_callable_aggregates(how): } ) - tm.assert_frame_equal(result1, result2, check_like=True) - tm.assert_frame_equal(result2, expected, check_like=True) + if get_option("new_udf_methods"): + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result1, expected.T) + else: + tm.assert_frame_equal(result1, result2, check_like=True) + tm.assert_frame_equal(result2, expected, check_like=True) # Just functional string arg is same as calling df.arg() result = getattr(df, how)("count") @@ -1282,7 +1329,9 @@ def func(group_col): tm.assert_series_equal(result, expected) result = df.agg([func]) - expected = expected.to_frame("func").T + expected = expected.to_frame("func") + if not get_option("mode.new_udf_methods"): + expected = expected.T tm.assert_frame_equal(result, expected) @@ -1395,14 +1444,20 @@ def test_apply_empty_list_reduce(): 
tm.assert_series_equal(result, expected) -def test_apply_no_suffix_index(): +def test_apply_no_suffix_index(request): # GH36189 pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) - result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) - expected = DataFrame( - {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] - ) - + result = pdf.apply([np.square, lambda x: x, lambda x: x]) + if get_option("mode.new_udf_methods"): + columns = MultiIndex.from_product( + [["square", "", ""], ["A", "B"]] + ) + expected = DataFrame(3 * [[16, 81, 4, 9, 4, 9]], columns=columns) + else: + columns = MultiIndex.from_product( + [["A", "B"], ["square", "", ""]] + ) + expected = DataFrame(3 * [[16, 4, 4, 81, 9, 9]], columns=columns) tm.assert_frame_equal(result, expected) @@ -1434,15 +1489,25 @@ def foo(s): aggs = ["sum", foo, "count", "min"] result = df.agg(aggs) - expected = DataFrame( - { - "item": ["123456", np.nan, 6, "1"], - "att1": [21.0, 10.5, 6.0, 1.0], - "att2": [18.0, 9.0, 6.0, 0.0], - "att3": [17.0, 8.5, 6.0, 0.0], - }, - index=["sum", "foo", "count", "min"], - ) + if get_option("mode.new_udf_methods"): + expected = DataFrame( + { + "sum": ["123456", 21, 18, 17], + "count": [6, 6, 6, 6], + "min": ["1", 1, 0, 0], + }, + index=["item", "att1", "att2", "att3"], + ) + else: + expected = DataFrame( + { + "item": ["123456", np.nan, 6, "1"], + "att1": [21.0, 10.5, 6.0, 1.0], + "att2": [18.0, 9.0, 6.0, 0.0], + "att3": [17.0, 8.5, 6.0, 0.0], + }, + index=["sum", "foo", "count", "min"], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 2af340f0c1bb9..513f074c7beb5 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -13,6 +13,7 @@ MultiIndex, Series, concat, + get_option, isna, timedelta_range, ) @@ -254,10 +255,14 @@ def test_transform(string_series): # dict, provide renaming expected = concat([f_sqrt, f_abs], axis=1) 
expected.columns = ["foo", "bar"] - expected = expected.unstack().rename("series") + if not get_option("new_udf_methods"): + expected = expected.unstack().rename("series") result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) - tm.assert_series_equal(result.reindex_like(expected), expected) + if get_option("new_udf_methods"): + tm.assert_frame_equal(result, expected) + else: + tm.assert_series_equal(result.reindex_like(expected), expected) @pytest.mark.parametrize("op", series_transform_kernels) @@ -364,18 +369,32 @@ def test_with_nested_series(datetime_series): def test_replicate_describe(string_series): # this also tests a result set that is all scalars expected = string_series.describe() - result = string_series.apply( - { - "count": "count", - "mean": "mean", - "std": "std", - "min": "min", - "25%": lambda x: x.quantile(0.25), - "50%": "median", - "75%": lambda x: x.quantile(0.75), - "max": "max", - } - ) + if get_option("new_udf_methods"): + result = string_series.agg( + { + "count": "count", + "mean": "mean", + "std": "std", + "min": "min", + "25%": lambda x: x.quantile(0.25), + "50%": "median", + "75%": lambda x: x.quantile(0.75), + "max": "max", + } + ) + else: + result = string_series.apply( + { + "count": "count", + "mean": "mean", + "std": "std", + "min": "min", + "25%": lambda x: x.quantile(0.25), + "50%": "median", + "75%": lambda x: x.quantile(0.75), + "max": "max", + } + ) tm.assert_series_equal(result, expected) @@ -410,10 +429,14 @@ def test_non_callable_aggregates(how): def test_series_apply_no_suffix_index(): # GH36189 s = Series([4] * 3) - result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) - expected = Series([12, 12, 12], index=["sum", "", ""]) - - tm.assert_series_equal(result, expected) + if get_option("new_udf_methods"): + result = s.apply(["sqrt", lambda x: np.sqrt(x), lambda x: np.sqrt(x)]) + expected = DataFrame([[2.0] * 3] * 3, columns=["sqrt", "", ""]) + tm.assert_frame_equal(result, expected) + else: + result = 
s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) + expected = Series([12, 12, 12], index=["sum", "", ""]) + tm.assert_series_equal(result, expected) def test_map(datetime_series): @@ -795,10 +818,19 @@ def test_apply_to_timedelta(): @pytest.mark.parametrize("how", ["agg", "apply"]) def test_apply_listlike_reducer(string_series, ops, names, how): # GH 39140 - expected = Series({name: op(string_series) for name, op in zip(names, ops)}) - expected.name = "series" result = getattr(string_series, how)(ops) - tm.assert_series_equal(result, expected) + if get_option("new_udf_methods"): + if how == "apply": + expected = DataFrame({name: string_series for name, op in zip(names, ops)}) + else: + expected = Series( + {name: op(string_series) for name, op in zip(names, ops)}, name="series" + ) + tm.assert_equal(result, expected) + else: + expected = Series({name: op(string_series) for name, op in zip(names, ops)}) + expected.name = "series" + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -813,10 +845,21 @@ def test_apply_listlike_reducer(string_series, ops, names, how): @pytest.mark.parametrize("how", ["agg", "apply"]) def test_apply_dictlike_reducer(string_series, ops, how): # GH 39140 - expected = Series({name: op(string_series) for name, op in ops.items()}) - expected.name = string_series.name - result = getattr(string_series, how)(ops) - tm.assert_series_equal(result, expected) + if get_option("new_udf_methods"): + if how == "apply": + names = ops.keys() if isinstance(ops, dict) else ops.index + expected = concat([string_series.rename(name) for name in names], axis=1) + else: + expected = Series( + {name: op(string_series) for name, op in ops.items()}, name="series" + ) + result = getattr(string_series, how)(ops) + tm.assert_equal(result, expected) + else: + expected = Series({name: op(string_series) for name, op in ops.items()}) + expected.name = string_series.name + result = getattr(string_series, how)(ops) + tm.assert_series_equal(result, 
expected) @pytest.mark.parametrize( @@ -849,7 +892,14 @@ def test_apply_listlike_transformer(string_series, ops, names): def test_apply_dictlike_transformer(string_series, ops): # GH 39140 with np.errstate(all="ignore"): - expected = concat({name: op(string_series) for name, op in ops.items()}) - expected.name = string_series.name - result = string_series.apply(ops) - tm.assert_series_equal(result, expected) + if get_option("new_udf_methods"): + expected = concat( + {name: op(string_series) for name, op in ops.items()}, axis=1 + ) + result = string_series.apply(ops) + tm.assert_frame_equal(result, expected) + else: + expected = concat({name: op(string_series) for name, op in ops.items()}) + expected.name = string_series.name + result = string_series.apply(ops) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4bda0e6ef9872..78194a806f456 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -20,6 +20,7 @@ MultiIndex, Series, concat, + get_option, to_datetime, ) import pandas._testing as tm @@ -499,12 +500,18 @@ def test_order_aggregate_multiple_funcs(): # GH 25692 df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) - res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) - result = res.columns.levels[1] + if get_option("new_udf_methods"): + # TODO (GH 35725): This will not raise when agg-must-agg is implemented + msg = "Cannot concat indices that do not have the same number of levels" + with pytest.raises(AssertionError, match=msg): + df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + else: + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + result = res.columns.levels[1] - expected = Index(["sum", "max", "mean", "ohlc", "min"]) + expected = Index(["sum", "max", "mean", "ohlc", "min"]) - tm.assert_index_equal(result, expected) + 
tm.assert_index_equal(result, expected) @pytest.mark.parametrize("dtype", [np.int64, np.uint64]) @@ -1207,7 +1214,10 @@ def test_nonagg_agg(): g = df.groupby("a") result = g.agg(["cumsum"]) - result.columns = result.columns.droplevel(-1) + if get_option("new_udf_methods"): + result.columns = result.columns.droplevel(0) + else: + result.columns = result.columns.droplevel(-1) expected = g.agg("cumsum") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 79990deed261d..d34538a4f5935 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import get_option + import pandas.util._test_decorators as td import pandas as pd @@ -201,13 +203,21 @@ def test_aggregate_api_consistency(): tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) - expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) - expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) + if get_option("new_udf_methods"): + expected = pd.concat([c_sum, d_sum, c_mean, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["sum", "mean"], ["C", "D"]]) + else: + expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped[["D", "C"]].agg([np.sum, np.mean]) - expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) - expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) + if get_option("new_udf_methods"): + expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) + expected.columns = MultiIndex.from_product([["sum", "mean"], ["D", "C"]]) + else: + expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) + expected.columns = 
MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({"C": "mean", "D": "sum"}) @@ -393,7 +403,10 @@ def P1(a): g = df.groupby("date") expected = g.agg([P1]) - expected.columns = expected.columns.levels[0] + if get_option("new_udf_methods"): + expected.columns = expected.columns.levels[1] + else: + expected.columns = expected.columns.levels[0] result = g.agg(P1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3ae11847cc06b..caa04d7994223 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -14,6 +14,7 @@ Series, Timestamp, date_range, + get_option, ) import pandas._testing as tm import pandas.core.nanops as nanops @@ -1138,7 +1139,10 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) result = groups.agg([function]) - expected.columns = MultiIndex.from_tuples([("b", function)]) + if get_option("new_udf_methods"): + expected.columns = MultiIndex.from_tuples([(function, "b")]) + else: + expected.columns = MultiIndex.from_tuples([("b", function)]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b9a6730996a02..1cf36ddbb1772 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -20,6 +20,7 @@ Timedelta, Timestamp, date_range, + get_option, read_csv, to_datetime, ) @@ -584,11 +585,18 @@ def test_frame_multi_key_function_list(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] agged = grouped.agg(funcs) - expected = pd.concat( - [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], - keys=["D", "E", "F"], - axis=1, - ) + if get_option("new_udf_methods"): + expected = pd.concat( + [grouped.agg(funcs[0]), grouped.agg(funcs[1])], + keys=["mean", 
"std"], + axis=1, + ) + else: + expected = pd.concat( + [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], + keys=["D", "E", "F"], + axis=1, + ) assert isinstance(agged.index, MultiIndex) assert isinstance(expected.index, MultiIndex) tm.assert_frame_equal(agged, expected) @@ -1985,9 +1993,14 @@ def test_groupby_agg_ohlc_non_first(): index=date_range("2018-01-01", periods=2, freq="D", name="dti"), ) - result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) - - tm.assert_frame_equal(result, expected) + if get_option("new_udf_methods"): + # TODO (GH 35725): This will not raise when agg-must-agg is implemented + msg = "Cannot concat indices that do not have the same number of levels" + with pytest.raises(AssertionError, match=msg): + df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) + else: + result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) + tm.assert_frame_equal(result, expected) def test_groupby_multiindex_nat(): diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 359c3cea62f9c..ff3cb8d873bb9 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -10,6 +10,7 @@ from pandas import ( DataFrame, Series, + get_option, ) import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -97,7 +98,10 @@ def test_resample_loffset_arg_type(frame, create_index, arg): result_agg = df.resample("2D", loffset="2H").agg(arg) if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + if get_option("new_udf_methods"): + expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) + else: + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) @@ -216,7 +220,10 @@ def test_loffset_returns_datetimeindex(frame, kind, agg_arg): with tm.assert_produces_warning(FutureWarning): result_agg = df.resample("2D", loffset="2H", 
kind=kind).agg(agg_arg) if isinstance(agg_arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + if get_option("new_udf_methods"): + expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) + else: + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 3b3bd402e4cc7..d73a99ab80d0d 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -8,6 +8,7 @@ DataFrame, NamedAgg, Series, + get_option, ) import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -347,15 +348,14 @@ def test_agg(): b_std = r["B"].std() b_sum = r["B"].sum() - expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + if get_option("new_udf_methods"): + expected = pd.concat([a_mean, b_mean, a_std, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["mean", "std"], ["A", "B"]]) + else: + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: - warn = FutureWarning if t in cases[1:3] else None - with tm.assert_produces_warning( - warn, match="Dropping invalid columns", check_stacklevel=False - ): - # .var on dt64 column raises and is dropped - result = t.aggregate([np.mean, np.std]) + result = t.aggregate([np.mean, np.std]) tm.assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) @@ -628,11 +628,22 @@ def test_agg_with_datetime_index_list_agg_func(col_name): columns=[col_name], ) result = df.resample("1d").aggregate(["mean"]) - expected = DataFrame( - [47.5, 143.5, 195.5], - index=date_range(start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin"), - columns=pd.MultiIndex(levels=[[col_name], 
["mean"]], codes=[[0], [0]]), - ) + if get_option("new_udf_methods"): + expected = DataFrame( + [47.5, 143.5, 195.5], + index=date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[["mean"], [col_name]], codes=[[0], [0]]), + ) + else: + expected = DataFrame( + [47.5, 143.5, 195.5], + index=date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 88607f4b036a0..4c13c9733cf68 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import get_option + import pandas as pd from pandas import ( Categorical, @@ -1905,8 +1907,14 @@ def test_pivot_margins_name_unicode(self): frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek ) index = Index([1, 2, 3, greek], dtype="object", name="foo") - expected = DataFrame(index=index) - tm.assert_frame_equal(table, expected) + + if get_option("new_udf_methods"): + expected = Series([1, 1, 1, 3], index=index) + expected.index.name = None + tm.assert_series_equal(table, expected) + else: + expected = DataFrame(index=index) + tm.assert_frame_equal(table, expected) def test_pivot_string_as_func(self): # GH #18713 diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 5cc22249c26f0..0089f092dd439 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -10,6 +10,7 @@ Timestamp, concat, date_range, + get_option, timedelta_range, ) import pandas._testing as tm @@ -90,8 +91,12 @@ def test_agg(): b_std = r["B"].std() result = r.aggregate([np.mean, np.std]) - expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]]) 
+ if get_option("new_udf_methods"): + expected = concat([a_mean, b_mean, a_std, b_std], axis=1) + expected.columns = MultiIndex.from_product([["mean", "std"], ["A", "B"]]) + else: + expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]]) tm.assert_frame_equal(result, expected) result = r.aggregate({"A": np.mean, "B": np.std}) @@ -147,7 +152,10 @@ def test_agg_consistency(): r = df.rolling(window=3) result = r.agg([np.sum, np.mean]).columns - expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) + if get_option("new_udf_methods"): + expected = MultiIndex.from_product([["sum", "mean"], list("AB")]) + else: + expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) tm.assert_index_equal(result, expected) result = r["A"].agg([np.sum, np.mean]).columns