diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 8198cc532d998..86f03b04fddb3 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -160,6 +160,33 @@ def test_agg_apply_corner(ts, tsframe):
     tm.assert_frame_equal(res, exp_df)
 
 
+def test_with_na_groups(any_real_numpy_dtype):
+    index = Index(np.arange(10))
+    values = Series(np.ones(10), index, dtype=any_real_numpy_dtype)
+    labels = Series(
+        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
+        index=index,
+    )
+
+    # this SHOULD be an int
+    grouped = values.groupby(labels)
+    agged = grouped.agg(len)
+    expected = Series([4, 2], index=["bar", "foo"])
+
+    tm.assert_series_equal(agged, expected, check_dtype=False)
+
+    # assert issubclass(agged.dtype.type, np.integer)
+
+    # explicitly return a float from my function
+    def f(x):
+        return float(len(x))
+
+    agged = grouped.agg(f)
+    expected = Series([4.0, 2.0], index=["bar", "foo"])
+
+    tm.assert_series_equal(agged, expected)
+
+
 def test_agg_grouping_is_list_tuple(ts):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((30, 4)),
@@ -1049,6 +1076,73 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
     tm.assert_frame_equal(result, expected)
 
 
+def test_groupby_as_index_agg(df):
+    grouped = df.groupby("A", as_index=False)
+
+    # single-key
+
+    result = grouped[["C", "D"]].agg("mean")
+    expected = grouped.mean(numeric_only=True)
+    tm.assert_frame_equal(result, expected)
+
+    result2 = grouped.agg({"C": "mean", "D": "sum"})
+    expected2 = grouped.mean(numeric_only=True)
+    expected2["D"] = grouped.sum()["D"]
+    tm.assert_frame_equal(result2, expected2)
+
+    grouped = df.groupby("A", as_index=True)
+
+    msg = r"nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        grouped["C"].agg({"Q": "sum"})
+
+    # multi-key
+
+    grouped = df.groupby(["A", "B"], as_index=False)
+
+    result = grouped.agg("mean")
+    expected = grouped.mean()
+    tm.assert_frame_equal(result, expected)
+
+    result2 = grouped.agg({"C": "mean", "D": "sum"})
+    expected2 = grouped.mean()
+    expected2["D"] = grouped.sum()["D"]
+    tm.assert_frame_equal(result2, expected2)
+
+    expected3 = grouped["C"].sum()
+    expected3 = DataFrame(expected3).rename(columns={"C": "Q"})
+    msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result3 = grouped["C"].agg({"Q": "sum"})
+    tm.assert_frame_equal(result3, expected3)
+
+    # GH7115 & GH8112 & GH8582
+    df = DataFrame(
+        np.random.default_rng(2).integers(0, 100, (50, 3)),
+        columns=["jim", "joe", "jolie"],
+    )
+    ts = Series(np.random.default_rng(2).integers(5, 10, 50), name="jim")
+
+    gr = df.groupby(ts)
+    gr.nth(0)  # invokes set_selection_from_grouper internally
+
+    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
+        res = gr.apply(sum)
+    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
+        alt = df.groupby(ts).apply(sum)
+    tm.assert_frame_equal(res, alt)
+
+    for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
+        gr = df.groupby(ts, as_index=False)
+        left = getattr(gr, attr)()
+
+        gr = df.groupby(ts.values, as_index=True)
+        right = getattr(gr, attr)().reset_index(drop=True)
+
+        tm.assert_frame_equal(left, right)
+
+
 @pytest.mark.parametrize(
     "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)]
 )
@@ -1252,6 +1346,28 @@ def test_agg_multiple_lambda(self):
         tm.assert_frame_equal(result2, expected)
 
 
+def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
+    # go through _aggregate_frame with self.axis == 0 and duplicate columns
+    tsframe.columns = ["A", "B", "A", "C"]
+    gb = tsframe.groupby(lambda x: x.month, as_index=as_index)
+
+    warn = None if as_index else FutureWarning
+    msg = "A grouping .* was excluded from the result"
+    with tm.assert_produces_warning(warn, match=msg):
+        res = gb.agg(np.percentile, 80, axis=0)
+
+    ex_data = {
+        1: tsframe[tsframe.index.month == 1].quantile(0.8),
+        2: tsframe[tsframe.index.month == 2].quantile(0.8),
+    }
+    expected = DataFrame(ex_data).T
+    if not as_index:
+        # TODO: try to get this more consistent?
+        expected.index = Index(range(2))
+
+    tm.assert_frame_equal(res, expected)
+
+
 def test_groupby_get_by_index():
     # GH 33439
     df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index f4b228eb5b326..5de98156b44e1 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -1602,3 +1602,75 @@ def test_builtins_apply(keys, f):
     tm.assert_frame_equal(result, expected, check_dtype=False)
 
     tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))
+
+
+def test_inconsistent_return_type():
+    # GH5592
+    # inconsistent return type
+    df = DataFrame(
+        {
+            "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
+            "B": Series(np.arange(7), dtype="int64"),
+            "C": pd.date_range("20130101", periods=7),
+        }
+    )
+
+    def f_0(grp):
+        return grp.iloc[0]
+
+    expected = df.groupby("A").first()[["B"]]
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_0)[["B"]]
+    tm.assert_frame_equal(result, expected)
+
+    def f_1(grp):
+        if grp.name == "Tiger":
+            return None
+        return grp.iloc[0]
+
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_1)[["B"]]
+    e = expected.copy()
+    e.loc["Tiger"] = np.nan
+    tm.assert_frame_equal(result, e)
+
+    def f_2(grp):
+        if grp.name == "Pony":
+            return None
+        return grp.iloc[0]
+
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_2)[["B"]]
+    e = expected.copy()
+    e.loc["Pony"] = np.nan
+    tm.assert_frame_equal(result, e)
+
+    # 5592 revisited, with datetimes
+    def f_3(grp):
+        if grp.name == "Pony":
+            return None
+        return grp.iloc[0]
+
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_3)[["C"]]
+    e = df.groupby("A").first()[["C"]]
+    e.loc["Pony"] = pd.NaT
+    tm.assert_frame_equal(result, e)
+
+    # scalar outputs
+    def f_4(grp):
+        if grp.name == "Pony":
+            return None
+        return grp.iloc[0].loc["C"]
+
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_4)
+    e = df.groupby("A").first()["C"].copy()
+    e.loc["Pony"] = np.nan
+    e.name = None
+    tm.assert_series_equal(result, e)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 14c5c21d41772..8750dd18b3db4 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -43,99 +43,6 @@ def test_repr():
     assert result == expected
 
 
-def test_groupby_std_datetimelike(warn_copy_on_write):
-    # GH#48481
-    tdi = pd.timedelta_range("1 Day", periods=10000)
-    ser = Series(tdi)
-    ser[::5] *= 2  # get different std for different groups
-
-    df = ser.to_frame("A").copy()
-
-    df["B"] = ser + Timestamp(0)
-    df["C"] = ser + Timestamp(0, tz="UTC")
-    df.iloc[-1] = pd.NaT  # last group includes NaTs
-
-    gb = df.groupby(list(range(5)) * 2000)
-
-    result = gb.std()
-
-    # Note: this does not _exactly_ match what we would get if we did
-    # [gb.get_group(i).std() for i in gb.groups]
-    # but it _does_ match the floating point error we get doing the
-    # same operation on int64 data xref GH#51332
-    td1 = Timedelta("2887 days 11:21:02.326710176")
-    td4 = Timedelta("2886 days 00:42:34.664668096")
-    exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5))
-    expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
-def test_basic_aggregations(dtype):
-    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
-
-    index = np.arange(9)
-    np.random.default_rng(2).shuffle(index)
-    data = data.reindex(index)
-
-    grouped = data.groupby(lambda x: x // 3, group_keys=False)
-
-    for k, v in grouped:
-        assert len(v) == 3
-
-    msg = "using SeriesGroupBy.mean"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        agged = grouped.aggregate(np.mean)
-    assert agged[1] == 1
-
-    msg = "using SeriesGroupBy.mean"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        expected = grouped.agg(np.mean)
-    tm.assert_series_equal(agged, expected)  # shorthand
-    tm.assert_series_equal(agged, grouped.mean())
-    result = grouped.sum()
-    msg = "using SeriesGroupBy.sum"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        expected = grouped.agg(np.sum)
-    tm.assert_series_equal(result, expected)
-
-    expected = grouped.apply(lambda x: x * x.sum())
-    transformed = grouped.transform(lambda x: x * x.sum())
-    assert transformed[7] == 12
-    tm.assert_series_equal(transformed, expected)
-
-    value_grouped = data.groupby(data)
-    msg = "using SeriesGroupBy.mean"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = value_grouped.aggregate(np.mean)
-    tm.assert_series_equal(result, agged, check_index_type=False)
-
-    # complex agg
-    msg = "using SeriesGroupBy.[mean|std]"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        agged = grouped.aggregate([np.mean, np.std])
-
-    msg = r"nested renamer is not supported"
-    with pytest.raises(SpecificationError, match=msg):
-        grouped.aggregate({"one": np.mean, "two": np.std})
-
-    group_constants = {0: 10, 1: 20, 2: 30}
-    msg = (
-        "Pinning the groupby key to each group in SeriesGroupBy.agg is deprecated, "
-        "and cases that relied on it will raise in a future version"
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        # GH#41090
-        agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
-    assert agged[1] == 21
-
-    # corner cases
-    msg = "Must produce aggregated value"
-    # exception raised is type Exception
-    with pytest.raises(Exception, match=msg):
-        grouped.aggregate(lambda x: x * 2)
-
-
 def test_groupby_nonobject_dtype(multiindex_dataframe_random_data):
     key = multiindex_dataframe_random_data.index.codes[0]
     grouped = multiindex_dataframe_random_data.groupby(key)
@@ -170,78 +77,6 @@ def max_value(group):
     tm.assert_series_equal(result, expected)
 
 
-def test_inconsistent_return_type():
-    # GH5592
-    # inconsistent return type
-    df = DataFrame(
-        {
-            "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
-            "B": Series(np.arange(7), dtype="int64"),
-            "C": date_range("20130101", periods=7),
-        }
-    )
-
-    def f_0(grp):
-        return grp.iloc[0]
-
-    expected = df.groupby("A").first()[["B"]]
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.groupby("A").apply(f_0)[["B"]]
-    tm.assert_frame_equal(result, expected)
-
-    def f_1(grp):
-        if grp.name == "Tiger":
-            return None
-        return grp.iloc[0]
-
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.groupby("A").apply(f_1)[["B"]]
-    e = expected.copy()
-    e.loc["Tiger"] = np.nan
-    tm.assert_frame_equal(result, e)
-
-    def f_2(grp):
-        if grp.name == "Pony":
-            return None
-        return grp.iloc[0]
-
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.groupby("A").apply(f_2)[["B"]]
-    e = expected.copy()
-    e.loc["Pony"] = np.nan
-    tm.assert_frame_equal(result, e)
-
-    # 5592 revisited, with datetimes
-    def f_3(grp):
-        if grp.name == "Pony":
-            return None
-        return grp.iloc[0]
-
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.groupby("A").apply(f_3)[["C"]]
-    e = df.groupby("A").first()[["C"]]
-    e.loc["Pony"] = pd.NaT
-    tm.assert_frame_equal(result, e)
-
-    # scalar outputs
-    def f_4(grp):
-        if grp.name == "Pony":
-            return None
-        return grp.iloc[0].loc["C"]
-
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.groupby("A").apply(f_4)
-    e = df.groupby("A").first()["C"].copy()
-    e.loc["Pony"] = np.nan
-    e.name = None
-    tm.assert_series_equal(result, e)
-
-
 def test_pass_args_kwargs(ts, tsframe):
     def f(x, q=None, axis=0):
         return np.percentile(x, q, axis=axis)
@@ -295,28 +130,6 @@ def f(x, q=None, axis=0):
     tm.assert_frame_equal(apply_result, expected, check_names=False)
 
 
-def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
-    # go through _aggregate_frame with self.axis == 0 and duplicate columns
-    tsframe.columns = ["A", "B", "A", "C"]
-    gb = tsframe.groupby(lambda x: x.month, as_index=as_index)
-
-    warn = None if as_index else FutureWarning
-    msg = "A grouping .* was excluded from the result"
-    with tm.assert_produces_warning(warn, match=msg):
-        res = gb.agg(np.percentile, 80, axis=0)
-
-    ex_data = {
-        1: tsframe[tsframe.index.month == 1].quantile(0.8),
-        2: tsframe[tsframe.index.month == 2].quantile(0.8),
-    }
-    expected = DataFrame(ex_data).T
-    if not as_index:
-        # TODO: try to get this more consistent?
-        expected.index = Index(range(2))
-
-    tm.assert_frame_equal(res, expected)
-
-
 def test_len():
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
@@ -350,33 +163,6 @@ def test_basic_regression():
     grouped.mean()
 
 
-def test_with_na_groups(any_real_numpy_dtype):
-    index = Index(np.arange(10))
-    values = Series(np.ones(10), index, dtype=any_real_numpy_dtype)
-    labels = Series(
-        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
-        index=index,
-    )
-
-    # this SHOULD be an int
-    grouped = values.groupby(labels)
-    agged = grouped.agg(len)
-    expected = Series([4, 2], index=["bar", "foo"])
-
-    tm.assert_series_equal(agged, expected, check_dtype=False)
-
-    # assert issubclass(agged.dtype.type, np.integer)
-
-    # explicitly return a float from my function
-    def f(x):
-        return float(len(x))
-
-    agged = grouped.agg(f)
-    expected = Series([4.0, 2.0], index=["bar", "foo"])
-
-    tm.assert_series_equal(agged, expected)
-
-
 def test_indices_concatenation_order():
     # GH 2808
 
@@ -761,73 +547,6 @@ def test_groupby_as_index_select_column_sum_empty_df():
     tm.assert_frame_equal(left, expected)
 
 
-def test_groupby_as_index_agg(df):
-    grouped = df.groupby("A", as_index=False)
-
-    # single-key
-
-    result = grouped[["C", "D"]].agg("mean")
-    expected = grouped.mean(numeric_only=True)
-    tm.assert_frame_equal(result, expected)
-
-    result2 = grouped.agg({"C": "mean", "D": "sum"})
-    expected2 = grouped.mean(numeric_only=True)
-    expected2["D"] = grouped.sum()["D"]
-    tm.assert_frame_equal(result2, expected2)
-
-    grouped = df.groupby("A", as_index=True)
-
-    msg = r"nested renamer is not supported"
-    with pytest.raises(SpecificationError, match=msg):
-        grouped["C"].agg({"Q": "sum"})
-
-    # multi-key
-
-    grouped = df.groupby(["A", "B"], as_index=False)
-
-    result = grouped.agg("mean")
-    expected = grouped.mean()
-    tm.assert_frame_equal(result, expected)
-
-    result2 = grouped.agg({"C": "mean", "D": "sum"})
-    expected2 = grouped.mean()
-    expected2["D"] = grouped.sum()["D"]
-    tm.assert_frame_equal(result2, expected2)
-
-    expected3 = grouped["C"].sum()
-    expected3 = DataFrame(expected3).rename(columns={"C": "Q"})
-    msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result3 = grouped["C"].agg({"Q": "sum"})
-    tm.assert_frame_equal(result3, expected3)
-
-    # GH7115 & GH8112 & GH8582
-    df = DataFrame(
-        np.random.default_rng(2).integers(0, 100, (50, 3)),
-        columns=["jim", "joe", "jolie"],
-    )
-    ts = Series(np.random.default_rng(2).integers(5, 10, 50), name="jim")
-
-    gr = df.groupby(ts)
-    gr.nth(0)  # invokes set_selection_from_grouper internally
-
-    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        res = gr.apply(sum)
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        alt = df.groupby(ts).apply(sum)
-    tm.assert_frame_equal(res, alt)
-
-    for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
-        gr = df.groupby(ts, as_index=False)
-        left = getattr(gr, attr)()
-
-        gr = df.groupby(ts.values, as_index=True)
-        right = getattr(gr, attr)().reset_index(drop=True)
-
-        tm.assert_frame_equal(left, right)
-
-
 def test_ops_not_as_index(reduction_func):
     # GH 10355, 21090
     # Using as_index=False should not modify grouped column
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index 273734e84d9aa..7530c9ca78cbc 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -20,6 +20,72 @@
 from pandas.util import _test_decorators as td
 
 
+@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
+def test_basic_aggregations(dtype):
+    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
+
+    index = np.arange(9)
+    np.random.default_rng(2).shuffle(index)
+    data = data.reindex(index)
+
+    grouped = data.groupby(lambda x: x // 3, group_keys=False)
+
+    for k, v in grouped:
+        assert len(v) == 3
+
+    msg = "using SeriesGroupBy.mean"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        agged = grouped.aggregate(np.mean)
+    assert agged[1] == 1
+
+    msg = "using SeriesGroupBy.mean"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        expected = grouped.agg(np.mean)
+    tm.assert_series_equal(agged, expected)  # shorthand
+    tm.assert_series_equal(agged, grouped.mean())
+    result = grouped.sum()
+    msg = "using SeriesGroupBy.sum"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        expected = grouped.agg(np.sum)
+    tm.assert_series_equal(result, expected)
+
+    expected = grouped.apply(lambda x: x * x.sum())
+    transformed = grouped.transform(lambda x: x * x.sum())
+    assert transformed[7] == 12
+    tm.assert_series_equal(transformed, expected)
+
+    value_grouped = data.groupby(data)
+    msg = "using SeriesGroupBy.mean"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = value_grouped.aggregate(np.mean)
+    tm.assert_series_equal(result, agged, check_index_type=False)
+
+    # complex agg
+    msg = "using SeriesGroupBy.[mean|std]"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        agged = grouped.aggregate([np.mean, np.std])
+
+    msg = r"nested renamer is not supported"
+    with pytest.raises(pd.errors.SpecificationError, match=msg):
+        grouped.aggregate({"one": np.mean, "two": np.std})
+
+    group_constants = {0: 10, 1: 20, 2: 30}
+    msg = (
+        "Pinning the groupby key to each group in SeriesGroupBy.agg is deprecated, "
+        "and cases that relied on it will raise in a future version"
+    )
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        # GH#41090
+        agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
+    assert agged[1] == 21
+
+    # corner cases
+    msg = "Must produce aggregated value"
+    # exception raised is type Exception
+    with pytest.raises(Exception, match=msg):
+        grouped.aggregate(lambda x: x * 2)
+
+
 @pytest.mark.parametrize(
     "vals",
     [
@@ -1071,3 +1137,30 @@ def test_groupby_prod_with_int64_dtype():
     result = df.groupby(["A"]).prod().reset_index()
     expected = DataFrame({"A": [1], "B": [180970905912331920]}, dtype="int64")
     tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_std_datetimelike(warn_copy_on_write):
+    # GH#48481
+    tdi = pd.timedelta_range("1 Day", periods=10000)
+    ser = Series(tdi)
+    ser[::5] *= 2  # get different std for different groups
+
+    df = ser.to_frame("A").copy()
+
+    df["B"] = ser + Timestamp(0)
+    df["C"] = ser + Timestamp(0, tz="UTC")
+    df.iloc[-1] = pd.NaT  # last group includes NaTs
+
+    gb = df.groupby(list(range(5)) * 2000)
+
+    result = gb.std()
+
+    # Note: this does not _exactly_ match what we would get if we did
+    # [gb.get_group(i).std() for i in gb.groups]
+    # but it _does_ match the floating point error we get doing the
+    # same operation on int64 data xref GH#51332
+    td1 = pd.Timedelta("2887 days 11:21:02.326710176")
+    td4 = pd.Timedelta("2886 days 00:42:34.664668096")
+    exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5))
+    expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
+    tm.assert_frame_equal(result, expected)