pandas-dev · mroeschke · Jan 13, 2024 · Jan 13, 2024
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -160,6 +160,33 @@ def test_agg_apply_corner(ts, tsframe):
     tm.assert_frame_equal(res, exp_df)
 
 
+def test_with_na_groups(any_real_numpy_dtype):
+    index = Index(np.arange(10))
+    values = Series(np.ones(10), index, dtype=any_real_numpy_dtype)
+    labels = Series(
+        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
+        index=index,
+    )
+
+    # this SHOULD be an int
+    grouped = values.groupby(labels)
+    agged = grouped.agg(len)
+    expected = Series([4, 2], index=["bar", "foo"])
+
+    tm.assert_series_equal(agged, expected, check_dtype=False)
+
+    # assert issubclass(agged.dtype.type, np.integer)
+
+    # explicitly return a float from my function
+    def f(x):
+        return float(len(x))
+
+    agged = grouped.agg(f)
+    expected = Series([4.0, 2.0], index=["bar", "foo"])
+
+    tm.assert_series_equal(agged, expected)
+
+
 def test_agg_grouping_is_list_tuple(ts):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((30, 4)),
@@ -1049,6 +1076,73 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
     tm.assert_frame_equal(result, expected)
 
 
+def test_groupby_as_index_agg(df):
+    grouped = df.groupby("A", as_index=False)
+
+    # single-key
+
+    result = grouped[["C", "D"]].agg("mean")
+    expected = grouped.mean(numeric_only=True)
+    tm.assert_frame_equal(result, expected)
+
+    result2 = grouped.agg({"C": "mean", "D": "sum"})
+    expected2 = grouped.mean(numeric_only=True)
+    expected2["D"] = grouped.sum()["D"]
+    tm.assert_frame_equal(result2, expected2)
+
+    grouped = df.groupby("A", as_index=True)
+
+    msg = r"nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        grouped["C"].agg({"Q": "sum"})
+
+    # multi-key
+
+    grouped = df.groupby(["A", "B"], as_index=False)
+
+    result = grouped.agg("mean")
+    expected = grouped.mean()
+    tm.assert_frame_equal(result, expected)
+
+    result2 = grouped.agg({"C": "mean", "D": "sum"})
+    expected2 = grouped.mean()
+    expected2["D"] = grouped.sum()["D"]
+    tm.assert_frame_equal(result2, expected2)
+
+    expected3 = grouped["C"].sum()
+    expected3 = DataFrame(expected3).rename(columns={"C": "Q"})
+    msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result3 = grouped["C"].agg({"Q": "sum"})
+    tm.assert_frame_equal(result3, expected3)
+
+    # GH7115 & GH8112 & GH8582
+    df = DataFrame(
+        np.random.default_rng(2).integers(0, 100, (50, 3)),
+        columns=["jim", "joe", "jolie"],
+    )
+    ts = Series(np.random.default_rng(2).integers(5, 10, 50), name="jim")
+
+    gr = df.groupby(ts)
+    gr.nth(0)  # invokes set_selection_from_grouper internally
+
+    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
+        res = gr.apply(sum)
+    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
+        alt = df.groupby(ts).apply(sum)
+    tm.assert_frame_equal(res, alt)
+
+    for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
+        gr = df.groupby(ts, as_index=False)
+        left = getattr(gr, attr)()
+
+        gr = df.groupby(ts.values, as_index=True)
+        right = getattr(gr, attr)().reset_index(drop=True)
+
+        tm.assert_frame_equal(left, right)
+
+
 @pytest.mark.parametrize(
     "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)]
 )
@@ -1252,6 +1346,28 @@ def test_agg_multiple_lambda(self):
         tm.assert_frame_equal(result2, expected)
 
 
+def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
+    # go through _aggregate_frame with self.axis == 0 and duplicate columns
+    tsframe.columns = ["A", "B", "A", "C"]
+    gb = tsframe.groupby(lambda x: x.month, as_index=as_index)
+
+    warn = None if as_index else FutureWarning
+    msg = "A grouping .* was excluded from the result"
+    with tm.assert_produces_warning(warn, match=msg):
+        res = gb.agg(np.percentile, 80, axis=0)
+
+    ex_data = {
+        1: tsframe[tsframe.index.month == 1].quantile(0.8),
+        2: tsframe[tsframe.index.month == 2].quantile(0.8),
+    }
+    expected = DataFrame(ex_data).T
+    if not as_index:
+        # TODO: try to get this more consistent?
+        expected.index = Index(range(2))
+
+    tm.assert_frame_equal(res, expected)
+
+
 def test_groupby_get_by_index():
     # GH 33439
     df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -1602,3 +1602,75 @@ def test_builtins_apply(keys, f):
         tm.assert_frame_equal(result, expected, check_dtype=False)
 
     tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))
+
+
+def test_inconsistent_return_type():
+    # GH5592
+    # inconsistent return type
+    df = DataFrame(
+        {
+            "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
+            "B": Series(np.arange(7), dtype="int64"),
+            "C": pd.date_range("20130101", periods=7),
+        }
+    )
+
+    def f_0(grp):
+        return grp.iloc[0]
+
+    expected = df.groupby("A").first()[["B"]]
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_0)[["B"]]
+    tm.assert_frame_equal(result, expected)
+
+    def f_1(grp):
+        if grp.name == "Tiger":
+            return None
+        return grp.iloc[0]
+
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_1)[["B"]]
+    e = expected.copy()
+    e.loc["Tiger"] = np.nan
+    tm.assert_frame_equal(result, e)
+
+    def f_2(grp):
+        if grp.name == "Pony":
+            return None
+        return grp.iloc[0]
+
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_2)[["B"]]
+    e = expected.copy()
+    e.loc["Pony"] = np.nan
+    tm.assert_frame_equal(result, e)
+
+    # 5592 revisited, with datetimes
+    def f_3(grp):
+        if grp.name == "Pony":
+            return None
+        return grp.iloc[0]
+
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_3)[["C"]]
+    e = df.groupby("A").first()[["C"]]
+    e.loc["Pony"] = pd.NaT
+    tm.assert_frame_equal(result, e)
+
+    # scalar outputs
+    def f_4(grp):
+        if grp.name == "Pony":
+            return None
+        return grp.iloc[0].loc["C"]
+
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").apply(f_4)
+    e = df.groupby("A").first()["C"].copy()
+    e.loc["Pony"] = np.nan
+    e.name = None
+    tm.assert_series_equal(result, e)