Skip to content

Commit 0babe10

Browse files
authored
BUG: DataFrameGroupby std/sem modify grouped column when as_index=False (#33630)
1 parent 4f4282f commit 0babe10

File tree

3 files changed

+49
-5
lines changed

3 files changed

+49
-5
lines changed

doc/source/whatsnew/v1.1.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -819,7 +819,7 @@ Groupby/resample/rolling
819819
- Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`)
820820
- Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`)
821821
- Bug in :meth:`GroupBy.rolling.apply` where the ``args`` and ``kwargs`` parameters were ignored (:issue:`33433`)
822-
822+
- Bug in :meth:`DataFrameGroupBy.std` and :meth:`DataFrameGroupBy.sem` where the grouped-by columns would be modified when ``as_index=False`` (:issue:`10355`)
823823

824824
Reshaping
825825
^^^^^^^^^

pandas/core/groupby/groupby.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -666,11 +666,11 @@ def _set_group_selection(self):
666666
):
667667
return
668668

669-
ax = self.obj._info_axis
670669
groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]
671670

672671
if len(groupers):
673672
# GH12839 clear selected obj cache when group selection changes
673+
ax = self.obj._info_axis
674674
self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
675675
self._reset_cache("_selected_obj")
676676

@@ -1416,8 +1416,18 @@ def std(self, ddof: int = 1):
14161416
Series or DataFrame
14171417
Standard deviation of values within each group.
14181418
"""
1419-
# TODO: implement at Cython level?
1420-
return np.sqrt(self.var(ddof=ddof))
1419+
result = self.var(ddof=ddof)
1420+
if result.ndim == 1:
1421+
result = np.sqrt(result)
1422+
else:
1423+
cols = result.columns.get_indexer_for(
1424+
result.columns.difference(self.exclusions).unique()
1425+
)
1426+
# TODO(GH-22046) - setting with iloc broken if labels are not unique
1427+
# .values to remove labels
1428+
result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values
1429+
1430+
return result
14211431

14221432
@Substitution(name="groupby")
14231433
@Appender(_common_see_also)
@@ -1464,7 +1474,19 @@ def sem(self, ddof: int = 1):
14641474
Series or DataFrame
14651475
Standard error of the mean of values within each group.
14661476
"""
1467-
return self.std(ddof=ddof) / np.sqrt(self.count())
1477+
result = self.std(ddof=ddof)
1478+
if result.ndim == 1:
1479+
result /= np.sqrt(self.count())
1480+
else:
1481+
cols = result.columns.get_indexer_for(
1482+
result.columns.difference(self.exclusions).unique()
1483+
)
1484+
# TODO(GH-22046) - setting with iloc broken if labels are not unique
1485+
# .values to remove labels
1486+
result.iloc[:, cols] = (
1487+
result.iloc[:, cols].values / np.sqrt(self.count().iloc[:, cols]).values
1488+
)
1489+
return result
14681490

14691491
@Substitution(name="groupby")
14701492
@Appender(_common_see_also)

pandas/tests/groupby/test_function.py

+22
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,28 @@ def test_ops_general(op, targop):
573573
tm.assert_frame_equal(result, expected)
574574

575575

576+
def test_ops_not_as_index(reduction_func):
577+
# GH 10355
578+
# Using as_index=False should not modify grouped column
579+
580+
if reduction_func in ("nth", "ngroup", "size",):
581+
pytest.skip("Skip until behavior is determined (GH #5755)")
582+
583+
if reduction_func in ("corrwith", "idxmax", "idxmin", "mad", "nunique", "skew",):
584+
pytest.xfail(
585+
"_GroupBy._python_apply_general incorrectly modifies grouping columns"
586+
)
587+
588+
df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
589+
expected = getattr(df.groupby("a"), reduction_func)().reset_index()
590+
591+
result = getattr(df.groupby("a", as_index=False), reduction_func)()
592+
tm.assert_frame_equal(result, expected)
593+
594+
result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)()
595+
tm.assert_frame_equal(result, expected)
596+
597+
576598
def test_max_nan_bug():
577599
raw = """,Date,app,File
578600
-04-23,2013-04-23 00:00:00,,log080001.log

0 commit comments

Comments
 (0)