Skip to content

Commit 76cd654

Browse files
committed
BUG: DataFrameGroupby std/sem modify grouped column when as_index=False
1 parent 4a267c6 commit 76cd654

File tree

3 files changed

+49
-4
lines changed

3 files changed

+49
-4
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,7 @@ Groupby/resample/rolling
803803
- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
804804
- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
805805
- Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`)
806+
- Bug in :meth:`DataFrameGroupBy.std` and :meth:`DataFrameGroupBy.sem` where the grouped-by columns would be modified when ``as_index=False`` (:issue:`10355`)
806807

807808
Reshaping
808809
^^^^^^^^^

pandas/core/groupby/groupby.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -649,11 +649,11 @@ def _set_group_selection(self):
649649
):
650650
return
651651

652-
ax = self.obj._info_axis
653652
groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]
654653

655654
if len(groupers):
656655
# GH12839 clear selected obj cache when group selection changes
656+
ax = self.obj._info_axis
657657
self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
658658
self._reset_cache("_selected_obj")
659659

@@ -1360,8 +1360,18 @@ def std(self, ddof: int = 1):
13601360
Series or DataFrame
13611361
Standard deviation of values within each group.
13621362
"""
1363-
# TODO: implement at Cython level?
1364-
return np.sqrt(self.var(ddof=ddof))
1363+
result = self.var(ddof=ddof)
1364+
if result.ndim == 1:
1365+
result = np.sqrt(result)
1366+
else:
1367+
cols = result.columns.get_indexer_for(
1368+
result.columns.difference(self.exclusions).unique()
1369+
)
1370+
# TODO(GH-22046) - setting with iloc broken if labels are not unique
1371+
# .values to remove labels
1372+
result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values
1373+
1374+
return result
13651375

13661376
@Substitution(name="groupby")
13671377
@Appender(_common_see_also)
@@ -1408,7 +1418,19 @@ def sem(self, ddof: int = 1):
14081418
Series or DataFrame
14091419
Standard error of the mean of values within each group.
14101420
"""
1411-
return self.std(ddof=ddof) / np.sqrt(self.count())
1421+
result = self.std(ddof=ddof)
1422+
if result.ndim == 1:
1423+
result /= np.sqrt(self.count())
1424+
else:
1425+
cols = result.columns.get_indexer_for(
1426+
result.columns.difference(self.exclusions).unique()
1427+
)
1428+
# TODO(GH-22046) - setting with iloc broken if labels are not unique
1429+
# .values to remove labels
1430+
result.iloc[:, cols] = (
1431+
result.iloc[:, cols].values / np.sqrt(self.count().iloc[:, cols]).values
1432+
)
1433+
return result
14121434

14131435
@Substitution(name="groupby")
14141436
@Appender(_common_see_also)

pandas/tests/groupby/test_function.py

+22
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,28 @@ def test_ops_general(op, targop):
573573
tm.assert_frame_equal(result, expected)
574574

575575

576+
def test_ops_not_as_index(reduction_func):
577+
# GH 10355
578+
# Using as_index=False should not modify grouped column
579+
580+
if reduction_func in ("nth", "ngroup", "size",):
581+
pytest.skip("Skip until behavior is determined (GH #5755)")
582+
583+
if reduction_func in ("corrwith", "idxmax", "idxmin", "mad", "nunique", "skew",):
584+
pytest.xfail(
585+
"_GroupBy._python_apply_general incorrectly modifies grouping columns"
586+
)
587+
588+
df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
589+
expected = getattr(df.groupby("a"), reduction_func)().reset_index()
590+
591+
result = getattr(df.groupby("a", as_index=False), reduction_func)()
592+
tm.assert_frame_equal(result, expected)
593+
594+
result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)()
595+
tm.assert_frame_equal(result, expected)
596+
597+
576598
def test_max_nan_bug():
577599
raw = """,Date,app,File
578600
-04-23,2013-04-23 00:00:00,,log080001.log

0 commit comments

Comments
 (0)