Skip to content

Commit 9b90af7

Browse files
committed
BUG: less false positives with SettingWithCopy (GH6025)
1 parent eacc354 commit 9b90af7

File tree

8 files changed

+73
-29
lines changed

8 files changed

+73
-29
lines changed

doc/source/release.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ API Changes
5959
- ``Series.sort`` will raise a ``ValueError`` (rather than a ``TypeError``) on sorting an
6060
object that is a view of another (:issue:`5856`, :issue:`5853`)
6161
- Raise/Warn ``SettingWithCopyError`` (according to the option ``chained_assignment`` in more cases,
62-
when detecting chained assignment, related (:issue:`5938`)
62+
when detecting chained assignment, related (:issue:`5938`, :issue:`6025`)
6363
- DataFrame.head(0) returns self instead of empty frame (:issue:`5846`)
6464
- ``autocorrelation_plot`` now accepts ``**kwargs``. (:issue:`5623`)
6565
- ``convert_objects`` now accepts a ``convert_timedeltas='coerce'`` argument to allow forced dtype conversion of

pandas/core/frame.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1582,7 +1582,7 @@ def _ixs(self, i, axis=0, copy=False):
15821582
new_values, copy = self._data.fast_2d_xs(i, copy=copy)
15831583
result = Series(new_values, index=self.columns,
15841584
name=self.index[i], dtype=new_values.dtype)
1585-
result.is_copy=copy
1585+
result._set_is_copy(self, copy=copy)
15861586
return result
15871587

15881588
# icol
@@ -1707,7 +1707,7 @@ def _getitem_multilevel(self, key):
17071707
if isinstance(result, Series):
17081708
result = Series(result, index=self.index, name=key)
17091709

1710-
result.is_copy=True
1710+
result._set_is_copy(self)
17111711
return result
17121712
else:
17131713
return self._get_item_cache(key)
@@ -1878,6 +1878,7 @@ def __setitem__(self, key, value):
18781878
self._set_item(key, value)
18791879

18801880
def _setitem_slice(self, key, value):
1881+
self._check_setitem_copy()
18811882
self.ix._setitem_with_indexer(key, value)
18821883

18831884
def _setitem_array(self, key, value):
@@ -1912,6 +1913,7 @@ def _setitem_frame(self, key, value):
19121913
raise TypeError(
19131914
'Cannot do boolean setting on mixed-type frame')
19141915

1916+
self._check_setitem_copy()
19151917
self.where(-key, value, inplace=True)
19161918

19171919
def _ensure_valid_index(self, value):

pandas/core/generic.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import warnings
33
import operator
44
import weakref
5+
import gc
56
import numpy as np
67
import pandas.lib as lib
78

@@ -97,7 +98,7 @@ def __init__(self, data, axes=None, copy=False, dtype=None,
9798
for i, ax in enumerate(axes):
9899
data = data.reindex_axis(ax, axis=i)
99100

100-
object.__setattr__(self, 'is_copy', False)
101+
object.__setattr__(self, 'is_copy', None)
101102
object.__setattr__(self, '_data', data)
102103
object.__setattr__(self, '_item_cache', {})
103104

@@ -994,6 +995,9 @@ def _get_item_cache(self, item):
994995
res = self._box_item_values(item, values)
995996
cache[item] = res
996997
res._set_as_cached(item, self)
998+
999+
# for a chain
1000+
res.is_copy = self.is_copy
9971001
return res
9981002

9991003
def _set_as_cached(self, item, cacher):
@@ -1031,16 +1035,14 @@ def _maybe_update_cacher(self, clear=False):
10311035
# a copy
10321036
if ref is None:
10331037
del self._cacher
1034-
self.is_copy = True
1035-
self._check_setitem_copy(stacklevel=5, t='referant')
10361038
else:
10371039
try:
10381040
ref._maybe_cache_changed(cacher[0], self)
10391041
except:
10401042
pass
1041-
if ref.is_copy:
1042-
self.is_copy = True
1043-
self._check_setitem_copy(stacklevel=5, t='referant')
1043+
1044+
# check if we are a copy
1045+
self._check_setitem_copy(stacklevel=5, t='referant')
10441046

10451047
if clear:
10461048
self._clear_item_cache()
@@ -1055,13 +1057,35 @@ def _set_item(self, key, value):
10551057
self._data.set(key, value)
10561058
self._clear_item_cache()
10571059

1060+
def _set_is_copy(self, ref=None, copy=True):
1061+
if not copy:
1062+
self.is_copy = None
1063+
else:
1064+
if ref is not None:
1065+
self.is_copy = weakref.ref(ref)
1066+
else:
1067+
self.is_copy = None
1068+
10581069
def _check_setitem_copy(self, stacklevel=4, t='setting'):
10591070
""" validate if we are doing a settitem on a chained copy.
10601071
10611072
If you call this function, be sure to set the stacklevel such that the
10621073
user will see the error *at the level of setting*"""
10631074
if self.is_copy:
1075+
10641076
value = config.get_option('mode.chained_assignment')
1077+
if value is None:
1078+
return
1079+
1080+
# see if the copy is not actually refererd; if so, then disolve
1081+
# the copy weakref
1082+
try:
1083+
gc.collect(2)
1084+
if not gc.get_referents(self.is_copy()):
1085+
self.is_copy = None
1086+
return
1087+
except:
1088+
pass
10651089

10661090
if t == 'referant':
10671091
t = ("A value is trying to be set on a copy of a slice from a "
@@ -1143,7 +1167,7 @@ def take(self, indices, axis=0, convert=True, is_copy=True):
11431167

11441168
# maybe set copy if we didn't actually change the index
11451169
if is_copy and not result._get_axis(axis).equals(self._get_axis(axis)):
1146-
result.is_copy=is_copy
1170+
result._set_is_copy(self)
11471171

11481172
return result
11491173

@@ -1276,12 +1300,12 @@ def xs(self, key, axis=0, level=None, copy=True, drop_level=True):
12761300
new_values, copy = self._data.fast_2d_xs(loc, copy=copy)
12771301
result = Series(new_values, index=self.columns,
12781302
name=self.index[loc])
1279-
result.is_copy=True
12801303

12811304
else:
12821305
result = self[loc]
12831306
result.index = new_index
12841307

1308+
result._set_is_copy(self)
12851309
return result
12861310

12871311
_xs = xs

pandas/core/indexing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ def _setitem_with_indexer(self, indexer, value):
211211
labels = _safe_append_to_index(index, key)
212212
self.obj._data = self.obj.reindex_axis(labels, i)._data
213213
self.obj._maybe_update_cacher(clear=True)
214-
self.obj.is_copy=False
214+
self.obj.is_copy=None
215215

216216
if isinstance(labels, MultiIndex):
217217
self.obj.sortlevel(inplace=True)
@@ -418,6 +418,7 @@ def can_do_equal_len():
418418
if isinstance(value, ABCPanel):
419419
value = self._align_panel(indexer, value)
420420

421+
# actually do the set
421422
self.obj._data = self.obj._data.setitem(indexer, value)
422423
self.obj._maybe_update_cacher(clear=True)
423424

pandas/io/json.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def __init__(self, obj, orient, date_format, double_precision,
6060
self.date_unit = date_unit
6161
self.default_handler = default_handler
6262

63-
self.is_copy = False
63+
self.is_copy = None
6464
self._format_axes()
6565

6666
def _format_axes(self):

pandas/tests/test_index.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,10 +1380,10 @@ def test_set_value_keeps_names(self):
13801380
columns=['one', 'two', 'three', 'four'],
13811381
index=idx)
13821382
df = df.sortlevel()
1383-
self.assert_(df.is_copy is False)
1383+
self.assert_(df.is_copy is None)
13841384
self.assertEqual(df.index.names, ('Name', 'Number'))
13851385
df = df.set_value(('grethe', '4'), 'one', 99.34)
1386-
self.assert_(df.is_copy is False)
1386+
self.assert_(df.is_copy is None)
13871387
self.assertEqual(df.index.names, ('Name', 'Number'))
13881388

13891389
def test_names(self):

pandas/tests/test_indexing.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1980,19 +1980,19 @@ def test_detect_chained_assignment(self):
19801980
# work with the chain
19811981
expected = DataFrame([[-5,1],[-6,3]],columns=list('AB'))
19821982
df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),dtype='int64')
1983-
self.assert_(not df.is_copy)
1983+
self.assert_(df.is_copy is None)
19841984

19851985
df['A'][0] = -5
19861986
df['A'][1] = -6
19871987
assert_frame_equal(df, expected)
19881988

19891989
expected = DataFrame([[-5,2],[np.nan,3.]],columns=list('AB'))
19901990
df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)})
1991-
self.assert_(not df.is_copy)
1991+
self.assert_(df.is_copy is None)
19921992
df['A'][0] = -5
19931993
df['A'][1] = np.nan
19941994
assert_frame_equal(df, expected)
1995-
self.assert_(not df['A'].is_copy)
1995+
self.assert_(df['A'].is_copy is None)
19961996

19971997
# using a copy (the chain), fails
19981998
df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)})
@@ -2004,7 +2004,7 @@ def f():
20042004
df = DataFrame({'a' : ['one', 'one', 'two',
20052005
'three', 'two', 'one', 'six'],
20062006
'c' : Series(range(7),dtype='int64') })
2007-
self.assert_(not df.is_copy)
2007+
self.assert_(df.is_copy is None)
20082008
expected = DataFrame({'a' : ['one', 'one', 'two',
20092009
'three', 'two', 'one', 'six'],
20102010
'c' : [42,42,2,3,4,42,6]})
@@ -2033,7 +2033,7 @@ def f():
20332033
# make sure that is_copy is picked up reconstruction
20342034
# GH5475
20352035
df = DataFrame({"A": [1,2]})
2036-
self.assert_(df.is_copy is False)
2036+
self.assert_(df.is_copy is None)
20372037
with tm.ensure_clean('__tmp__pickle') as path:
20382038
df.to_pickle(path)
20392039
df2 = pd.read_pickle(path)
@@ -2058,33 +2058,42 @@ def random_text(nobs=100):
20582058

20592059
# always a copy
20602060
x = df.iloc[[0,1,2]]
2061-
self.assert_(x.is_copy is True)
2061+
self.assert_(x.is_copy is not None)
20622062
x = df.iloc[[0,1,2,4]]
2063-
self.assert_(x.is_copy is True)
2063+
self.assert_(x.is_copy is not None)
20642064

20652065
# explicity copy
20662066
indexer = df.letters.apply(lambda x : len(x) > 10)
20672067
df = df.ix[indexer].copy()
2068-
self.assert_(df.is_copy is False)
2068+
self.assert_(df.is_copy is None)
20692069
df['letters'] = df['letters'].apply(str.lower)
20702070

20712071
# implicity take
20722072
df = random_text(100000)
20732073
indexer = df.letters.apply(lambda x : len(x) > 10)
20742074
df = df.ix[indexer]
2075-
self.assert_(df.is_copy is True)
2075+
self.assert_(df.is_copy is not None)
2076+
df['letters'] = df['letters'].apply(str.lower)
2077+
2078+
# implicity take 2
2079+
df = random_text(100000)
2080+
indexer = df.letters.apply(lambda x : len(x) > 10)
2081+
df = df.ix[indexer]
2082+
self.assert_(df.is_copy is not None)
20762083
df.loc[:,'letters'] = df['letters'].apply(str.lower)
20772084

2078-
# this will raise
2079-
#df['letters'] = df['letters'].apply(str.lower)
2085+
# should be ok even though its a copy!
2086+
self.assert_(df.is_copy is None)
2087+
df['letters'] = df['letters'].apply(str.lower)
2088+
self.assert_(df.is_copy is None)
20802089

20812090
df = random_text(100000)
20822091
indexer = df.letters.apply(lambda x : len(x) > 10)
20832092
df.ix[indexer,'letters'] = df.ix[indexer,'letters'].apply(str.lower)
20842093

20852094
# an identical take, so no copy
20862095
df = DataFrame({'a' : [1]}).dropna()
2087-
self.assert_(df.is_copy is False)
2096+
self.assert_(df.is_copy is None)
20882097
df['a'] += 1
20892098

20902099
# inplace ops
@@ -2123,7 +2132,15 @@ def f():
21232132
df[['c']][mask] = df[['b']][mask]
21242133
self.assertRaises(com.SettingWithCopyError, f)
21252134

2126-
pd.set_option('chained_assignment','warn')
2135+
# false positives GH6025
2136+
df = DataFrame ({'column1':['a', 'a', 'a'], 'column2': [4,8,9] })
2137+
str(df)
2138+
df['column1'] = df['column1'] + 'b'
2139+
str(df)
2140+
df = df [df['column2']!=8]
2141+
str(df)
2142+
df['column1'] = df['column1'] + 'c'
2143+
str(df)
21272144

21282145
def test_float64index_slicing_bug(self):
21292146
# GH 5557, related to slicing a float index

pandas/tools/tests/test_pivot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def _check_output(res, col, rows=['A', 'B'], cols=['C']):
174174
exp = self.data.groupby(rows)[col].mean()
175175
tm.assert_series_equal(cmarg, exp)
176176

177-
res.sortlevel(inplace=True)
177+
res = res.sortlevel()
178178
rmarg = res.xs(('All', ''))[:-1]
179179
exp = self.data.groupby(cols)[col].mean()
180180
tm.assert_series_equal(rmarg, exp)

0 commit comments

Comments
 (0)