Skip to content

BUG: less false positives with SettingWithCopy (GH6025) #6042

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 23, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ API Changes
- ``Series.sort`` will raise a ``ValueError`` (rather than a ``TypeError``) on sorting an
object that is a view of another (:issue:`5856`, :issue:`5853`)
- Raise/Warn ``SettingWithCopyError`` (according to the option ``chained_assignment`` in more cases,
when detecting chained assignment, related (:issue:`5938`)
when detecting chained assignment, related (:issue:`5938`, :issue:`6025`)
- DataFrame.head(0) returns self instead of empty frame (:issue:`5846`)
- ``autocorrelation_plot`` now accepts ``**kwargs``. (:issue:`5623`)
- ``convert_objects`` now accepts a ``convert_timedeltas='coerce'`` argument to allow forced dtype conversion of
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1582,7 +1582,7 @@ def _ixs(self, i, axis=0, copy=False):
new_values, copy = self._data.fast_2d_xs(i, copy=copy)
result = Series(new_values, index=self.columns,
name=self.index[i], dtype=new_values.dtype)
result.is_copy=copy
result._set_is_copy(self, copy=copy)
return result

# icol
Expand Down Expand Up @@ -1707,7 +1707,7 @@ def _getitem_multilevel(self, key):
if isinstance(result, Series):
result = Series(result, index=self.index, name=key)

result.is_copy=True
result._set_is_copy(self)
return result
else:
return self._get_item_cache(key)
Expand Down Expand Up @@ -1878,6 +1878,7 @@ def __setitem__(self, key, value):
self._set_item(key, value)

def _setitem_slice(self, key, value):
self._check_setitem_copy()
self.ix._setitem_with_indexer(key, value)

def _setitem_array(self, key, value):
Expand Down Expand Up @@ -1912,6 +1913,7 @@ def _setitem_frame(self, key, value):
raise TypeError(
'Cannot do boolean setting on mixed-type frame')

self._check_setitem_copy()
self.where(-key, value, inplace=True)

def _ensure_valid_index(self, value):
Expand Down
40 changes: 32 additions & 8 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import warnings
import operator
import weakref
import gc
import numpy as np
import pandas.lib as lib

Expand Down Expand Up @@ -97,7 +98,7 @@ def __init__(self, data, axes=None, copy=False, dtype=None,
for i, ax in enumerate(axes):
data = data.reindex_axis(ax, axis=i)

object.__setattr__(self, 'is_copy', False)
object.__setattr__(self, 'is_copy', None)
object.__setattr__(self, '_data', data)
object.__setattr__(self, '_item_cache', {})

Expand Down Expand Up @@ -994,6 +995,9 @@ def _get_item_cache(self, item):
res = self._box_item_values(item, values)
cache[item] = res
res._set_as_cached(item, self)

# for a chain
res.is_copy = self.is_copy
return res

def _set_as_cached(self, item, cacher):
Expand Down Expand Up @@ -1031,16 +1035,14 @@ def _maybe_update_cacher(self, clear=False):
# a copy
if ref is None:
del self._cacher
self.is_copy = True
self._check_setitem_copy(stacklevel=5, t='referant')
else:
try:
ref._maybe_cache_changed(cacher[0], self)
except:
pass
if ref.is_copy:
self.is_copy = True
self._check_setitem_copy(stacklevel=5, t='referant')

# check if we are a copy
self._check_setitem_copy(stacklevel=5, t='referant')

if clear:
self._clear_item_cache()
Expand All @@ -1055,13 +1057,35 @@ def _set_item(self, key, value):
self._data.set(key, value)
self._clear_item_cache()

def _set_is_copy(self, ref=None, copy=True):
if not copy:
self.is_copy = None
else:
if ref is not None:
self.is_copy = weakref.ref(ref)
else:
self.is_copy = None

def _check_setitem_copy(self, stacklevel=4, t='setting'):
""" validate if we are doing a settitem on a chained copy.

If you call this function, be sure to set the stacklevel such that the
user will see the error *at the level of setting*"""
if self.is_copy:

value = config.get_option('mode.chained_assignment')
if value is None:
return

# see if the copy is not actually refererd; if so, then disolve
# the copy weakref
try:
gc.collect(2)
if not gc.get_referents(self.is_copy()):
self.is_copy = None
return
except:
pass

if t == 'referant':
t = ("A value is trying to be set on a copy of a slice from a "
Expand Down Expand Up @@ -1143,7 +1167,7 @@ def take(self, indices, axis=0, convert=True, is_copy=True):

# maybe set copy if we didn't actually change the index
if is_copy and not result._get_axis(axis).equals(self._get_axis(axis)):
result.is_copy=is_copy
result._set_is_copy(self)

return result

Expand Down Expand Up @@ -1276,12 +1300,12 @@ def xs(self, key, axis=0, level=None, copy=True, drop_level=True):
new_values, copy = self._data.fast_2d_xs(loc, copy=copy)
result = Series(new_values, index=self.columns,
name=self.index[loc])
result.is_copy=True

else:
result = self[loc]
result.index = new_index

result._set_is_copy(self)
return result

_xs = xs
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def _setitem_with_indexer(self, indexer, value):
labels = _safe_append_to_index(index, key)
self.obj._data = self.obj.reindex_axis(labels, i)._data
self.obj._maybe_update_cacher(clear=True)
self.obj.is_copy=False
self.obj.is_copy=None

if isinstance(labels, MultiIndex):
self.obj.sortlevel(inplace=True)
Expand Down Expand Up @@ -418,6 +418,7 @@ def can_do_equal_len():
if isinstance(value, ABCPanel):
value = self._align_panel(indexer, value)

# actually do the set
self.obj._data = self.obj._data.setitem(indexer, value)
self.obj._maybe_update_cacher(clear=True)

Expand Down
2 changes: 1 addition & 1 deletion pandas/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(self, obj, orient, date_format, double_precision,
self.date_unit = date_unit
self.default_handler = default_handler

self.is_copy = False
self.is_copy = None
self._format_axes()

def _format_axes(self):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1380,10 +1380,10 @@ def test_set_value_keeps_names(self):
columns=['one', 'two', 'three', 'four'],
index=idx)
df = df.sortlevel()
self.assert_(df.is_copy is False)
self.assert_(df.is_copy is None)
self.assertEqual(df.index.names, ('Name', 'Number'))
df = df.set_value(('grethe', '4'), 'one', 99.34)
self.assert_(df.is_copy is False)
self.assert_(df.is_copy is None)
self.assertEqual(df.index.names, ('Name', 'Number'))

def test_names(self):
Expand Down
43 changes: 30 additions & 13 deletions pandas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1980,19 +1980,19 @@ def test_detect_chained_assignment(self):
# work with the chain
expected = DataFrame([[-5,1],[-6,3]],columns=list('AB'))
df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),dtype='int64')
self.assert_(not df.is_copy)
self.assert_(df.is_copy is None)

df['A'][0] = -5
df['A'][1] = -6
assert_frame_equal(df, expected)

expected = DataFrame([[-5,2],[np.nan,3.]],columns=list('AB'))
df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)})
self.assert_(not df.is_copy)
self.assert_(df.is_copy is None)
df['A'][0] = -5
df['A'][1] = np.nan
assert_frame_equal(df, expected)
self.assert_(not df['A'].is_copy)
self.assert_(df['A'].is_copy is None)

# using a copy (the chain), fails
df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)})
Expand All @@ -2004,7 +2004,7 @@ def f():
df = DataFrame({'a' : ['one', 'one', 'two',
'three', 'two', 'one', 'six'],
'c' : Series(range(7),dtype='int64') })
self.assert_(not df.is_copy)
self.assert_(df.is_copy is None)
expected = DataFrame({'a' : ['one', 'one', 'two',
'three', 'two', 'one', 'six'],
'c' : [42,42,2,3,4,42,6]})
Expand Down Expand Up @@ -2033,7 +2033,7 @@ def f():
# make sure that is_copy is picked up reconstruction
# GH5475
df = DataFrame({"A": [1,2]})
self.assert_(df.is_copy is False)
self.assert_(df.is_copy is None)
with tm.ensure_clean('__tmp__pickle') as path:
df.to_pickle(path)
df2 = pd.read_pickle(path)
Expand All @@ -2058,33 +2058,42 @@ def random_text(nobs=100):

# always a copy
x = df.iloc[[0,1,2]]
self.assert_(x.is_copy is True)
self.assert_(x.is_copy is not None)
x = df.iloc[[0,1,2,4]]
self.assert_(x.is_copy is True)
self.assert_(x.is_copy is not None)

# explicity copy
indexer = df.letters.apply(lambda x : len(x) > 10)
df = df.ix[indexer].copy()
self.assert_(df.is_copy is False)
self.assert_(df.is_copy is None)
df['letters'] = df['letters'].apply(str.lower)

# implicity take
df = random_text(100000)
indexer = df.letters.apply(lambda x : len(x) > 10)
df = df.ix[indexer]
self.assert_(df.is_copy is True)
self.assert_(df.is_copy is not None)
df['letters'] = df['letters'].apply(str.lower)

# implicity take 2
df = random_text(100000)
indexer = df.letters.apply(lambda x : len(x) > 10)
df = df.ix[indexer]
self.assert_(df.is_copy is not None)
df.loc[:,'letters'] = df['letters'].apply(str.lower)

# this will raise
#df['letters'] = df['letters'].apply(str.lower)
# should be ok even though its a copy!
self.assert_(df.is_copy is None)
df['letters'] = df['letters'].apply(str.lower)
self.assert_(df.is_copy is None)

df = random_text(100000)
indexer = df.letters.apply(lambda x : len(x) > 10)
df.ix[indexer,'letters'] = df.ix[indexer,'letters'].apply(str.lower)

# an identical take, so no copy
df = DataFrame({'a' : [1]}).dropna()
self.assert_(df.is_copy is False)
self.assert_(df.is_copy is None)
df['a'] += 1

# inplace ops
Expand Down Expand Up @@ -2123,7 +2132,15 @@ def f():
df[['c']][mask] = df[['b']][mask]
self.assertRaises(com.SettingWithCopyError, f)

pd.set_option('chained_assignment','warn')
# false positives GH6025
df = DataFrame ({'column1':['a', 'a', 'a'], 'column2': [4,8,9] })
str(df)
df['column1'] = df['column1'] + 'b'
str(df)
df = df [df['column2']!=8]
str(df)
df['column1'] = df['column1'] + 'c'
str(df)

def test_float64index_slicing_bug(self):
# GH 5557, related to slicing a float index
Expand Down
2 changes: 1 addition & 1 deletion pandas/tools/tests/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def _check_output(res, col, rows=['A', 'B'], cols=['C']):
exp = self.data.groupby(rows)[col].mean()
tm.assert_series_equal(cmarg, exp)

res.sortlevel(inplace=True)
res = res.sortlevel()
rmarg = res.xs(('All', ''))[:-1]
exp = self.data.groupby(cols)[col].mean()
tm.assert_series_equal(rmarg, exp)
Expand Down