BUG: less false positives with SettingWithCopy (GH6025)

jreback · jreback · commit 9b90af760155 · 2014-01-22T17:49:39.000-05:00
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -59,7 +59,7 @@ API Changes
   - ``Series.sort`` will raise a ``ValueError`` (rather than a ``TypeError``) on sorting an
     object that is a view of another (:issue:`5856`, :issue:`5853`)
   - Raise/Warn ``SettingWithCopyError`` (according to the option ``chained_assignment`` in more cases,
-    when detecting chained assignment, related (:issue:`5938`)
+    when detecting chained assignment, related (:issue:`5938`, :issue:`6025`)
   - DataFrame.head(0) returns self instead of empty frame (:issue:`5846`)
   - ``autocorrelation_plot`` now accepts ``**kwargs``. (:issue:`5623`)
   - ``convert_objects`` now accepts a ``convert_timedeltas='coerce'`` argument to allow forced dtype conversion of
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1582,7 +1582,7 @@ def _ixs(self, i, axis=0, copy=False):
                     new_values, copy = self._data.fast_2d_xs(i, copy=copy)
                     result = Series(new_values, index=self.columns,
                                     name=self.index[i], dtype=new_values.dtype)
-                result.is_copy=copy
+                result._set_is_copy(self, copy=copy)
                 return result
 
         # icol
@@ -1707,7 +1707,7 @@ def _getitem_multilevel(self, key):
                     if isinstance(result, Series):
                         result = Series(result, index=self.index, name=key)
 
-            result.is_copy=True
+            result._set_is_copy(self)
             return result
         else:
             return self._get_item_cache(key)
@@ -1878,6 +1878,7 @@ def __setitem__(self, key, value):
             self._set_item(key, value)
 
     def _setitem_slice(self, key, value):
+        self._check_setitem_copy()
         self.ix._setitem_with_indexer(key, value)
 
     def _setitem_array(self, key, value):
@@ -1912,6 +1913,7 @@ def _setitem_frame(self, key, value):
                 raise TypeError(
                     'Cannot do boolean setting on mixed-type frame')
 
+        self._check_setitem_copy()
         self.where(-key, value, inplace=True)
 
     def _ensure_valid_index(self, value):
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2,6 +2,7 @@
 import warnings
 import operator
 import weakref
+import gc
 import numpy as np
 import pandas.lib as lib
 
@@ -97,7 +98,7 @@ def __init__(self, data, axes=None, copy=False, dtype=None,
                 for i, ax in enumerate(axes):
                     data = data.reindex_axis(ax, axis=i)
 
-        object.__setattr__(self, 'is_copy', False)
+        object.__setattr__(self, 'is_copy', None)
         object.__setattr__(self, '_data', data)
         object.__setattr__(self, '_item_cache', {})
 
@@ -994,6 +995,9 @@ def _get_item_cache(self, item):
             res = self._box_item_values(item, values)
             cache[item] = res
             res._set_as_cached(item, self)
+
+            # for a chain
+            res.is_copy = self.is_copy
         return res
 
     def _set_as_cached(self, item, cacher):
@@ -1031,16 +1035,14 @@ def _maybe_update_cacher(self, clear=False):
             # a copy
             if ref is None:
                 del self._cacher
-                self.is_copy = True
-                self._check_setitem_copy(stacklevel=5, t='referant')
             else:
                 try:
                     ref._maybe_cache_changed(cacher[0], self)
                 except:
                     pass
-                if ref.is_copy:
-                    self.is_copy = True
-                    self._check_setitem_copy(stacklevel=5, t='referant')
+
+        # check if we are a copy
+        self._check_setitem_copy(stacklevel=5, t='referant')
 
         if clear:
             self._clear_item_cache()
@@ -1055,13 +1057,35 @@ def _set_item(self, key, value):
         self._data.set(key, value)
         self._clear_item_cache()
 
+    def _set_is_copy(self, ref=None, copy=True):
+        if not copy:
+            self.is_copy = None
+        else:
+            if ref is not None:
+                self.is_copy = weakref.ref(ref)
+            else:
+                self.is_copy = None
+
     def _check_setitem_copy(self, stacklevel=4, t='setting'):
         """ validate if we are doing a settitem on a chained copy.
 
         If you call this function, be sure to set the stacklevel such that the
         user will see the error *at the level of setting*"""
         if self.is_copy:
+
             value = config.get_option('mode.chained_assignment')
+            if value is None:
+                return
+
+            # see if the copy is not actually refererd; if so, then disolve
+            # the copy weakref
+            try:
+                gc.collect(2)
+                if not gc.get_referents(self.is_copy()):
+                    self.is_copy = None
+                    return
+            except:
+                pass
 
             if t == 'referant':
                 t = ("A value is trying to be set on a copy of a slice from a "
@@ -1143,7 +1167,7 @@ def take(self, indices, axis=0, convert=True, is_copy=True):
 
         # maybe set copy if we didn't actually change the index
         if is_copy and not result._get_axis(axis).equals(self._get_axis(axis)):
-            result.is_copy=is_copy
+            result._set_is_copy(self)
 
         return result
 
@@ -1276,12 +1300,12 @@ def xs(self, key, axis=0, level=None, copy=True, drop_level=True):
             new_values, copy = self._data.fast_2d_xs(loc, copy=copy)
             result = Series(new_values, index=self.columns,
                             name=self.index[loc])
-            result.is_copy=True
 
         else:
             result = self[loc]
             result.index = new_index
 
+        result._set_is_copy(self)
         return result
 
     _xs = xs
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -211,7 +211,7 @@ def _setitem_with_indexer(self, indexer, value):
                     labels = _safe_append_to_index(index, key)
                     self.obj._data = self.obj.reindex_axis(labels, i)._data
                     self.obj._maybe_update_cacher(clear=True)
-                    self.obj.is_copy=False
+                    self.obj.is_copy=None
 
                     if isinstance(labels, MultiIndex):
                         self.obj.sortlevel(inplace=True)
@@ -418,6 +418,7 @@ def can_do_equal_len():
             if isinstance(value, ABCPanel):
                 value = self._align_panel(indexer, value)
 
+            # actually do the set
             self.obj._data = self.obj._data.setitem(indexer, value)
             self.obj._maybe_update_cacher(clear=True)
 
diff --git a/pandas/io/json.py b/pandas/io/json.py
@@ -60,7 +60,7 @@ def __init__(self, obj, orient, date_format, double_precision,
         self.date_unit = date_unit
         self.default_handler = default_handler
 
-        self.is_copy = False
+        self.is_copy = None
         self._format_axes()
 
     def _format_axes(self):
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -1380,10 +1380,10 @@ def test_set_value_keeps_names(self):
             columns=['one', 'two', 'three', 'four'],
             index=idx)
         df = df.sortlevel()
-        self.assert_(df.is_copy is False)
+        self.assert_(df.is_copy is None)
         self.assertEqual(df.index.names, ('Name', 'Number'))
         df = df.set_value(('grethe', '4'), 'one', 99.34)
-        self.assert_(df.is_copy is False)
+        self.assert_(df.is_copy is None)
         self.assertEqual(df.index.names, ('Name', 'Number'))
 
     def test_names(self):
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -1980,19 +1980,19 @@ def test_detect_chained_assignment(self):
         # work with the chain
         expected = DataFrame([[-5,1],[-6,3]],columns=list('AB'))
         df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),dtype='int64')
-        self.assert_(not df.is_copy)
+        self.assert_(df.is_copy is None)
 
         df['A'][0] = -5
         df['A'][1] = -6
         assert_frame_equal(df, expected)
 
         expected = DataFrame([[-5,2],[np.nan,3.]],columns=list('AB'))
         df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)})
-        self.assert_(not df.is_copy)
+        self.assert_(df.is_copy is None)
         df['A'][0] = -5
         df['A'][1] = np.nan
         assert_frame_equal(df, expected)
-        self.assert_(not df['A'].is_copy)
+        self.assert_(df['A'].is_copy is None)
 
         # using a copy (the chain), fails
         df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)})
@@ -2004,7 +2004,7 @@ def f():
         df = DataFrame({'a' : ['one', 'one', 'two',
                                'three', 'two', 'one', 'six'],
                         'c' : Series(range(7),dtype='int64') })
-        self.assert_(not df.is_copy)
+        self.assert_(df.is_copy is None)
         expected = DataFrame({'a' : ['one', 'one', 'two',
                                      'three', 'two', 'one', 'six'],
                               'c' : [42,42,2,3,4,42,6]})
@@ -2033,7 +2033,7 @@ def f():
         # make sure that is_copy is picked up reconstruction
         # GH5475
         df = DataFrame({"A": [1,2]})
-        self.assert_(df.is_copy is False)
+        self.assert_(df.is_copy is None)
         with tm.ensure_clean('__tmp__pickle') as path:
             df.to_pickle(path)
             df2 = pd.read_pickle(path)
@@ -2058,33 +2058,42 @@ def random_text(nobs=100):
 
         # always a copy
         x = df.iloc[[0,1,2]]
-        self.assert_(x.is_copy is True)
+        self.assert_(x.is_copy is not None)
         x = df.iloc[[0,1,2,4]]
-        self.assert_(x.is_copy is True)
+        self.assert_(x.is_copy is not None)
 
         # explicity copy
         indexer = df.letters.apply(lambda x : len(x) > 10)
         df = df.ix[indexer].copy()
-        self.assert_(df.is_copy is False)
+        self.assert_(df.is_copy is None)
         df['letters'] = df['letters'].apply(str.lower)
 
         # implicity take
         df = random_text(100000)
         indexer = df.letters.apply(lambda x : len(x) > 10)
         df = df.ix[indexer]
-        self.assert_(df.is_copy is True)
+        self.assert_(df.is_copy is not None)
+        df['letters'] = df['letters'].apply(str.lower)
+
+        # implicity take 2
+        df = random_text(100000)
+        indexer = df.letters.apply(lambda x : len(x) > 10)
+        df = df.ix[indexer]
+        self.assert_(df.is_copy is not None)
         df.loc[:,'letters'] = df['letters'].apply(str.lower)
 
-        # this will raise
-        #df['letters'] = df['letters'].apply(str.lower)
+        # should be ok even though its a copy!
+        self.assert_(df.is_copy is  None)
+        df['letters'] = df['letters'].apply(str.lower)
+        self.assert_(df.is_copy is None)
 
         df = random_text(100000)
         indexer = df.letters.apply(lambda x : len(x) > 10)
         df.ix[indexer,'letters'] = df.ix[indexer,'letters'].apply(str.lower)
 
         # an identical take, so no copy
         df = DataFrame({'a' : [1]}).dropna()
-        self.assert_(df.is_copy is False)
+        self.assert_(df.is_copy is None)
         df['a'] += 1
 
         # inplace ops
@@ -2123,7 +2132,15 @@ def f():
             df[['c']][mask] = df[['b']][mask]
         self.assertRaises(com.SettingWithCopyError, f)
 
-        pd.set_option('chained_assignment','warn')
+        # false positives GH6025
+        df = DataFrame ({'column1':['a', 'a', 'a'], 'column2': [4,8,9] })
+        str(df)
+        df['column1'] = df['column1'] + 'b'
+        str(df)
+        df = df [df['column2']!=8]
+        str(df)
+        df['column1'] = df['column1'] + 'c'
+        str(df)
 
     def test_float64index_slicing_bug(self):
         # GH 5557, related to slicing a float index
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
@@ -174,7 +174,7 @@ def _check_output(res, col, rows=['A', 'B'], cols=['C']):
             exp = self.data.groupby(rows)[col].mean()
             tm.assert_series_equal(cmarg, exp)
 
-            res.sortlevel(inplace=True)
+            res = res.sortlevel()
             rmarg = res.xs(('All', ''))[:-1]
             exp = self.data.groupby(cols)[col].mean()
             tm.assert_series_equal(rmarg, exp)