pandas-dev · jreback · Mar 14, 2013 · Mar 14, 2013
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -146,6 +146,7 @@ pandas 0.11.0
   - Bug in DataFrame column insertion when the column creation fails, existing frame is left in
     an irrecoverable state (GH3010_)
   - Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_)
+  - Bug in DataFrame combine_first where non-specified values could cause dtype changes (GH3041_)
   - Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from 
     other values), (GH2850_)
 
@@ -178,6 +179,7 @@ pandas 0.11.0
 .. _GH3010: https://github.com/pydata/pandas/issues/3010
 .. _GH3012: https://github.com/pydata/pandas/issues/3012
 .. _GH3029: https://github.com/pydata/pandas/issues/3029
+.. _GH3041: https://github.com/pydata/pandas/issues/3041
 
 
 pandas 0.10.1

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3723,7 +3723,7 @@ def _compare(a, b):
         return self._constructor(data=new_data, index=self.index,
                                  columns=self.columns, copy=False)
 
-    def combine(self, other, func, fill_value=None):
+    def combine(self, other, func, fill_value=None, overwrite=True):
         """
         Add two DataFrame objects and do not propagate NaN values, so if for a
         (column, time) one frame is missing a value, it will default to the
@@ -3734,6 +3734,8 @@ def combine(self, other, func, fill_value=None):
         other : DataFrame
         func : function
         fill_value : scalar value
+        overwrite : boolean, default True
+            If True then overwrite values for common keys in the calling frame
 
         Returns
         -------
@@ -3760,9 +3762,16 @@ def combine(self, other, func, fill_value=None):
             series = this[col].values
             otherSeries = other[col].values
 
+            this_mask = isnull(series)
+            other_mask = isnull(otherSeries)
+
+            # don't overwrite columns unecessarily
+            # DO propogate if this column is not in the intersection
+            if not overwrite and other_mask.all():
+                result[col] = this[col].copy()
+                continue
+
             if do_fill:
-                this_mask = isnull(series)
-                other_mask = isnull(otherSeries)
                 series = series.copy()
                 otherSeries = otherSeries.copy()
                 series[this_mask] = fill_value
@@ -3798,7 +3807,7 @@ def combine_first(self, other):
         combined : DataFrame
         """
         combiner = lambda x, y: np.where(isnull(x), y, x)
-        return self.combine(other, combiner)
+        return self.combine(other, combiner, overwrite=False)
 
     def update(self, other, join='left', overwrite=True, filter_func=None,
                raise_conflict=False):

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -7248,6 +7248,30 @@ def test_combine_first_mixed_bug(self):
         combined = frame1.combine_first(frame2)
         self.assertEqual(len(combined.columns), 5)
 
+        # gh 3016 (same as in update)
+        df = DataFrame([[1.,2.,False, True],[4.,5.,True,False]],
+                       columns=['A','B','bool1','bool2'])
+
+        other = DataFrame([[45,45]],index=[0],columns=['A','B'])
+        result = df.combine_first(other)
+        assert_frame_equal(result, df)
+
+        df.ix[0,'A'] = np.nan
+        result = df.combine_first(other)
+        df.ix[0,'A'] = 45
+        assert_frame_equal(result, df)
+
+        # doc example
+        df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
+                         'B' : [np.nan, 2., 3., np.nan, 6.]})
+
+        df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
+                         'B' : [np.nan, np.nan, 3., 4., 6., 8.]})
+
+        result = df1.combine_first(df2)
+        expected = DataFrame({ 'A' : [1,2,3,5,3,7.], 'B' : [np.nan,2,3,4,6,8] })
+        assert_frame_equal(result,expected)
+
     def test_update(self):
         df = DataFrame([[1.5, nan, 3.],
                         [1.5, nan, 3.],