pandas-dev · jreback · Jan 3, 2019 · Jun 29, 2018 · Dec 18, 2018 · Dec 26, 2018
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -20,7 +20,7 @@
     is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
     is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
     is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
-    needs_i8_conversion)
+    is_object_dtype, needs_i8_conversion)
 from pandas.core.dtypes.missing import isnull, na_value_for_dtype
 
 from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
@@ -891,7 +891,7 @@ def _maybe_coerce_merge_keys(self):
         # coerce these if they are originally incompatible types
         #
         # for example if these are categorical, but are not dtype_equal
-        # or if we have object and integer dtypes
+        # or if we have object and integer dtypes, that do not infer
 
         for lk, rk, name in zip(self.left_join_keys,
                                 self.right_join_keys,
@@ -901,6 +901,8 @@ def _maybe_coerce_merge_keys(self):
 
             lk_is_cat = is_categorical_dtype(lk)
             rk_is_cat = is_categorical_dtype(rk)
+            lk_is_object = is_object_dtype(lk)
+            rk_is_object = is_object_dtype(rk)
 
             # if either left or right is a categorical
             # then the must match exactly in categories & ordered
@@ -925,7 +927,7 @@ def _maybe_coerce_merge_keys(self):
             # the same, then proceed
             if is_numeric_dtype(lk) and is_numeric_dtype(rk):
                 if lk.dtype.kind == rk.dtype.kind:
-                    pass
+                    continue
 
                 # check whether ints and floats
                 elif is_integer_dtype(rk) and is_float_dtype(lk):
@@ -934,29 +936,34 @@ def _maybe_coerce_merge_keys(self):
                                       'columns where the float values '
                                       'are not equal to their int '
                                       'representation', UserWarning)
+                    continue
 
                 elif is_float_dtype(rk) and is_integer_dtype(lk):
                     if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
                         warnings.warn('You are merging on int and float '
                                       'columns where the float values '
                                       'are not equal to their int '
                                       'representation', UserWarning)
+                    continue
 
                 # let's infer and see if we are ok
                 elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
-                    pass
+                    continue
 
             # Check if we are trying to merge on obviously
             # incompatible dtypes GH 9780, GH 15800
 
-            # boolean values are considered as numeric, but are still allowed
-            # to be merged on object boolean values
-            elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk))
-                    and not is_numeric_dtype(rk)):
-                raise ValueError(msg)
-            elif (not is_numeric_dtype(lk)
-                    and (is_numeric_dtype(rk) and not is_bool_dtype(rk))):
-                raise ValueError(msg)
+            # bool values are coerced to object
+            elif ((lk_is_object and is_bool_dtype(rk)) or
+                  (is_bool_dtype(lk) and rk_is_object)):
+                pass
+
+            # object values are allowed to be merged
+            elif ((lk_is_object and is_numeric_dtype(rk)) or
+                  (is_numeric_dtype(lk) and rk_is_object)):
+                continue
+
+            # datetimelikes must match exactly
             elif is_datetimelike(lk) and not is_datetimelike(rk):
                 raise ValueError(msg)
             elif not is_datetimelike(lk) and is_datetimelike(rk):
@@ -966,22 +973,24 @@ def _maybe_coerce_merge_keys(self):
             elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
                 raise ValueError(msg)
 
+            elif lk_is_object and rk_is_object:
+                continue
+
             # Houston, we have a problem!
             # let's coerce to object if the dtypes aren't
             # categorical, otherwise coerce to the category
             # dtype. If we coerced categories to object,
             # then we would lose type information on some
             # columns, and end up trying to merge
             # incompatible dtypes. See GH 16900.
-            else:
-                if name in self.left.columns:
-                    typ = lk.categories.dtype if lk_is_cat else object
-                    self.left = self.left.assign(
-                        **{name: self.left[name].astype(typ)})
-                if name in self.right.columns:
-                    typ = rk.categories.dtype if rk_is_cat else object
-                    self.right = self.right.assign(
-                        **{name: self.right[name].astype(typ)})
+            if name in self.left.columns:
+                typ = lk.categories.dtype if lk_is_cat else object
+                self.left = self.left.assign(
+                    **{name: self.left[name].astype(typ)})
+            if name in self.right.columns:
+                typ = rk.categories.dtype if rk_is_cat else object
+                self.right = self.right.assign(
+                    **{name: self.right[name].astype(typ)})
 
     def _validate_specification(self):
         # Hm, any way to make this logic less complicated??

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -16,7 +16,8 @@
                     Series, UInt64Index)
 from pandas.api.types import CategoricalDtype as CDT
 from pandas.compat import lrange
-from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype
+from pandas.core.dtypes.common import (
+    is_categorical_dtype, is_object_dtype, is_float_dtype, is_integer_dtype)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.reshape.concat import concat
 from pandas.core.reshape.merge import MergeError, merge
@@ -942,22 +943,8 @@ def test_different(self, right_vals):
         # GH 9780
         # We allow merging on object and categorical cols and cast
         # categorical cols to object
-        if (is_categorical_dtype(right['A'].dtype) or
-                is_object_dtype(right['A'].dtype)):
-            result = pd.merge(left, right, on='A')
-            assert is_object_dtype(result.A.dtype)
-
-        # GH 9780
-        # We raise for merging on object col and int/float col and
-        # merging on categorical col and int/float col
-        else:
-            msg = ("You are trying to merge on "
-                   "{lk_dtype} and {rk_dtype} columns. "
-                   "If you wish to proceed you should use "
-                   "pd.concat".format(lk_dtype=left['A'].dtype,
-                                      rk_dtype=right['A'].dtype))
-            with pytest.raises(ValueError, match=msg):
-                pd.merge(left, right, on='A')
+        result = pd.merge(left, right, on='A')
+        assert is_object_dtype(result.A.dtype)
 
     @pytest.mark.parametrize('d1', [np.int64, np.int32,
                                     np.int16, np.int8, np.uint8])
@@ -1055,22 +1042,53 @@ def test_merge_incompat_infer_boolean_object(self):
         result = pd.merge(df2, df1, on='key')
         assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize('df1_vals, df2_vals, left_type, right_type', [
+
+        # infer to numeric
+        ([0, 1, 2], ["0", "1", "2"],
+         is_integer_dtype, is_object_dtype),
+        ([0.0, 1.0, 2.0], ["0", "1", "2"],
+         is_float_dtype, is_object_dtype),
+
+        # unicode does not infer to numeric
+        ([0, 1, 2], [u"0", u"1", u"2"],
+         is_integer_dtype, is_object_dtype),
+
+        # merge on category coerces to object
+        ([0, 1, 2], Series(['a', 'b', 'a']).astype('category'),
+         is_object_dtype, is_object_dtype),
+        ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category'),
+         is_object_dtype, is_object_dtype),
+
+        # bool will infer if possible
+        ([0, 1], pd.Series([False, True], dtype=object),
+         is_integer_dtype, is_object_dtype),
+        ([0, 1], pd.Series([False, True], dtype=bool),
+         is_object_dtype, is_object_dtype)
+    ])
+    def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals,
+                                          left_type, right_type):
+        # these are explicitly allowed incompatible merges that pass through;
+        # the result type depends on whether the values on the rhs are
+        # inferred; otherwise these will be coerced to object
+
+        df1 = DataFrame({'A': df1_vals})
+        df2 = DataFrame({'A': df2_vals})
+
+        result = pd.merge(df1, df2, on=['A'])
+        assert left_type(result.A.dtype)
+        result = pd.merge(df2, df1, on=['A'])
+        assert right_type(result.A.dtype)
+
     @pytest.mark.parametrize('df1_vals, df2_vals', [
-        ([0, 1, 2], ["0", "1", "2"]),
-        ([0.0, 1.0, 2.0], ["0", "1", "2"]),
-        ([0, 1, 2], [u"0", u"1", u"2"]),
         (pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
                                                           '2011-01-02']),
         (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
         (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
         (pd.date_range('20130101', periods=3),
             pd.date_range('20130101', periods=3, tz='US/Eastern')),
-        ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
-        ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
-        # TODO ([0, 1], pd.Series([False, True], dtype=bool)),
-        ([0, 1], pd.Series([False, True], dtype=object))
     ])
-    def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
+    def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
         # GH 9780, GH 15800
         # Raise a ValueError when a user tries to merge on
         # dtypes that are incompatible (e.g., obj and int/float)