Skip to content
51 changes: 30 additions & 21 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
needs_i8_conversion)
is_object_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import isnull, na_value_for_dtype

from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
Expand Down Expand Up @@ -891,7 +891,7 @@ def _maybe_coerce_merge_keys(self):
# coerce these if they are originally incompatible types
#
# for example if these are categorical, but are not dtype_equal
# or if we have object and integer dtypes
# or if we have object and integer dtypes whose inferred types differ

for lk, rk, name in zip(self.left_join_keys,
self.right_join_keys,
Expand All @@ -901,6 +901,8 @@ def _maybe_coerce_merge_keys(self):

lk_is_cat = is_categorical_dtype(lk)
rk_is_cat = is_categorical_dtype(rk)
lk_is_object = is_object_dtype(lk)
rk_is_object = is_object_dtype(rk)

# if either left or right is a categorical
# then they must match exactly in categories & ordered
Expand All @@ -925,7 +927,7 @@ def _maybe_coerce_merge_keys(self):
# the same, then proceed
if is_numeric_dtype(lk) and is_numeric_dtype(rk):
if lk.dtype.kind == rk.dtype.kind:
pass
continue

# check whether ints and floats
elif is_integer_dtype(rk) and is_float_dtype(lk):
Expand All @@ -934,29 +936,34 @@ def _maybe_coerce_merge_keys(self):
'columns where the float values '
'are not equal to their int '
'representation', UserWarning)
continue

elif is_float_dtype(rk) and is_integer_dtype(lk):
if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
warnings.warn('You are merging on int and float '
'columns where the float values '
'are not equal to their int '
'representation', UserWarning)
continue

# let's infer and see if we are ok
elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
pass
continue

# Check if we are trying to merge on obviously
# incompatible dtypes GH 9780, GH 15800

# boolean values are considered as numeric, but are still allowed
# to be merged on object boolean values
elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk))
and not is_numeric_dtype(rk)):
raise ValueError(msg)
elif (not is_numeric_dtype(lk)
and (is_numeric_dtype(rk) and not is_bool_dtype(rk))):
raise ValueError(msg)
# bool values are coerced to object
elif ((lk_is_object and is_bool_dtype(rk)) or
(is_bool_dtype(lk) and rk_is_object)):
pass

# object values are allowed to be merged
elif ((lk_is_object and is_numeric_dtype(rk)) or
(is_numeric_dtype(lk) and rk_is_object)):
continue

# datetimelikes must match exactly
elif is_datetimelike(lk) and not is_datetimelike(rk):
raise ValueError(msg)
elif not is_datetimelike(lk) and is_datetimelike(rk):
Expand All @@ -966,22 +973,24 @@ def _maybe_coerce_merge_keys(self):
elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
raise ValueError(msg)

elif lk_is_object and rk_is_object:
continue

# Houston, we have a problem!
# let's coerce to object if the dtypes aren't
# categorical, otherwise coerce to the category
# dtype. If we coerced categories to object,
# then we would lose type information on some
# columns, and end up trying to merge
# incompatible dtypes. See GH 16900.
else:
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
self.left = self.left.assign(
**{name: self.left[name].astype(typ)})
if name in self.right.columns:
typ = rk.categories.dtype if rk_is_cat else object
self.right = self.right.assign(
**{name: self.right[name].astype(typ)})
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
self.left = self.left.assign(
**{name: self.left[name].astype(typ)})
if name in self.right.columns:
typ = rk.categories.dtype if rk_is_cat else object
self.right = self.right.assign(
**{name: self.right[name].astype(typ)})

def _validate_specification(self):
# Hm, any way to make this logic less complicated??
Expand Down
68 changes: 43 additions & 25 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
Series, UInt64Index)
from pandas.api.types import CategoricalDtype as CDT
from pandas.compat import lrange
from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype
from pandas.core.dtypes.common import (
is_categorical_dtype, is_object_dtype, is_float_dtype, is_integer_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import MergeError, merge
Expand Down Expand Up @@ -942,22 +943,8 @@ def test_different(self, right_vals):
# GH 9780
# We allow merging on object and categorical cols and cast
# categorical cols to object
if (is_categorical_dtype(right['A'].dtype) or
is_object_dtype(right['A'].dtype)):
result = pd.merge(left, right, on='A')
assert is_object_dtype(result.A.dtype)

# GH 9780
# We raise for merging on object col and int/float col and
# merging on categorical col and int/float col
else:
msg = ("You are trying to merge on "
"{lk_dtype} and {rk_dtype} columns. "
"If you wish to proceed you should use "
"pd.concat".format(lk_dtype=left['A'].dtype,
rk_dtype=right['A'].dtype))
with pytest.raises(ValueError, match=msg):
pd.merge(left, right, on='A')
result = pd.merge(left, right, on='A')
assert is_object_dtype(result.A.dtype)

@pytest.mark.parametrize('d1', [np.int64, np.int32,
np.int16, np.int8, np.uint8])
Expand Down Expand Up @@ -1055,22 +1042,53 @@ def test_merge_incompat_infer_boolean_object(self):
result = pd.merge(df2, df1, on='key')
assert_frame_equal(result, expected)

@pytest.mark.parametrize('df1_vals, df2_vals, left_type, right_type', [

# infer to numeric
([0, 1, 2], ["0", "1", "2"],
is_integer_dtype, is_object_dtype),
([0.0, 1.0, 2.0], ["0", "1", "2"],
is_float_dtype, is_object_dtype),

# unicode does not infer to numeric
([0, 1, 2], [u"0", u"1", u"2"],
is_integer_dtype, is_object_dtype),

# merge on category coerces to object
([0, 1, 2], Series(['a', 'b', 'a']).astype('category'),
is_object_dtype, is_object_dtype),
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category'),
is_object_dtype, is_object_dtype),

# bool will infer if possible
([0, 1], pd.Series([False, True], dtype=object),
is_integer_dtype, is_object_dtype),
([0, 1], pd.Series([False, True], dtype=bool),
is_object_dtype, is_object_dtype)
])
def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals,
left_type, right_type):
# these are explicitly allowed incompat merges, that pass thru
# the result type is dependent on if the values on the rhs are
# inferred, otherwise these will be coerced to object

df1 = DataFrame({'A': df1_vals})
df2 = DataFrame({'A': df2_vals})

result = pd.merge(df1, df2, on=['A'])
assert left_type(result.A.dtype)
result = pd.merge(df2, df1, on=['A'])
assert right_type(result.A.dtype)

@pytest.mark.parametrize('df1_vals, df2_vals', [
([0, 1, 2], ["0", "1", "2"]),
([0.0, 1.0, 2.0], ["0", "1", "2"]),
([0, 1, 2], [u"0", u"1", u"2"]),
(pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
'2011-01-02']),
(pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
(pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
(pd.date_range('20130101', periods=3),
pd.date_range('20130101', periods=3, tz='US/Eastern')),
([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
# TODO ([0, 1], pd.Series([False, True], dtype=bool)),
([0, 1], pd.Series([False, True], dtype=object))
])
def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
# GH 9780, GH 15800
# Raise a ValueError when a user tries to merge on
# dtypes that are incompatible (e.g., obj and int/float)
Expand Down