Skip to content

Commit f9f1643

Browse files
authored
BUG: merge not always following documented sort behavior (#54611)
* BUG: merge producing inconsistent sort behavior * whatsnew * fix more tests * fix test * fix test * move whatsnew to notable bug fixes
1 parent e8b9749 commit f9f1643

File tree

11 files changed

+181
-50
lines changed

11 files changed

+181
-50
lines changed

doc/source/whatsnew/v2.2.0.rst

+42-3
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,49 @@ Notable bug fixes
3939

4040
These are bug fixes that might have notable behavior changes.
4141

42-
.. _whatsnew_220.notable_bug_fixes.notable_bug_fix1:
42+
.. _whatsnew_220.notable_bug_fixes.merge_sort_behavior:
4343

44-
notable_bug_fix1
45-
^^^^^^^^^^^^^^^^
44+
:func:`merge` and :meth:`DataFrame.join` now consistently follow documented sort behavior
45+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
46+
47+
In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not
48+
always return a result that followed the documented sort behavior. pandas now
49+
follows the documented sort behavior in merge and join operations (:issue:`54611`).
50+
51+
As documented, ``sort=True`` sorts the join keys lexicographically in the resulting
52+
:class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the
53+
join type (``how`` keyword):
54+
55+
- ``how="left"``: preserve the order of the left keys
56+
- ``how="right"``: preserve the order of the right keys
57+
- ``how="inner"``: preserve the order of the left keys
58+
- ``how="outer"``: sort keys lexicographically
59+
60+
One example with changing behavior is inner joins with non-unique left join keys
61+
and ``sort=False``:
62+
63+
.. ipython:: python
64+
65+
left = pd.DataFrame({"a": [1, 2, 1]})
66+
right = pd.DataFrame({"a": [1, 2]})
67+
result = pd.merge(left, right, how="inner", on="a", sort=False)
68+
69+
*Old Behavior*
70+
71+
.. code-block:: ipython
72+
73+
In [5]: result
74+
Out[5]:
75+
a
76+
0 1
77+
1 1
78+
2 2
79+
80+
*New Behavior*
81+
82+
.. ipython:: python
83+
84+
result
4685
4786
.. _whatsnew_220.notable_bug_fixes.notable_bug_fix2:
4887

pandas/_libs/join.pyi

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ def inner_join(
66
left: np.ndarray, # const intp_t[:]
77
right: np.ndarray, # const intp_t[:]
88
max_groups: int,
9+
sort: bool = ...,
910
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
1011
def left_outer_join(
1112
left: np.ndarray, # const intp_t[:]

pandas/_libs/join.pyx

+15-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ from pandas._libs.dtypes cimport (
2323
@cython.wraparound(False)
2424
@cython.boundscheck(False)
2525
def inner_join(const intp_t[:] left, const intp_t[:] right,
26-
Py_ssize_t max_groups):
26+
Py_ssize_t max_groups, bint sort=True):
2727
cdef:
2828
Py_ssize_t i, j, k, count = 0
2929
intp_t[::1] left_sorter, right_sorter
@@ -70,7 +70,20 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
7070
_get_result_indexer(left_sorter, left_indexer)
7171
_get_result_indexer(right_sorter, right_indexer)
7272

73-
return np.asarray(left_indexer), np.asarray(right_indexer)
73+
if not sort:
74+
# if not asked to sort, revert to original order
75+
if len(left) == len(left_indexer):
76+
# no multiple matches for any row on the left
77+
# this is a short-cut to avoid groupsort_indexer
78+
# otherwise, the `else` path also works in this case
79+
rev = np.empty(len(left), dtype=np.intp)
80+
rev.put(np.asarray(left_sorter), np.arange(len(left)))
81+
else:
82+
rev, _ = groupsort_indexer(left_indexer, len(left))
83+
84+
return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev)
85+
else:
86+
return np.asarray(left_indexer), np.asarray(right_indexer)
7487

7588

7689
@cython.wraparound(False)

pandas/core/frame.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -418,10 +418,10 @@
418418
lkey value_x rkey value_y
419419
0 foo 1 foo 5
420420
1 foo 1 foo 8
421-
2 foo 5 foo 5
422-
3 foo 5 foo 8
423-
4 bar 2 bar 6
424-
5 baz 3 baz 7
421+
2 bar 2 bar 6
422+
3 baz 3 baz 7
423+
4 foo 5 foo 5
424+
5 foo 5 foo 8
425425
426426
Merge DataFrames df1 and df2 with specified left and right suffixes
427427
appended to any overlapping columns.
@@ -431,10 +431,10 @@
431431
lkey value_left rkey value_right
432432
0 foo 1 foo 5
433433
1 foo 1 foo 8
434-
2 foo 5 foo 5
435-
3 foo 5 foo 8
436-
4 bar 2 bar 6
437-
5 baz 3 baz 7
434+
2 bar 2 bar 6
435+
3 baz 3 baz 7
436+
4 foo 5 foo 5
437+
5 foo 5 foo 8
438438
439439
Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
440440
any overlapping columns.

pandas/core/indexes/base.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -4633,7 +4633,7 @@ def join(
46334633
_validate_join_method(how)
46344634

46354635
if not self.is_unique and not other.is_unique:
4636-
return self._join_non_unique(other, how=how)
4636+
return self._join_non_unique(other, how=how, sort=sort)
46374637
elif not self.is_unique or not other.is_unique:
46384638
if self.is_monotonic_increasing and other.is_monotonic_increasing:
46394639
# Note: 2023-08-15 we *do* have tests that get here with
@@ -4645,7 +4645,7 @@ def join(
46454645
# go through object dtype for ea till engine is supported properly
46464646
return self._join_monotonic(other, how=how)
46474647
else:
4648-
return self._join_non_unique(other, how=how)
4648+
return self._join_non_unique(other, how=how, sort=sort)
46494649
elif (
46504650
# GH48504: exclude MultiIndex to avoid going through MultiIndex._values
46514651
self.is_monotonic_increasing
@@ -4679,15 +4679,13 @@ def _join_via_get_indexer(
46794679
elif how == "right":
46804680
join_index = other
46814681
elif how == "inner":
4682-
# TODO: sort=False here for backwards compat. It may
4683-
# be better to use the sort parameter passed into join
4684-
join_index = self.intersection(other, sort=False)
4682+
join_index = self.intersection(other, sort=sort)
46854683
elif how == "outer":
46864684
# TODO: sort=True here for backwards compat. It may
46874685
# be better to use the sort parameter passed into join
46884686
join_index = self.union(other)
46894687

4690-
if sort:
4688+
if sort and how in ["left", "right"]:
46914689
join_index = join_index.sort_values()
46924690

46934691
if join_index is self:
@@ -4784,15 +4782,15 @@ def _join_multi(self, other: Index, how: JoinHow):
47844782

47854783
@final
47864784
def _join_non_unique(
4787-
self, other: Index, how: JoinHow = "left"
4785+
self, other: Index, how: JoinHow = "left", sort: bool = False
47884786
) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
47894787
from pandas.core.reshape.merge import get_join_indexers
47904788

47914789
# We only get here if dtypes match
47924790
assert self.dtype == other.dtype
47934791

47944792
left_idx, right_idx = get_join_indexers(
4795-
[self._values], [other._values], how=how, sort=True
4793+
[self._values], [other._values], how=how, sort=sort
47964794
)
47974795
mask = left_idx == -1
47984796

pandas/core/reshape/merge.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1679,6 +1679,9 @@ def get_join_indexers(
16791679
elif not sort and how in ["left", "outer"]:
16801680
return _get_no_sort_one_missing_indexer(left_n, False)
16811681

1682+
if not sort and how == "outer":
1683+
sort = True
1684+
16821685
# get left & right join labels and num. of levels at each location
16831686
mapped = (
16841687
_factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
@@ -1697,7 +1700,7 @@ def get_join_indexers(
16971700
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
16981701
# preserve left frame order if how == 'left' and sort == False
16991702
kwargs = {}
1700-
if how in ("left", "right"):
1703+
if how in ("inner", "left", "right"):
17011704
kwargs["sort"] = sort
17021705
join_func = {
17031706
"inner": libjoin.inner_join,

pandas/tests/extension/base/reshaping.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -236,9 +236,9 @@ def test_merge_on_extension_array_duplicates(self, data):
236236
result = pd.merge(df1, df2, on="key")
237237
expected = pd.DataFrame(
238238
{
239-
"key": key.take([0, 0, 0, 0, 1]),
240-
"val_x": [1, 1, 3, 3, 2],
241-
"val_y": [1, 3, 1, 3, 2],
239+
"key": key.take([0, 0, 1, 2, 2]),
240+
"val_x": [1, 1, 2, 3, 3],
241+
"val_y": [1, 3, 2, 1, 3],
242242
}
243243
)
244244
tm.assert_frame_equal(result, expected)

pandas/tests/indexes/numeric/test_join.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ def test_join_non_unique(self):
1111

1212
joined, lidx, ridx = left.join(left, return_indexers=True)
1313

14-
exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4])
14+
exp_joined = Index([4, 4, 4, 4, 3, 3, 3, 3])
1515
tm.assert_index_equal(joined, exp_joined)
1616

17-
exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp)
17+
exp_lidx = np.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=np.intp)
1818
tm.assert_numpy_array_equal(lidx, exp_lidx)
1919

20-
exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp)
20+
exp_ridx = np.array([0, 1, 0, 1, 2, 3, 2, 3], dtype=np.intp)
2121
tm.assert_numpy_array_equal(ridx, exp_ridx)
2222

2323
def test_join_inner(self):

pandas/tests/reshape/merge/test_merge.py

+89-12
Original file line numberDiff line numberDiff line change
@@ -1462,13 +1462,14 @@ def test_merge_readonly(self):
14621462

14631463
def _check_merge(x, y):
14641464
for how in ["inner", "left", "outer"]:
1465-
result = x.join(y, how=how)
1465+
for sort in [True, False]:
1466+
result = x.join(y, how=how, sort=sort)
14661467

1467-
expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True)
1468-
expected = expected.set_index("index")
1468+
expected = merge(x.reset_index(), y.reset_index(), how=how, sort=sort)
1469+
expected = expected.set_index("index")
14691470

1470-
# TODO check_names on merge?
1471-
tm.assert_frame_equal(result, expected, check_names=False)
1471+
# TODO check_names on merge?
1472+
tm.assert_frame_equal(result, expected, check_names=False)
14721473

14731474

14741475
class TestMergeDtypes:
@@ -1751,7 +1752,7 @@ def test_merge_string_dtype(self, how, expected_data, any_string_dtype):
17511752
"how, expected_data",
17521753
[
17531754
("inner", [[True, 1, 4], [False, 5, 3]]),
1754-
("outer", [[True, 1, 4], [False, 5, 3]]),
1755+
("outer", [[False, 5, 3], [True, 1, 4]]),
17551756
("left", [[True, 1, 4], [False, 5, 3]]),
17561757
("right", [[False, 5, 3], [True, 1, 4]]),
17571758
],
@@ -2331,9 +2332,9 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols):
23312332
"outer",
23322333
DataFrame(
23332334
{
2334-
"A": [100, 200, 1, 300],
2335-
"B1": [60, 70, 80, np.nan],
2336-
"B2": [600, 700, np.nan, 800],
2335+
"A": [1, 100, 200, 300],
2336+
"B1": [80, 60, 70, np.nan],
2337+
"B2": [np.nan, 600, 700, 800],
23372338
}
23382339
),
23392340
),
@@ -2752,9 +2753,9 @@ def test_merge_outer_with_NaN(dtype):
27522753
result = merge(right, left, on="key", how="outer")
27532754
expected = DataFrame(
27542755
{
2755-
"key": [np.nan, np.nan, 1, 2],
2756-
"col2": [3, 4, np.nan, np.nan],
2757-
"col1": [np.nan, np.nan, 1, 2],
2756+
"key": [1, 2, np.nan, np.nan],
2757+
"col2": [np.nan, np.nan, 3, 4],
2758+
"col1": [1, 2, np.nan, np.nan],
27582759
},
27592760
dtype=dtype,
27602761
)
@@ -2847,3 +2848,79 @@ def test_merge_multiindex_single_level():
28472848

28482849
result = df.merge(df2, left_on=["col"], right_index=True, how="left")
28492850
tm.assert_frame_equal(result, expected)
2851+
2852+
2853+
@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"])
2854+
@pytest.mark.parametrize("sort", [True, False])
2855+
@pytest.mark.parametrize("on_index", [True, False])
2856+
@pytest.mark.parametrize("left_unique", [True, False])
2857+
@pytest.mark.parametrize("left_monotonic", [True, False])
2858+
@pytest.mark.parametrize("right_unique", [True, False])
2859+
@pytest.mark.parametrize("right_monotonic", [True, False])
2860+
def test_merge_combinations(
2861+
how, sort, on_index, left_unique, left_monotonic, right_unique, right_monotonic
2862+
):
2863+
# GH 54611
2864+
left = [2, 3]
2865+
if left_unique:
2866+
left.append(4 if left_monotonic else 1)
2867+
else:
2868+
left.append(3 if left_monotonic else 2)
2869+
2870+
right = [2, 3]
2871+
if right_unique:
2872+
right.append(4 if right_monotonic else 1)
2873+
else:
2874+
right.append(3 if right_monotonic else 2)
2875+
2876+
left = DataFrame({"key": left})
2877+
right = DataFrame({"key": right})
2878+
2879+
if on_index:
2880+
left = left.set_index("key")
2881+
right = right.set_index("key")
2882+
on_kwargs = {"left_index": True, "right_index": True}
2883+
else:
2884+
on_kwargs = {"on": "key"}
2885+
2886+
result = merge(left, right, how=how, sort=sort, **on_kwargs)
2887+
2888+
if on_index:
2889+
left = left.reset_index()
2890+
right = right.reset_index()
2891+
2892+
if how in ["left", "right", "inner"]:
2893+
if how in ["left", "inner"]:
2894+
expected, other, other_unique = left, right, right_unique
2895+
else:
2896+
expected, other, other_unique = right, left, left_unique
2897+
if how == "inner":
2898+
keep_values = set(left["key"].values).intersection(right["key"].values)
2899+
keep_mask = expected["key"].isin(keep_values)
2900+
expected = expected[keep_mask]
2901+
if sort:
2902+
expected = expected.sort_values("key")
2903+
if not other_unique:
2904+
other_value_counts = other["key"].value_counts()
2905+
repeats = other_value_counts.reindex(expected["key"].values, fill_value=1)
2906+
repeats = repeats.astype(np.intp)
2907+
expected = expected["key"].repeat(repeats.values)
2908+
expected = expected.to_frame()
2909+
elif how == "outer":
2910+
if on_index and left_unique and left["key"].equals(right["key"]):
2911+
expected = DataFrame({"key": left["key"]})
2912+
else:
2913+
left_counts = left["key"].value_counts()
2914+
right_counts = right["key"].value_counts()
2915+
expected_counts = left_counts.mul(right_counts, fill_value=1)
2916+
expected_counts = expected_counts.astype(np.intp)
2917+
expected = expected_counts.index.values.repeat(expected_counts.values)
2918+
expected = DataFrame({"key": expected})
2919+
expected = expected.sort_values("key")
2920+
2921+
if on_index:
2922+
expected = expected.set_index("key")
2923+
else:
2924+
expected = expected.reset_index(drop=True)
2925+
2926+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)