BUG: merge not always following documented sort behavior (#54611)

lukemanley · web-flow · commit f9f164339c48 · 2023-08-22T17:30:45.000-07:00
* BUG: merge producing inconsistent sort behavior

* whatsnew

* fix more tests

* fix test

* fix test

* move whatsnew to notable bug fixes
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -39,10 +39,49 @@ Notable bug fixes
 
 These are bug fixes that might have notable behavior changes.
 
-.. _whatsnew_220.notable_bug_fixes.notable_bug_fix1:
+.. _whatsnew_220.notable_bug_fixes.merge_sort_behavior:
 
-notable_bug_fix1
-^^^^^^^^^^^^^^^^
+:func:`merge` and :meth:`DataFrame.join` now consistently follow documented sort behavior
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not
+always return a result that followed the documented sort behavior. pandas now
+follows the documented sort behavior in merge and join operations (:issue:`54611`).
+
+As documented, ``sort=True`` sorts the join keys lexicographically in the resulting
+:class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the
+join type (``how`` keyword):
+
+- ``how="left"``: preserve the order of the left keys
+- ``how="right"``: preserve the order of the right keys
+- ``how="inner"``: preserve the order of the left keys
+- ``how="outer"``: sort keys lexicographically
+
+One example with changing behavior is inner joins with non-unique left join keys
+and ``sort=False``:
+
+.. ipython:: python
+
+    left = pd.DataFrame({"a": [1, 2, 1]})
+    right = pd.DataFrame({"a": [1, 2]})
+    result = pd.merge(left, right, how="inner", on="a", sort=False)
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [5]: result
+    Out[5]:
+       a
+    0  1
+    1  1
+    2  2
+
+*New Behavior*
+
+.. ipython:: python
+
+    result
 
 .. _whatsnew_220.notable_bug_fixes.notable_bug_fix2:
 
diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi
@@ -6,6 +6,7 @@ def inner_join(
     left: np.ndarray,  # const intp_t[:]
     right: np.ndarray,  # const intp_t[:]
     max_groups: int,
+    sort: bool = ...,
 ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
 def left_outer_join(
     left: np.ndarray,  # const intp_t[:]
diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx
@@ -23,7 +23,7 @@ from pandas._libs.dtypes cimport (
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def inner_join(const intp_t[:] left, const intp_t[:] right,
-               Py_ssize_t max_groups):
+               Py_ssize_t max_groups, bint sort=True):
     cdef:
         Py_ssize_t i, j, k, count = 0
         intp_t[::1] left_sorter, right_sorter
@@ -70,7 +70,20 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
         _get_result_indexer(left_sorter, left_indexer)
         _get_result_indexer(right_sorter, right_indexer)
 
-    return np.asarray(left_indexer), np.asarray(right_indexer)
+    if not sort:
+        # if not asked to sort, revert to original order
+        if len(left) == len(left_indexer):
+            # no multiple matches for any row on the left
+            # this is a short-cut to avoid groupsort_indexer
+            # otherwise, the `else` path also works in this case
+            rev = np.empty(len(left), dtype=np.intp)
+            rev.put(np.asarray(left_sorter), np.arange(len(left)))
+        else:
+            rev, _ = groupsort_indexer(left_indexer, len(left))
+
+        return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev)
+    else:
+        return np.asarray(left_indexer), np.asarray(right_indexer)
 
 
 @cython.wraparound(False)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -418,10 +418,10 @@
   lkey  value_x rkey  value_y
 0  foo        1  foo        5
 1  foo        1  foo        8
-2  foo        5  foo        5
-3  foo        5  foo        8
-4  bar        2  bar        6
-5  baz        3  baz        7
+2  bar        2  bar        6
+3  baz        3  baz        7
+4  foo        5  foo        5
+5  foo        5  foo        8
 
 Merge DataFrames df1 and df2 with specified left and right suffixes
 appended to any overlapping columns.
@@ -431,10 +431,10 @@
   lkey  value_left rkey  value_right
 0  foo           1  foo            5
 1  foo           1  foo            8
-2  foo           5  foo            5
-3  foo           5  foo            8
-4  bar           2  bar            6
-5  baz           3  baz            7
+2  bar           2  bar            6
+3  baz           3  baz            7
+4  foo           5  foo            5
+5  foo           5  foo            8
 
 Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
 any overlapping columns.
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4633,7 +4633,7 @@ def join(
         _validate_join_method(how)
 
         if not self.is_unique and not other.is_unique:
-            return self._join_non_unique(other, how=how)
+            return self._join_non_unique(other, how=how, sort=sort)
         elif not self.is_unique or not other.is_unique:
             if self.is_monotonic_increasing and other.is_monotonic_increasing:
                 # Note: 2023-08-15 we *do* have tests that get here with
@@ -4645,7 +4645,7 @@ def join(
                     # go through object dtype for ea till engine is supported properly
                     return self._join_monotonic(other, how=how)
             else:
-                return self._join_non_unique(other, how=how)
+                return self._join_non_unique(other, how=how, sort=sort)
         elif (
             # GH48504: exclude MultiIndex to avoid going through MultiIndex._values
             self.is_monotonic_increasing
@@ -4679,15 +4679,13 @@ def _join_via_get_indexer(
         elif how == "right":
             join_index = other
         elif how == "inner":
-            # TODO: sort=False here for backwards compat. It may
-            # be better to use the sort parameter passed into join
-            join_index = self.intersection(other, sort=False)
+            join_index = self.intersection(other, sort=sort)
         elif how == "outer":
             # TODO: sort=True here for backwards compat. It may
             # be better to use the sort parameter passed into join
             join_index = self.union(other)
 
-        if sort:
+        if sort and how in ["left", "right"]:
             join_index = join_index.sort_values()
 
         if join_index is self:
@@ -4784,15 +4782,15 @@ def _join_multi(self, other: Index, how: JoinHow):
 
     @final
     def _join_non_unique(
-        self, other: Index, how: JoinHow = "left"
+        self, other: Index, how: JoinHow = "left", sort: bool = False
     ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
         from pandas.core.reshape.merge import get_join_indexers
 
         # We only get here if dtypes match
         assert self.dtype == other.dtype
 
         left_idx, right_idx = get_join_indexers(
-            [self._values], [other._values], how=how, sort=True
+            [self._values], [other._values], how=how, sort=sort
         )
         mask = left_idx == -1
 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -1679,6 +1679,9 @@ def get_join_indexers(
         elif not sort and how in ["left", "outer"]:
             return _get_no_sort_one_missing_indexer(left_n, False)
 
+    if not sort and how == "outer":
+        sort = True
+
     # get left & right join labels and num. of levels at each location
     mapped = (
         _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
@@ -1697,7 +1700,7 @@ def get_join_indexers(
     lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
     # preserve left frame order if how == 'left' and sort == False
     kwargs = {}
-    if how in ("left", "right"):
+    if how in ("inner", "left", "right"):
         kwargs["sort"] = sort
     join_func = {
         "inner": libjoin.inner_join,
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
@@ -236,9 +236,9 @@ def test_merge_on_extension_array_duplicates(self, data):
         result = pd.merge(df1, df2, on="key")
         expected = pd.DataFrame(
             {
-                "key": key.take([0, 0, 0, 0, 1]),
-                "val_x": [1, 1, 3, 3, 2],
-                "val_y": [1, 3, 1, 3, 2],
+                "key": key.take([0, 0, 1, 2, 2]),
+                "val_x": [1, 1, 2, 3, 3],
+                "val_y": [1, 3, 2, 1, 3],
             }
         )
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexes/numeric/test_join.py b/pandas/tests/indexes/numeric/test_join.py
@@ -11,13 +11,13 @@ def test_join_non_unique(self):
 
         joined, lidx, ridx = left.join(left, return_indexers=True)
 
-        exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4])
+        exp_joined = Index([4, 4, 4, 4, 3, 3, 3, 3])
         tm.assert_index_equal(joined, exp_joined)
 
-        exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp)
+        exp_lidx = np.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=np.intp)
         tm.assert_numpy_array_equal(lidx, exp_lidx)
 
-        exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp)
+        exp_ridx = np.array([0, 1, 0, 1, 2, 3, 2, 3], dtype=np.intp)
         tm.assert_numpy_array_equal(ridx, exp_ridx)
 
     def test_join_inner(self):
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -1462,13 +1462,14 @@ def test_merge_readonly(self):
 
 def _check_merge(x, y):
     for how in ["inner", "left", "outer"]:
-        result = x.join(y, how=how)
+        for sort in [True, False]:
+            result = x.join(y, how=how, sort=sort)
 
-        expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True)
-        expected = expected.set_index("index")
+            expected = merge(x.reset_index(), y.reset_index(), how=how, sort=sort)
+            expected = expected.set_index("index")
 
-        # TODO check_names on merge?
-        tm.assert_frame_equal(result, expected, check_names=False)
+            # TODO check_names on merge?
+            tm.assert_frame_equal(result, expected, check_names=False)
 
 
 class TestMergeDtypes:
@@ -1751,7 +1752,7 @@ def test_merge_string_dtype(self, how, expected_data, any_string_dtype):
         "how, expected_data",
         [
             ("inner", [[True, 1, 4], [False, 5, 3]]),
-            ("outer", [[True, 1, 4], [False, 5, 3]]),
+            ("outer", [[False, 5, 3], [True, 1, 4]]),
             ("left", [[True, 1, 4], [False, 5, 3]]),
             ("right", [[False, 5, 3], [True, 1, 4]]),
         ],
@@ -2331,9 +2332,9 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols):
             "outer",
             DataFrame(
                 {
-                    "A": [100, 200, 1, 300],
-                    "B1": [60, 70, 80, np.nan],
-                    "B2": [600, 700, np.nan, 800],
+                    "A": [1, 100, 200, 300],
+                    "B1": [80, 60, 70, np.nan],
+                    "B2": [np.nan, 600, 700, 800],
                 }
             ),
         ),
@@ -2752,9 +2753,9 @@ def test_merge_outer_with_NaN(dtype):
     result = merge(right, left, on="key", how="outer")
     expected = DataFrame(
         {
-            "key": [np.nan, np.nan, 1, 2],
-            "col2": [3, 4, np.nan, np.nan],
-            "col1": [np.nan, np.nan, 1, 2],
+            "key": [1, 2, np.nan, np.nan],
+            "col2": [np.nan, np.nan, 3, 4],
+            "col1": [1, 2, np.nan, np.nan],
         },
         dtype=dtype,
     )
@@ -2847,3 +2848,79 @@ def test_merge_multiindex_single_level():
 
     result = df.merge(df2, left_on=["col"], right_index=True, how="left")
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"])
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("on_index", [True, False])
+@pytest.mark.parametrize("left_unique", [True, False])
+@pytest.mark.parametrize("left_monotonic", [True, False])
+@pytest.mark.parametrize("right_unique", [True, False])
+@pytest.mark.parametrize("right_monotonic", [True, False])
+def test_merge_combinations(
+    how, sort, on_index, left_unique, left_monotonic, right_unique, right_monotonic
+):
+    # GH 54611
+    left = [2, 3]
+    if left_unique:
+        left.append(4 if left_monotonic else 1)
+    else:
+        left.append(3 if left_monotonic else 2)
+
+    right = [2, 3]
+    if right_unique:
+        right.append(4 if right_monotonic else 1)
+    else:
+        right.append(3 if right_monotonic else 2)
+
+    left = DataFrame({"key": left})
+    right = DataFrame({"key": right})
+
+    if on_index:
+        left = left.set_index("key")
+        right = right.set_index("key")
+        on_kwargs = {"left_index": True, "right_index": True}
+    else:
+        on_kwargs = {"on": "key"}
+
+    result = merge(left, right, how=how, sort=sort, **on_kwargs)
+
+    if on_index:
+        left = left.reset_index()
+        right = right.reset_index()
+
+    if how in ["left", "right", "inner"]:
+        if how in ["left", "inner"]:
+            expected, other, other_unique = left, right, right_unique
+        else:
+            expected, other, other_unique = right, left, left_unique
+        if how == "inner":
+            keep_values = set(left["key"].values).intersection(right["key"].values)
+            keep_mask = expected["key"].isin(keep_values)
+            expected = expected[keep_mask]
+        if sort:
+            expected = expected.sort_values("key")
+        if not other_unique:
+            other_value_counts = other["key"].value_counts()
+            repeats = other_value_counts.reindex(expected["key"].values, fill_value=1)
+            repeats = repeats.astype(np.intp)
+            expected = expected["key"].repeat(repeats.values)
+            expected = expected.to_frame()
+    elif how == "outer":
+        if on_index and left_unique and left["key"].equals(right["key"]):
+            expected = DataFrame({"key": left["key"]})
+        else:
+            left_counts = left["key"].value_counts()
+            right_counts = right["key"].value_counts()
+            expected_counts = left_counts.mul(right_counts, fill_value=1)
+            expected_counts = expected_counts.astype(np.intp)
+            expected = expected_counts.index.values.repeat(expected_counts.values)
+            expected = DataFrame({"key": expected})
+            expected = expected.sort_values("key")
+
+    if on_index:
+        expected = expected.set_index("key")
+    else:
+        expected = expected.reset_index(drop=True)
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py

Original file line number	Diff line number	Diff line change
`@@ -236,9 +236,9 @@ def test_merge_on_extension_array_duplicates(self, data):`
`236`	`236`	`result = pd.merge(df1, df2, on="key")`
`237`	`237`	`expected = pd.DataFrame(`
`238`	`238`	`{`
`239`		`- "key": key.take([0, 0, 0, 0, 1]),`
`240`		`- "val_x": [1, 1, 3, 3, 2],`
`241`		`- "val_y": [1, 3, 1, 3, 2],`
	`239`	`+ "key": key.take([0, 0, 1, 2, 2]),`
	`240`	`+ "val_x": [1, 1, 2, 3, 3],`
	`241`	`+ "val_y": [1, 3, 2, 1, 3],`
`242`	`242`	`}`
`243`	`243`	`)`
`244`	`244`	`tm.assert_frame_equal(result, expected)`