From 16b9642c79e28846dfcb92400ca3b951fdac4fb2 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 21 Nov 2023 20:13:54 -0500 Subject: [PATCH] outer join on equal indexes to sort by default --- pandas/core/indexes/base.py | 3 +-- pandas/tests/indexes/datetimes/test_join.py | 10 +++++++--- pandas/tests/indexes/multi/test_join.py | 15 +++++++++++---- pandas/tests/indexes/period/test_join.py | 5 +++-- pandas/tests/indexes/test_base.py | 7 +++++-- pandas/tests/indexes/test_old_base.py | 6 +++++- pandas/tests/indexes/timedeltas/test_join.py | 3 ++- pandas/tests/reshape/merge/test_merge.py | 17 +++++++---------- 8 files changed, 41 insertions(+), 25 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cbf7dc5ba67d2..619ad3f4921b0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4653,9 +4653,8 @@ def _join_via_get_indexer( elif how == "inner": join_index = self.intersection(other, sort=sort) elif how == "outer": - # TODO: sort=True here for backwards compat. It may - # be better to use the sort parameter passed into join join_index = self.union(other) + join_index = _maybe_try_sort(join_index, sort=None) if sort and how in ["left", "right"]: join_index = join_index.sort_values() diff --git a/pandas/tests/indexes/datetimes/test_join.py b/pandas/tests/indexes/datetimes/test_join.py index 959fbab0dcec6..09e28ce239370 100644 --- a/pandas/tests/indexes/datetimes/test_join.py +++ b/pandas/tests/indexes/datetimes/test_join.py @@ -32,7 +32,8 @@ def test_does_not_convert_mixed_integer(self): r_idx_type="i", c_idx_type="dt", ) - cols = df.columns.join(df.index, how="outer") + with tm.assert_produces_warning(RuntimeWarning): + cols = df.columns.join(df.index, how="outer") joined = cols.join(df.columns) assert cols.dtype == np.dtype("O") assert cols.dtype == joined.dtype @@ -53,8 +54,11 @@ def test_join_with_period_index(self, join_type): ) s = df.iloc[:5, 0] - expected = df.columns.astype("O").join(s.index, how=join_type) - result = df.columns.join(s.index, how=join_type) + warning = RuntimeWarning if join_type == "outer" else None + with tm.assert_produces_warning(warning): + expected = df.columns.astype("O").join(s.index, how=join_type) + with tm.assert_produces_warning(warning): + result = df.columns.join(s.index, how=join_type) tm.assert_index_equal(expected, result) def test_join_object_index(self): diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 700af142958b3..e9d42ded007b3 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -51,8 +51,11 @@ def test_join_level_corner_case(idx): def test_join_self(idx, join_type): - joined = idx.join(idx, how=join_type) - tm.assert_index_equal(joined, idx) + result = idx.join(idx, how=join_type) + expected = idx + if join_type == "outer": + expected = expected.sort_values() + tm.assert_index_equal(result, expected) def test_join_multi(): @@ -91,8 +94,12 @@ def test_join_multi(): def test_join_self_unique(idx, join_type): if idx.is_unique: - joined = idx.join(idx, how=join_type) - assert (idx == joined).all() + result = idx.join(idx, how=join_type) + if join_type == "outer": + expected = idx.sort_values() + else: + expected = idx + tm.assert_index_equal(result, expected) def test_join_multi_wrong_order(): diff --git a/pandas/tests/indexes/period/test_join.py b/pandas/tests/indexes/period/test_join.py index 191dba2be0c5d..0f7ce99c3c430 100644 --- a/pandas/tests/indexes/period/test_join.py +++ b/pandas/tests/indexes/period/test_join.py @@ -44,11 +44,12 @@ def test_join_does_not_recur(self): ) ser = df.iloc[:2, 0] - res = ser.index.join(df.columns, how="outer") + with tm.assert_produces_warning(RuntimeWarning): + result = ser.index.join(df.columns, how="outer") expected = Index( [ser.index[0], ser.index[1], df.columns[0], df.columns[1]], object ) - tm.assert_index_equal(res, expected) + tm.assert_index_equal(result, expected) def test_join_mismatched_freq_raises(self): index = period_range("1/1/2000", "1/20/2000", freq="D") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index dc624f0271a73..f69707914c06a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -979,8 +979,11 @@ def test_slice_keep_name(self): indirect=True, ) def test_join_self(self, index, join_type): - joined = index.join(index, how=join_type) - assert index is joined + result = index.join(index, how=join_type) + expected = index + if join_type == "outer": + expected = expected.sort_values() + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("method", ["strip", "rstrip", "lstrip"]) def test_str_attribute(self, method): diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index f08de8e65451c..056d4bbf6f4c4 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -30,6 +30,7 @@ period_range, ) import pandas._testing as tm +import pandas.core.algorithms as algos from pandas.core.arrays import BaseMaskedArray @@ -646,7 +647,10 @@ def test_join_self_unique(self, join_type, simple_index): idx = simple_index if idx.is_unique: joined = idx.join(idx, how=join_type) - assert (idx == joined).all() + expected = simple_index + if join_type == "outer": + expected = algos.safe_sort(expected) + tm.assert_index_equal(joined, expected) def test_map(self, simple_index): # callable diff --git a/pandas/tests/indexes/timedeltas/test_join.py b/pandas/tests/indexes/timedeltas/test_join.py index 89579d0c86f20..caf2b3bc7de6a 100644 --- a/pandas/tests/indexes/timedeltas/test_join.py +++ b/pandas/tests/indexes/timedeltas/test_join.py @@ -35,7 +35,8 @@ def test_does_not_convert_mixed_integer(self): c_idx_type="td", ) - cols = df.columns.join(df.index, how="outer") + with tm.assert_produces_warning(RuntimeWarning): + cols = df.columns.join(df.index, how="outer") joined = cols.join(df.columns) assert cols.dtype == np.dtype("O") assert cols.dtype == joined.dtype diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7538894bbf1c9..1ba15678da19a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2913,16 +2913,13 @@ def test_merge_combinations( expected = expected["key"].repeat(repeats.values) expected = expected.to_frame() elif how == "outer": - if on_index and left_unique and left["key"].equals(right["key"]): - expected = DataFrame({"key": left["key"]}) - else: - left_counts = left["key"].value_counts() - right_counts = right["key"].value_counts() - expected_counts = left_counts.mul(right_counts, fill_value=1) - expected_counts = expected_counts.astype(np.intp) - expected = expected_counts.index.values.repeat(expected_counts.values) - expected = DataFrame({"key": expected}) - expected = expected.sort_values("key") + left_counts = left["key"].value_counts() + right_counts = right["key"].value_counts() + expected_counts = left_counts.mul(right_counts, fill_value=1) + expected_counts = expected_counts.astype(np.intp) + expected = expected_counts.index.values.repeat(expected_counts.values) + expected = DataFrame({"key": expected}) + expected = expected.sort_values("key") if on_index: expected = expected.set_index("key")