From 89effc06e8c48b8f0c86075cb6b5294adaac7712 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Mon, 30 Nov 2020 09:26:40 +0800 Subject: [PATCH 1/5] fix-merge_ordered --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/core/reshape/merge.py | 15 ++++++++---- .../tests/reshape/merge/test_merge_ordered.py | 24 +++++++++++++++++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3fab4850dd1ec..68a5641aec5e1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -756,6 +756,8 @@ Reshaping - Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`) - Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`) - Bug in :func:`merge_ordered` couldn't handle list-like ``left_by`` or ``right_by`` (:issue:`35269`) +- Bug in :func:`merge_ordered` returning an incorrect join result when the length of ``left_by`` or ``right_by`` equals the number of rows in ``left`` or ``right`` (:issue:`38166`) +- Bug in :func:`merge_ordered` not raising when elements of ``left_by`` or ``right_by`` do not exist in the columns of ``left`` or ``right`` (:issue:`38167`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 545117dd84f93..211581fb691d2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -114,11 +114,8 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # if we can groupby the rhs # then we can get vastly better perf - - try: + if all([item in right.columns for item in by]): rby = right.groupby(by, sort=False) - except KeyError: - pass for key, lhs in lby: @@ -274,10 +271,20 @@ def _merger(x, y): if left_by is not None and right_by is not None: raise ValueError("Can only group either left or right frames") elif left_by is not
None: + if isinstance(left_by, str): + left_by = [left_by] + check = list(filter(lambda i: i not in left.columns, left_by)) + if len(check) != 0: + raise KeyError(f"{check} not found in left columns") result, _ = _groupby_and_merge( left_by, on, left, right, lambda x, y: _merger(x, y) ) elif right_by is not None: + if isinstance(right_by, str): + right_by = [right_by] + check = list(filter(lambda i: i not in right.columns, right_by)) + if len(check) != 0: + raise KeyError(f"{check} not found in right columns") result, _ = _groupby_and_merge( right_by, on, right, left, lambda x, y: _merger(y, x) ) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 8389a6bb9be10..b5bec19cb2243 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -177,3 +177,27 @@ def test_list_type_by(self, left, right, on, left_by, right_by, expected): ) tm.assert_frame_equal(result, expected) + + def test_left_by_length_equals_to_right_shape0(self): + # GH 38166 + l = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) + r = DataFrame([[2, 1]], columns=list("TE")) + result = merge_ordered(l, r, on="T", left_by=["G", "H"]) + expected = DataFrame( + { + "G": ["g"] * 3, + "H": ["h"] * 3, + "T": [1, 2, 3], + "E": [np.nan, 1.0, np.nan] + } + ) + + tm.assert_frame_equal(result, expected) + + def test_elements_not_in_by_but_in_df(self): + # GH 38167 + l = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) + r = DataFrame([[2, 1]], columns=list("TE")) + msg = r"\['h'\] not found in left columns" + with pytest.raises(KeyError, match=msg): + merge_ordered(l, r, on="T", left_by=["G", "h"]) From 11fb77747b2810fa9c893626b77f05d97212a69e Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Mon, 30 Nov 2020 09:29:47 +0800 Subject: [PATCH 2/5] Update test_merge_ordered.py --- pandas/tests/reshape/merge/test_merge_ordered.py | 12 ++++++------ 1 
file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index b5bec19cb2243..a22a5c29a50c4 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -180,9 +180,9 @@ def test_list_type_by(self, left, right, on, left_by, right_by, expected): def test_left_by_length_equals_to_right_shape0(self): # GH 38166 - l = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) - r = DataFrame([[2, 1]], columns=list("TE")) - result = merge_ordered(l, r, on="T", left_by=["G", "H"]) + left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) + right = DataFrame([[2, 1]], columns=list("TE")) + result = merge_ordered(left, right, on="T", left_by=["G", "H"]) expected = DataFrame( { "G": ["g"] * 3, @@ -196,8 +196,8 @@ def test_left_by_length_equals_to_right_shape0(self): def test_elements_not_in_by_but_in_df(self): # GH 38167 - l = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) - r = DataFrame([[2, 1]], columns=list("TE")) + left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) + right = DataFrame([[2, 1]], columns=list("TE")) msg = r"\['h'\] not found in left columns" with pytest.raises(KeyError, match=msg): - merge_ordered(l, r, on="T", left_by=["G", "h"]) + merge_ordered(left, right, on="T", left_by=["G", "h"]) From bae61e2d7c1d80fe1ddb821403d77d1efed70c6a Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Mon, 30 Nov 2020 09:37:13 +0800 Subject: [PATCH 3/5] Update test_merge_ordered.py --- pandas/tests/reshape/merge/test_merge_ordered.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index a22a5c29a50c4..faf6cf2835f2a 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py 
@@ -185,10 +185,7 @@ def test_left_by_length_equals_to_right_shape0(self): result = merge_ordered(left, right, on="T", left_by=["G", "H"]) expected = DataFrame( { - "G": ["g"] * 3, - "H": ["h"] * 3, - "T": [1, 2, 3], - "E": [np.nan, 1.0, np.nan] + {"G": ["g"] * 3, "H": ["h"] * 3, "T": [1, 2, 3], "E": [np.nan, 1.0, np.nan]} } ) From 8220203c938c6478cc3db5248e96493f1487eaa8 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Mon, 30 Nov 2020 09:47:55 +0800 Subject: [PATCH 4/5] Update test_merge_ordered.py --- pandas/tests/reshape/merge/test_merge_ordered.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index faf6cf2835f2a..37664a3a931cb 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -184,9 +184,7 @@ def test_left_by_length_equals_to_right_shape0(self): right = DataFrame([[2, 1]], columns=list("TE")) result = merge_ordered(left, right, on="T", left_by=["G", "H"]) expected = DataFrame( - { - {"G": ["g"] * 3, "H": ["h"] * 3, "T": [1, 2, 3], "E": [np.nan, 1.0, np.nan]} - } + {"G": ["g"] * 3, "H": ["h"] * 3, "T": [1, 2, 3], "E": [np.nan, 1.0, np.nan]} ) tm.assert_frame_equal(result, expected) From af2251130c9f3df5bdad58537d0f3e64298f5ee6 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Mon, 30 Nov 2020 11:51:41 +0800 Subject: [PATCH 5/5] fix ci & pre --- pandas/core/reshape/merge.py | 6 +++--- pandas/tests/reshape/merge/test_merge_ordered.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 211581fb691d2..2c6cdb846221f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -114,7 +114,7 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # if we can groupby the rhs # then we can get vastly better perf - if 
all([item in right.columns for item in by]): + if all(item in right.columns for item in by): rby = right.groupby(by, sort=False) for key, lhs in lby: @@ -273,7 +273,7 @@ def _merger(x, y): elif left_by is not None: if isinstance(left_by, str): left_by = [left_by] - check = list(filter(lambda i: i not in left.columns, left_by)) + check = set(left_by).difference(left.columns) if len(check) != 0: raise KeyError(f"{check} not found in left columns") result, _ = _groupby_and_merge( @@ -282,7 +282,7 @@ def _merger(x, y): elif right_by is not None: if isinstance(right_by, str): right_by = [right_by] - check = list(filter(lambda i: i not in right.columns, right_by)) + check = set(right_by).difference(right.columns) if len(check) != 0: raise KeyError(f"{check} not found in right columns") result, _ = _groupby_and_merge( diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 37664a3a931cb..4a70719df5c57 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -193,6 +193,6 @@ def test_elements_not_in_by_but_in_df(self): # GH 38167 left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) right = DataFrame([[2, 1]], columns=list("TE")) - msg = r"\['h'\] not found in left columns" + msg = r"\{'h'\} not found in left columns" with pytest.raises(KeyError, match=msg): merge_ordered(left, right, on="T", left_by=["G", "h"])