Skip to content

Commit 7986bc2

Browse files
ENH: Enable lazy copy in merge() for CoW (#51297)
1 parent d04f9c5 commit 7986bc2

File tree

2 files changed: 67 additions and 4 deletions.

pandas/core/reshape/merge.py

+4 -4
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,7 @@ def merge(
134134
right_index: bool = False,
135135
sort: bool = False,
136136
suffixes: Suffixes = ("_x", "_y"),
137-
copy: bool = True,
137+
copy: bool | None = None,
138138
indicator: str | bool = False,
139139
validate: str | None = None,
140140
) -> DataFrame:
@@ -744,7 +744,7 @@ def _reindex_and_concat(
744744
join_index: Index,
745745
left_indexer: npt.NDArray[np.intp] | None,
746746
right_indexer: npt.NDArray[np.intp] | None,
747-
copy: bool,
747+
copy: bool | None,
748748
) -> DataFrame:
749749
"""
750750
reindex along index and concat along columns.
@@ -793,7 +793,7 @@ def _reindex_and_concat(
793793
result = concat([left, right], axis=1, copy=copy)
794794
return result
795795

796-
def get_result(self, copy: bool = True) -> DataFrame:
796+
def get_result(self, copy: bool | None = True) -> DataFrame:
797797
if self.indicator:
798798
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
799799

@@ -1800,7 +1800,7 @@ def __init__(
18001800
sort=True, # factorize sorts
18011801
)
18021802

1803-
def get_result(self, copy: bool = True) -> DataFrame:
1803+
def get_result(self, copy: bool | None = True) -> DataFrame:
18041804
join_index, left_indexer, right_indexer = self._get_join_info()
18051805

18061806
llabels, rlabels = _items_overlap_with_suffix(

pandas/tests/copy_view/test_functions.py

+63
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,12 @@
11
import numpy as np
22

3+
import pandas.util._test_decorators as td
4+
35
from pandas import (
46
DataFrame,
57
Series,
68
concat,
9+
merge,
710
)
811
import pandas._testing as tm
912
from pandas.tests.copy_view.util import get_array
@@ -177,3 +180,63 @@ def test_concat_mixed_series_frame(using_copy_on_write):
177180
if using_copy_on_write:
178181
assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
179182
tm.assert_frame_equal(result, expected)
183+
184+
185+
@td.skip_copy_on_write_not_yet_implemented # TODO(CoW)
186+
def test_merge_on_key(using_copy_on_write):
187+
df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
188+
df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]})
189+
df1_orig = df1.copy()
190+
df2_orig = df2.copy()
191+
192+
result = merge(df1, df2, on="key")
193+
194+
if using_copy_on_write:
195+
assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
196+
assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
197+
assert not np.shares_memory(get_array(result, "key"), get_array(df1, "key"))
198+
assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))
199+
else:
200+
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
201+
assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
202+
203+
result.iloc[0, 1] = 0
204+
if using_copy_on_write:
205+
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
206+
assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
207+
208+
result.iloc[0, 2] = 0
209+
if using_copy_on_write:
210+
assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
211+
tm.assert_frame_equal(df1, df1_orig)
212+
tm.assert_frame_equal(df2, df2_orig)
213+
214+
215+
def test_merge_on_index(using_copy_on_write):
216+
df1 = DataFrame({"a": [1, 2, 3]})
217+
df2 = DataFrame({"b": [4, 5, 6]})
218+
df1_orig = df1.copy()
219+
df2_orig = df2.copy()
220+
221+
result = merge(df1, df2, left_index=True, right_index=True)
222+
223+
if using_copy_on_write:
224+
assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
225+
assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
226+
else:
227+
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
228+
assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
229+
230+
result.iloc[0, 0] = 0
231+
if using_copy_on_write:
232+
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
233+
assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
234+
235+
result.iloc[0, 1] = 0
236+
if using_copy_on_write:
237+
assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
238+
tm.assert_frame_equal(df1, df1_orig)
239+
tm.assert_frame_equal(df2, df2_orig)
240+
241+
242+
# TODO(CoW) add merge tests where one of left/right isn't copied

0 commit comments

Comments
 (0)