From 24f624d465c5dbedfa8832e3be37f6faf4634e46 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 4 Nov 2022 15:13:00 +0100 Subject: [PATCH 1/4] API / CoW: constructing Series from Series creates lazy copy (with default copy=False) --- pandas/core/series.py | 3 +- pandas/tests/copy_view/test_constructors.py | 75 +++++++++++++++++++++ pandas/tests/copy_view/test_indexing.py | 3 - 3 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 pandas/tests/copy_view/test_constructors.py diff --git a/pandas/core/series.py b/pandas/core/series.py index 9f05eba00b05c..eb070bef6e2ed 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -419,10 +419,11 @@ def __init__( elif isinstance(data, Series): if index is None: index = data.index + data = data._mgr.copy(deep=False) else: data = data.reindex(index, copy=copy) copy = False - data = data._mgr + data = data._mgr elif is_dict_like(data): data, index = self._init_dict(data, index, dtype) dtype = None diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py new file mode 100644 index 0000000000000..c04c733e5ee1d --- /dev/null +++ b/pandas/tests/copy_view/test_constructors.py @@ -0,0 +1,75 @@ +import numpy as np + +from pandas import Series + +# ----------------------------------------------------------------------------- +# Copy/view behaviour for Series / DataFrame constructors + + +def test_series_from_series(using_copy_on_write): + # Case: constructing a Series from another Series object follows CoW rules: + # a new object is returned and thus mutations are not propagated + ser = Series([1, 2, 3], name="name") + + # default is copy=False -> new Series is a shallow copy / view of original + result = Series(ser) + + # the shallow copy still shares memory + assert np.shares_memory(ser.values, result.values) + + if using_copy_on_write: + assert result._mgr.refs is not None + + if using_copy_on_write: + # mutating new series copy doesn't mutate original + result.iloc[0] = 0 + assert ser.iloc[0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.shares_memory(ser.values, result.values) + else: + # mutating shallow copy does mutate original + result.iloc[0] = 0 + assert ser.iloc[0] == 0 + # and still shares memory + assert np.shares_memory(ser.values, result.values) + + # the same when modifying the parent + result = Series(ser) + + if using_copy_on_write: + # mutating original doesn't mutate new series + ser.iloc[0] = 0 + assert result.iloc[0] == 1 + else: + # mutating original does mutate shallow copy + ser.iloc[0] = 0 + assert result.iloc[0] == 0 + + +def test_series_from_series_with_reindex(using_copy_on_write): + # Case: constructing a Series from another Series with specifying an index + # that potentially requires a reindex of the values + ser = Series([1, 2, 3], name="name") + + # passing an index that doesn't actually require a reindex of the values + # -> without CoW we get an actual mutating view + for index in [ + ser.index, + ser.index.copy(), + list(ser.index), + ser.index.rename("idx"), + ]: + result = Series(ser, index=index) + assert np.shares_memory(ser.values, result.values) + result.iloc[0] = 0 + if using_copy_on_write: + assert ser.iloc[0] == 1 + else: + assert ser.iloc[0] == 0 + + # ensure that if an actual reindex is needed, we don't have any refs + # (mutating the result wouldn't trigger CoW) + result = Series(ser, index=[0, 1, 2, 3]) + assert not np.shares_memory(ser.values, result.values) + if using_copy_on_write: + assert result._mgr.refs is None or result._mgr.refs[0] is None diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index b8028fd28f8f8..927594eb681db 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -795,6 +795,3 @@ def test_dataframe_add_column_from_series(): df.loc[2, "new"] = 100 expected_s = Series([0, 11, 12]) tm.assert_series_equal(s, expected_s) - - -# TODO add tests for constructors From 1ebbfe8065afe6b49fac4ab149619df12944dd7e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Jan 2023 15:45:31 +0100 Subject: [PATCH 2/4] limit to CoW --- pandas/core/series.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6216cd43e76a5..952980e7fbe0d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -420,7 +420,13 @@ def __init__( elif isinstance(data, Series): if index is None: index = data.index - data = data._mgr.copy(deep=False) + if ( + get_option("mode.copy_on_write") + and get_option("mode.data_manager") == "block" + ): + data = data._mgr.copy(deep=False) + else: + data = data._mgr else: data = data.reindex(index, copy=copy) copy = False From 006ced479f4025012b83ae599cdc036c077cfb87 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Jan 2023 15:48:44 +0100 Subject: [PATCH 3/4] add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 22f6659367683..0aa7debfca89f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -77,6 +77,13 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` ( df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow") df_pyarrow.dtypes +Copy on write improvements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* The :class:`Series` constructor will now create a lazy copy (deferring the copy until + a modification to the data happens) when constructing a Series from an existing + Series with the default of ``copy=False`` (:issue:`50471`) + .. _whatsnew_200.enhancements.other: Other enhancements From 4bd8ea10bae3370ac9576aec445867a184ea91ab Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 13 Jan 2023 16:49:10 +0100 Subject: [PATCH 4/4] use helper --- pandas/core/series.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6f2ce999735a1..925d22979576d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -426,10 +426,7 @@ def __init__( elif isinstance(data, Series): if index is None: index = data.index - if ( - get_option("mode.copy_on_write") - and get_option("mode.data_manager") == "block" - ): + if using_copy_on_write(): data = data._mgr.copy(deep=False) else: data = data._mgr