From 3a08a45676196095c3b63d4559b9dc2a26500330 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 15:02:56 +0100 Subject: [PATCH 01/10] CoW: Handle EA dtypes in __array__ --- pandas/core/generic.py | 4 ++-- pandas/core/series.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 17e4a4c142f66..1bdcda15458b6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -103,6 +103,7 @@ validate_inclusive, ) +from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.common import ( ensure_object, ensure_platform_int, @@ -1995,8 +1996,7 @@ def empty(self) -> bool_t: def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: values = self._values arr = np.asarray(values, dtype=dtype) - if arr is values and using_copy_on_write(): - # TODO(CoW) also properly handle extension dtypes + if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype): arr = arr.view() arr.flags.writeable = False return arr diff --git a/pandas/core/series.py b/pandas/core/series.py index e8d6491e43007..c97a1819d88de 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -58,6 +58,7 @@ validate_percentile, ) +from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( LossySetitemError, convert_dtypes, @@ -896,8 +897,7 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: """ values = self._values arr = np.asarray(values, dtype=dtype) - if arr is values and using_copy_on_write(): - # TODO(CoW) also properly handle extension dtypes + if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype): arr = arr.view() arr.flags.writeable = False return arr From 38453c49a0ff2c4b1a44957da831e892b958659e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 16:31:07 +0100 Subject: [PATCH 02/10] CoW: __array__ not recognizing ea dtypes --- pandas/core/generic.py | 10 ++++--- pandas/tests/copy_view/test_array.py | 41 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1bdcda15458b6..51d8e13e13aeb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -103,11 +103,11 @@ validate_inclusive, ) -from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.common import ( ensure_object, ensure_platform_int, ensure_str, + is_1d_only_ea_dtype, is_bool, is_bool_dtype, is_datetime64_any_dtype, @@ -1996,9 +1996,11 @@ def empty(self) -> bool_t: def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: values = self._values arr = np.asarray(values, dtype=dtype) - if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype): - arr = arr.view() - arr.flags.writeable = False + if arr is values and using_copy_on_write() and self._mgr.is_single_block: + # Check if self._values coerced data + if not is_1d_only_ea_dtype(self.dtypes.iloc[0]): + arr = arr.view() + arr.flags.writeable = False return arr @final diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 501ef27bc291e..9e9b499c46ed2 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -110,3 +110,44 @@ def test_series_to_numpy(using_copy_on_write): arr = ser.to_numpy(dtype="float64") assert not np.shares_memory(arr, get_array(ser, "name")) assert arr.flags.writeable is True + + +def test_series_array_ea_dtypes(using_copy_on_write): + ser = Series([1, 2, 3], dtype="Int64") + arr = np.asarray(ser, dtype="int64") + assert np.shares_memory(arr, get_array(ser)) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + arr = np.asarray(ser) + assert not np.shares_memory(arr, get_array(ser)) + if using_copy_on_write: + # TODO(CoW): This should be True + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + +def test_dataframe_array_ea_dtypes(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + arr = np.asarray(df, dtype="int64") + # TODO: This should be able to share memory, but we are roundtripping + # through object + assert not np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is True + + arr = np.asarray(df) + if using_copy_on_write: + # TODO This really should be True + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + +def test_dataframe_multiple_numpy_dtypes(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1.5}) + arr = np.asarray(df) + assert not np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is True From ee751b4cccf65d36d990f83ae7c2e0934aeda0e6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 16:42:02 +0100 Subject: [PATCH 03/10] Fix writeable flag --- pandas/core/frame.py | 5 ++++- pandas/tests/copy_view/test_array.py | 6 +----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 650df0d18e54d..c75125cf4fc55 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1004,7 +1004,10 @@ def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: arr = blocks[0].values if arr.ndim == 1: # non-2D ExtensionArray - return self.values + values = self.values + # Handle read_only flag only in public methods + values.flags.writeable = True + return values # more generally, whatever we allow in NDArrayBackedExtensionBlock arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr) diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 9e9b499c46ed2..7ee1e4baab6f6 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -139,11 +139,7 @@ def test_dataframe_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable is True arr = np.asarray(df) - if using_copy_on_write: - # TODO This really should be True - assert arr.flags.writeable is False - else: - assert arr.flags.writeable is True + assert arr.flags.writeable is True def test_dataframe_multiple_numpy_dtypes(using_copy_on_write): From a4eb6128a9cb6b2f8f4db7b3b50518af9630a642 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 17:29:33 +0100 Subject: [PATCH 04/10] Skip for now --- pandas/core/frame.py | 5 +---- pandas/tests/copy_view/test_array.py | 6 +++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c75125cf4fc55..650df0d18e54d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1004,10 +1004,7 @@ def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: arr = blocks[0].values if arr.ndim == 1: # non-2D ExtensionArray - values = self.values - # Handle read_only flag only in public methods - values.flags.writeable = True - return values + return self.values # more generally, whatever we allow in NDArrayBackedExtensionBlock arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr) diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 7ee1e4baab6f6..ed91d3410de74 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -139,7 +139,11 @@ def test_dataframe_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable is True arr = np.asarray(df) - assert arr.flags.writeable is True + if using_copy_on_write: + # TODO(CoW): This should be True + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True def test_dataframe_multiple_numpy_dtypes(using_copy_on_write): From c515870403b581c4c3c4ddc780d8d765fe129e49 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 22:45:08 +0100 Subject: [PATCH 05/10] Fix for non numeric --- pandas/core/generic.py | 4 +++- pandas/tests/copy_view/test_array.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 51d8e13e13aeb..1721607eb8588 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1998,7 +1998,9 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: arr = np.asarray(values, dtype=dtype) if arr is values and using_copy_on_write() and self._mgr.is_single_block: # Check if self._values coerced data - if not is_1d_only_ea_dtype(self.dtypes.iloc[0]): + if not is_1d_only_ea_dtype(self.dtypes.iloc[0]) or not is_numeric_dtype( + self.dtypes.iloc[0] + ): arr = arr.view() arr.flags.writeable = False return arr diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index ed91d3410de74..92ee8a80d7543 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -146,7 +146,17 @@ def test_dataframe_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable is True -def test_dataframe_multiple_numpy_dtypes(using_copy_on_write): +def test_dataframe_array_string_dtype(using_copy_on_write): + df = DataFrame({"a": ["a", "b"]}, dtype="string") + arr = np.asarray(df) + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + +def test_dataframe_multiple_numpy_dtypes(): df = DataFrame({"a": [1, 2, 3], "b": 1.5}) arr = np.asarray(df) assert not np.shares_memory(arr, get_array(df, "a")) From 5568ec25f6b8d1c875e37244e72fefc0baec0c97 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 00:35:19 +0100 Subject: [PATCH 06/10] Fix array manager --- pandas/tests/copy_view/test_array.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 92ee8a80d7543..ce02f37de3917 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -146,10 +146,11 @@ def test_dataframe_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable is True -def test_dataframe_array_string_dtype(using_copy_on_write): +def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) - assert np.shares_memory(arr, get_array(df, "a")) + if not using_array_manager: + assert np.shares_memory(arr, get_array(df, "a")) if using_copy_on_write: assert arr.flags.writeable is False else: From 8922467d76087baf502cd89af6055b15aa9a78c2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 13:32:51 +0100 Subject: [PATCH 07/10] Add test --- pandas/tests/copy_view/test_array.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index ce02f37de3917..962f1ec15ef7b 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -123,11 +123,7 @@ def test_series_array_ea_dtypes(using_copy_on_write): arr = np.asarray(ser) assert not np.shares_memory(arr, get_array(ser)) - if using_copy_on_write: - # TODO(CoW): This should be True - assert arr.flags.writeable is False - else: - assert arr.flags.writeable is True + assert arr.flags.writeable is True def test_dataframe_array_ea_dtypes(using_copy_on_write): @@ -162,3 +158,9 @@ def test_dataframe_multiple_numpy_dtypes(): arr = np.asarray(df) assert not np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is True + + +def test_empty_dataframe(): + df = DataFrame() + arr = np.asarray(df) + assert arr.flags.writeable is True From 413fc7cdb1a49ed421faf5202637021692165ea1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 13:33:28 +0100 Subject: [PATCH 08/10] Add comment --- pandas/core/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1721607eb8588..b97b4f3a79a03 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1997,7 +1997,8 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: values = self._values arr = np.asarray(values, dtype=dtype) if arr is values and using_copy_on_write() and self._mgr.is_single_block: - # Check if self._values coerced data + # self._values coerces to object for non-numpy dtypes, so we have to + # check for all cases where coercion to object creates a copy if not is_1d_only_ea_dtype(self.dtypes.iloc[0]) or not is_numeric_dtype( self.dtypes.iloc[0] ): From 2d8a2f2c3895d4c2679239d0ba33aca4ae0d4d29 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 22:21:11 +0100 Subject: [PATCH 09/10] Improve solution --- pandas/core/generic.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b97b4f3a79a03..4fa1b530194aa 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -103,11 +103,11 @@ validate_inclusive, ) +from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.common import ( ensure_object, ensure_platform_int, ensure_str, - is_1d_only_ea_dtype, is_bool, is_bool_dtype, is_datetime64_any_dtype, @@ -1997,10 +1997,9 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: values = self._values arr = np.asarray(values, dtype=dtype) if arr is values and using_copy_on_write() and self._mgr.is_single_block: - # self._values coerces to object for non-numpy dtypes, so we have to - # check for all cases where coercion to object creates a copy - if not is_1d_only_ea_dtype(self.dtypes.iloc[0]) or not is_numeric_dtype( - self.dtypes.iloc[0] + # Check if both conversions can be done without a copy + if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view( + values.dtype, arr.dtype ): arr = arr.view() arr.flags.writeable = False From 5c0f1bb8f6718de9153328e229edb364cafdd4b2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 2 Apr 2023 03:00:47 -0400 Subject: [PATCH 10/10] Switch and add test --- pandas/core/generic.py | 6 +++++- pandas/tests/copy_view/test_array.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bd47bdf295791..bd71fbc23b868 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2006,7 +2006,11 @@ def empty(self) -> bool_t: def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: values = self._values arr = np.asarray(values, dtype=dtype) - if arr is values and using_copy_on_write() and self._mgr.is_single_block: + if ( + astype_is_view(values.dtype, arr.dtype) + and using_copy_on_write() + and self._mgr.is_single_block + ): # Check if both conversions can be done without a copy if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view( values.dtype, arr.dtype diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index bb4e28b589248..62a6a3374e612 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -4,6 +4,7 @@ from pandas import ( DataFrame, Series, + date_range, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -169,6 +170,15 @@ def test_dataframe_multiple_numpy_dtypes(): assert arr.flags.writeable is True +def test_values_is_ea(using_copy_on_write): + df = DataFrame({"a": date_range("2012-01-01", periods=3)}) + arr = np.asarray(df) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + def test_empty_dataframe(): df = DataFrame() arr = np.asarray(df)