From 58f066256186e6ca4845c2c498b46f2ce5d56f40 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 26 Mar 2023 11:21:37 -0400 Subject: [PATCH 1/4] BUG: __from_arrow__ doesn't accept pyarrow null arrays for numeric masked types --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/arrow/_arrow_utils.py | 5 +++++ pandas/core/arrays/boolean.py | 9 ++++++++- pandas/core/arrays/numeric.py | 4 +++- pandas/tests/arrays/masked/test_arrow_compat.py | 7 +++++++ 5 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ec5d08e75f0e4..aa53dc9d34efe 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1208,6 +1208,7 @@ Numeric - Bug in :meth:`~arrays.ArrowExtensionArray.mode` where ``dropna=False`` was not respected when there was ``NA`` values (:issue:`50982`) - Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` and column names are ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`) - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing ``pd.NaT`` and ``axis=1`` would return incorrect results (:issue:`51242`) +- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. 
:class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index 6e6ef6a2c20a8..2a053fac2985c 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -42,6 +42,11 @@ def pyarrow_array_to_numpy_and_mask( """ dtype = np.dtype(dtype) + if pyarrow.types.is_null(arr.type): + # No initialization of data is needed since everything is null + data = np.empty(len(arr), dtype=dtype) + mask = np.zeros(len(arr), dtype=bool) + return data, mask buflist = arr.buffers() # Since Arrow buffers might contain padding and the data might be offset, # the buffer gets sliced here before handing it to numpy. diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 54bd4220bc060..d42faa6d68031 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -108,7 +108,7 @@ def __from_arrow__( """ import pyarrow - if array.type != pyarrow.bool_(): + if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type): raise TypeError(f"Expected array of boolean type, got {array.type} instead") if isinstance(array, pyarrow.Array): @@ -119,6 +119,13 @@ def __from_arrow__( results = [] for arr in chunks: + if pyarrow.types.is_null(arr.type): + mask = np.ones(len(arr), dtype=bool) + # No need to init data, since all null + data = np.empty(len(arr), dtype=bool) + bool_arr = BooleanArray(data, mask) + results.append(bool_arr) + continue buflist = arr.buffers() data = pyarrow.BooleanArray.from_buffers( arr.type, len(arr), [None, buflist[1]], offset=arr.offset diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index b4b665d5264a7..cf9606ba6b656 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -76,7 +76,9 @@ def __from_arrow__( array_class = self.construct_array_type() 
pyarrow_type = pyarrow.from_numpy_dtype(self.type) - if not array.type.equals(pyarrow_type): + if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null( + array.type + ): # test_from_arrow_type_error raise for string, but allow # through itemsize conversion GH#31896 rt_dtype = pandas_dtype(array.type.to_pandas_dtype()) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 6b0081321ef22..035a3eed2e069 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -184,6 +184,13 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): tm.assert_numpy_array_equal(mask, mask_expected_empty) +def test_from_arrow_null(data): + arr = pa.nulls(10) + res = data.dtype.__from_arrow__(arr) + assert res.isna().all() + assert len(res) == 10 + + def test_from_arrow_type_error(data): # ensure that __from_arrow__ returns a TypeError when getting a wrong # array type From 61a93c80fe7e1e958c81e4c0a85f3807270b6387 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 2 Apr 2023 17:18:19 -0400 Subject: [PATCH 2/4] simplify --- pandas/core/arrays/boolean.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index d42faa6d68031..8bd2e81da082c 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -113,19 +113,20 @@ def __from_arrow__( if isinstance(array, pyarrow.Array): chunks = [array] + length = len(array) else: # pyarrow.ChunkedArray chunks = array.chunks + length = array.length + + if pyarrow.types.is_null(array.type): + mask = np.ones(length, dtype=bool) + # No need to init data, since all null + data = np.empty(length, dtype=bool) + return BooleanArray(data, mask) results = [] for arr in chunks: - if pyarrow.types.is_null(arr.type): - mask = np.ones(len(arr), dtype=bool) - # No need 
to init data, since all null - data = np.empty(len(arr), dtype=bool) - bool_arr = BooleanArray(data, mask) - results.append(bool_arr) - continue buflist = arr.buffers() data = pyarrow.BooleanArray.from_buffers( arr.type, len(arr), [None, buflist[1]], offset=arr.offset From 2b87f304231b0755d60c6b7815da51cd48a4c89f Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 3 Apr 2023 07:09:39 -0400 Subject: [PATCH 3/4] move whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 - doc/source/whatsnew/v2.1.0.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index aa53dc9d34efe..ec5d08e75f0e4 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1208,7 +1208,6 @@ Numeric - Bug in :meth:`~arrays.ArrowExtensionArray.mode` where ``dropna=False`` was not respected when there was ``NA`` values (:issue:`50982`) - Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` and column names are ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`) - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing ``pd.NaT`` and ``axis=1`` would return incorrect results (:issue:`51242`) -- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) Conversion ^^^^^^^^^^ diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 243ba3b8df119..9384a1246a22c 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -169,7 +169,7 @@ Timezones Numeric ^^^^^^^ - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) -- +- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. 
:class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) Conversion ^^^^^^^^^^ From f8a9ddaebb8948c769bc633024671f030b129841 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 5 Apr 2023 20:25:04 -0400 Subject: [PATCH 4/4] update whatsnew and add more tests --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/arrays/boolean.py | 2 +- pandas/tests/arrays/masked/test_arrow_compat.py | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 48a353e2deaf4..89a67cd196bdf 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -186,7 +186,6 @@ Timezones Numeric ^^^^^^^ - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) -- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) - Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`) - @@ -267,6 +266,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ +- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. 
:class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) - Styler diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 8bd2e81da082c..f6bc8a87a4c60 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -117,7 +117,7 @@ def __from_arrow__( else: # pyarrow.ChunkedArray chunks = array.chunks - length = array.length + length = array.length() if pyarrow.types.is_null(array.type): mask = np.ones(length, dtype=bool) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 035a3eed2e069..fc2094bd9f4a8 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -184,8 +184,10 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): tm.assert_numpy_array_equal(mask, mask_expected_empty) -def test_from_arrow_null(data): - arr = pa.nulls(10) +@pytest.mark.parametrize( + "arr", [pa.nulls(10), pa.chunked_array([pa.nulls(4), pa.nulls(6)])] +) +def test_from_arrow_null(data, arr): res = data.dtype.__from_arrow__(arr) assert res.isna().all() assert len(res) == 10