Skip to content

Commit 5f17047

Browse files
[backport 3.0.x] BUG: use fill_null fallback for bug in pyarrow 21 on Windows (fixes join, fillna, duplicated, etc with pyarrow-backed arrays) (#64081) (#64194)
Co-authored-by: Jeongmin Gil <puduck622@gmail.com>
1 parent 0d3a8cb commit 5f17047

File tree

5 files changed

+68
-6
lines changed

5 files changed

+68
-6
lines changed

doc/source/whatsnew/v3.0.1.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Bug fixes
3333
- Fixed a bug in the :func:`comparison_op` raising a ``TypeError`` for zerodim
3434
subclasses of ``np.ndarray`` (:issue:`63205`)
3535
- Added additional typing aliases in :py:mod:`pandas.api.typing.aliases` (:issue:`64098`)
36+
- Fixed bug in :func:`merge` where NaN values in pyarrow-backed string dtype join keys were incorrectly matched with non-NaN values on Windows with pyarrow 21 (:issue:`64060`)
3637
- Fixed bug in :meth:`DataFrame.loc` when setting new row and new columns simultaneously filling existing columns with ``b''`` instead of ``NaN`` (:issue:`58316`)
3738
- Fixed thread safety issues in :class:`DataFrame` internals on the free-threaded build (:issue:`63685`).
3839
- Prevent buffer overflow in :meth:`Rolling.corr` and :meth:`Rolling.cov` with variable windows when passing ``other`` with a longer index than the original window. This now raises ``ValueError`` (:issue:`62937`)

pandas/compat/pyarrow.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
from __future__ import annotations
44

5+
import sys
6+
from typing import Any
7+
58
from pandas.util.version import Version
69

710
PYARROW_MIN_VERSION = "13.0.0"
@@ -32,3 +35,57 @@
3235
pa_version_under21p0 = True
3336
pa_version_under22p0 = True
3437
HAS_PYARROW = False
38+
39+
40+
def _safe_fill_null(
    arr: pa.Array | pa.ChunkedArray, fill_value: Any
) -> pa.Array | pa.ChunkedArray:
    """
    Fill nulls via pyarrow.compute.fill_null, with a Windows + pyarrow 21 fallback.

    pyarrow 21.0.0 on Windows has a bug in fill_null that incorrectly fills null
    values. When running on Windows ("win32" or "cygwin") with an installed
    pyarrow that is at least 21 and below 22, and ``fill_value`` is not itself
    an Array/ChunkedArray, the nulls are filled with ``pyarrow.compute.if_else``
    instead. In every other case this simply delegates to
    ``pyarrow.compute.fill_null``.

    Parameters
    ----------
    arr : pyarrow.Array | pyarrow.ChunkedArray
        Input array with potential null values.
    fill_value : Any
        Value to fill nulls with. On the fallback path it is converted to a
        scalar of ``arr.type``.

    Returns
    -------
    pyarrow.Array | pyarrow.ChunkedArray
        Array with nulls filled with fill_value. A ChunkedArray input yields a
        ChunkedArray output on both paths.
    """
    import pyarrow.compute as pc

    is_windows = sys.platform in ("win32", "cygwin")
    use_fallback = (
        HAS_PYARROW and is_windows and not pa_version_under21p0 and pa_version_under22p0
    )
    # Array-valued fills are always delegated: the fallback below only knows
    # how to broadcast a single scalar.
    if not use_fallback or isinstance(fill_value, (pa.Array, pa.ChunkedArray)):
        return pc.fill_null(arr, fill_value)

    fill_scalar = pa.scalar(fill_value, type=arr.type)
    is_duration = pa.types.is_duration(arr.type)

    def _fill_chunk(chunk: pa.Array) -> pa.Array:
        # Replace every null slot of one contiguous chunk with fill_scalar.
        mask = pc.is_null(chunk)
        if is_duration:
            # Duration arrays first get their null slots overwritten with an
            # explicit zero before the real fill is applied.
            # NOTE(review): presumably this sidesteps the same pyarrow 21 bug
            # for duration data -- confirm against the upstream issue.
            zero_duration = pa.scalar(0, type=chunk.type)
            chunk = pc.if_else(mask, zero_duration, chunk)
        return pc.if_else(mask, fill_scalar, chunk)

    if isinstance(arr, pa.ChunkedArray):
        # Pass the type explicitly: pyarrow cannot infer it from an empty list,
        # so a ChunkedArray with zero chunks would otherwise raise here.
        return pa.chunked_array(
            [_fill_chunk(chunk) for chunk in arr.chunks], type=arr.type
        )
    return _fill_chunk(arr)

pandas/core/arrays/arrow/array.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@
9595
import pyarrow as pa
9696
import pyarrow.compute as pc
9797

98+
from pandas.compat.pyarrow import _safe_fill_null
99+
98100
from pandas.core.dtypes.dtypes import ArrowDtype
99101

100102
ARROW_CMP_FUNCS = {
@@ -1404,7 +1406,7 @@ def fillna(
14041406

14051407
try:
14061408
return self._from_pyarrow_array(
1407-
pc.fill_null(self._pa_array, fill_value=fill_value)
1409+
_safe_fill_null(self._pa_array, fill_value=fill_value)
14081410
)
14091411
except pa.ArrowNotImplementedError:
14101412
# ArrowNotImplementedError: Function 'coalesce' has no kernel
@@ -1470,7 +1472,7 @@ def factorize(
14701472
combined = encoded.combine_chunks()
14711473
pa_indices = combined.indices
14721474
if pa_indices.null_count > 0:
1473-
pa_indices = pc.fill_null(pa_indices, -1)
1475+
pa_indices = _safe_fill_null(pa_indices, -1)
14741476
indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
14751477
np.intp, copy=False
14761478
)
@@ -1929,7 +1931,7 @@ def _str_accumulate(
19291931
return self._from_pyarrow_array(pa_array)
19301932
if skipna:
19311933
if name == "cumsum":
1932-
pa_array = pc.fill_null(pa_array, "")
1934+
pa_array = _safe_fill_null(pa_array, "")
19331935
else:
19341936
# We can retain the running min/max by forward/backward filling.
19351937
pa_array = pc.fill_null_forward(pa_array)

pandas/core/reshape/merge.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2837,7 +2837,8 @@ def _factorize_keys(
28372837
isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
28382838
):
28392839
import pyarrow as pa
2840-
import pyarrow.compute as pc
2840+
2841+
from pandas.compat.pyarrow import _safe_fill_null
28412842

28422843
len_lk = len(lk)
28432844
lk = lk._pa_array # type: ignore[attr-defined]
@@ -2849,10 +2850,10 @@ def _factorize_keys(
28492850
)
28502851

28512852
llab, rlab, count = (
2852-
pc.fill_null(dc.indices[slice(len_lk)], -1)
2853+
_safe_fill_null(dc.indices[slice(len_lk)], -1)
28532854
.to_numpy()
28542855
.astype(np.intp, copy=False),
2855-
pc.fill_null(dc.indices[slice(len_lk, None)], -1)
2856+
_safe_fill_null(dc.indices[slice(len_lk, None)], -1)
28562857
.to_numpy()
28572858
.astype(np.intp, copy=False),
28582859
len(dc.dictionary),

scripts/validate_unwanted_patterns.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
"_DatetimeTZBlock",
6262
"_check_pyarrow_available",
6363
"_parser", # https://github.com/pandas-dev/pandas/issues/60833
64+
"_safe_fill_null",
6465
}
6566

6667

0 commit comments

Comments
 (0)