|
2 | 2 |
|
3 | 3 | from __future__ import annotations |
4 | 4 |
|
| 5 | +import sys |
| 6 | +from typing import Any |
| 7 | + |
5 | 8 | from pandas.util.version import Version |
6 | 9 |
|
7 | 10 | PYARROW_MIN_VERSION = "13.0.0" |
|
32 | 35 | pa_version_under21p0 = True |
33 | 36 | pa_version_under22p0 = True |
34 | 37 | HAS_PYARROW = False |
| 38 | + |
| 39 | + |
def _safe_fill_null(
    arr: pa.Array | pa.ChunkedArray, fill_value: Any
) -> pa.Array | pa.ChunkedArray:
    """
    Safe wrapper for pyarrow.compute.fill_null with fallback for Windows + pyarrow 21.

    pyarrow 21.0.0 on Windows has a bug in fill_null that incorrectly fills null
    values. For that specific combination (and a non-array ``fill_value``) the
    nulls are replaced with ``pyarrow.compute.if_else`` driven by an ``is_null``
    mask; in every other case the call is delegated to
    ``pyarrow.compute.fill_null`` unchanged.

    Parameters
    ----------
    arr : pyarrow.Array | pyarrow.ChunkedArray
        Input array with potential null values.
    fill_value : Any
        Value to fill nulls with. On the fallback path it is converted with
        ``pyarrow.scalar(fill_value, type=arr.type)``, so it must be
        convertible to the type of ``arr``.

    Returns
    -------
    pyarrow.Array | pyarrow.ChunkedArray
        Array with nulls filled with fill_value. A ChunkedArray input yields a
        ChunkedArray with one output chunk per input chunk.
    """
    import pyarrow.compute as pc

    is_windows = sys.platform in ("win32", "cygwin")
    use_fallback = (
        HAS_PYARROW and is_windows and not pa_version_under21p0 and pa_version_under22p0
    )
    # Array-like fill values are always handed to pyarrow directly; the
    # fallback below only knows how to fill with a single scalar.
    if not use_fallback or isinstance(fill_value, (pa.Array, pa.ChunkedArray)):
        return pc.fill_null(arr, fill_value)

    fill_scalar = pa.scalar(fill_value, type=arr.type)
    is_duration = pa.types.is_duration(arr.type)

    def _fill_chunk(chunk: pa.Array) -> pa.Array:
        # Replace the null slots of a single contiguous array with fill_scalar.
        mask = pc.is_null(chunk)
        if is_duration:
            # Duration data first has its null slots overwritten with a zero
            # duration, then the same mask is used for the real fill.
            # NOTE(review): the intermediate zeroing pass is presumably part of
            # the workaround for the pyarrow bug -- confirm before simplifying.
            zero_duration = pa.scalar(0, type=chunk.type)
            chunk = pc.if_else(mask, zero_duration, chunk)
        return pc.if_else(mask, fill_scalar, chunk)

    if isinstance(arr, pa.ChunkedArray):
        # The type is passed explicitly because pa.chunked_array cannot infer
        # it from an empty list, which is what a zero-chunk input produces.
        return pa.chunked_array(
            [_fill_chunk(chunk) for chunk in arr.chunks], type=arr.type
        )
    return _fill_chunk(arr)
0 commit comments