From 293a0ef8f1ef66097939809c86b45091758a1cda Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 10 May 2023 17:49:57 -0400 Subject: [PATCH 01/10] BUG/REF: ArrowExtensionArray non-nanosecond units --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/arrays/arrow/array.py | 204 +++++++++++++++++---------- pandas/tests/extension/test_arrow.py | 72 +++++++++- 3 files changed, 200 insertions(+), 77 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 010773b2806a2..779ca50e1ac41 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -434,6 +434,7 @@ ExtensionArray - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) +- Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`#####`) - Styler diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d842e49589c4d..6cb839fe5ada4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -243,36 +243,9 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal """ Construct a new ExtensionArray from a sequence of scalars. """ - pa_dtype = to_pyarrow_type(dtype) - if ( - isinstance(scalars, np.ndarray) - and isinstance(dtype, ArrowDtype) - and ( - pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype) - ) - ): - # See https://github.com/apache/arrow/issues/35289 - scalars = scalars.tolist() - - if isinstance(scalars, cls): - scalars = scalars._pa_array - elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): - if copy and is_array_like(scalars): - # pa array should not get updated when numpy array is updated - scalars = scalars.copy() - try: - scalars = pa.array(scalars, type=pa_dtype, from_pandas=True) - except pa.ArrowInvalid: - # GH50430: let pyarrow infer type, then cast - scalars = pa.array(scalars, from_pandas=True) - if pa_dtype and scalars.type != pa_dtype: - scalars = scalars.cast(pa_dtype) - arr = cls(scalars) - if pa.types.is_duration(scalars.type) and scalars.null_count > 0: - # GH52843: upstream bug for duration types when originally - # constructed with data containing numpy NaT. - # https://github.com/apache/arrow/issues/35088 - arr = arr.fillna(arr.dtype.na_value) + pa_type = to_pyarrow_type(dtype) + pa_array = cls._box_pa_array(scalars, pa_type=pa_type, copy=copy) + arr = cls(pa_array) return arr @classmethod @@ -466,65 +439,50 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, ArrowExtensionArray): - result = pc_func(self._pa_array, other._pa_array) - elif isinstance(other, (np.ndarray, list)): - result = pc_func(self._pa_array, other) - elif isinstance(other, BaseMaskedArray): - # GH 52625 - result = pc_func(self._pa_array, other.__arrow_array__()) - elif is_scalar(other): - try: - result = pc_func(self._pa_array, pa.scalar(other)) - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): + if is_scalar(other): mask = isna(self) | isna(other) valid = ~mask result = np.zeros(len(self), dtype="bool") result[valid] = op(np.array(self)[valid], other) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) - else: - raise NotImplementedError( - f"{op.__name__} not implemented for {type(other)}" - ) - return ArrowExtensionArray(result) + else: + raise NotImplementedError( + f"{op.__name__} not implemented for {type(other)}" + ) + return type(self)(result) def _evaluate_op_method(self, other, op, arrow_funcs): pa_type = self._pa_array.type + other = self._box_pa(other) + if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [ operator.add, roperator.radd, ]: sep = pa.scalar("", type=pa_type) - if isinstance(other, type(self)): - other = other._pa_array if op is operator.add: result = pc.binary_join_element_wise(self._pa_array, other, sep) else: result = pc.binary_join_element_wise(other, self._pa_array, sep) return type(self)(result) + if ( + isinstance(other, pa.Scalar) + and pc.is_null(other).as_py() + and op.__name__ in ARROW_LOGICAL_FUNCS + ): + # pyarrow kleene ops require null to be typed + other = other.cast(pa_type) + pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: raise NotImplementedError(f"{op.__name__} not implemented.") - if isinstance(other, ArrowExtensionArray): - result = pc_func(self._pa_array, other._pa_array) - elif isinstance(other, (np.ndarray, list)): - result = pc_func(self._pa_array, pa.array(other, from_pandas=True)) - elif isinstance(other, BaseMaskedArray): - # GH 52625 - result = pc_func(self._pa_array, other.__arrow_array__()) - elif is_scalar(other): - if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS: - # pyarrow kleene ops require null to be typed - pa_scalar = pa.scalar(None, type=self._pa_array.type) - else: - pa_scalar = pa.scalar(other) - result = pc_func(self._pa_array, pa_scalar) - else: - raise NotImplementedError( - f"{op.__name__} not implemented for {type(other)}" - ) + + result = pc_func(self._pa_array, other) return type(self)(result) def _logical_method(self, other, op): @@ -1601,18 +1559,114 @@ def _mode(self, dropna: bool = True) -> Self: return type(self)(most_common) - def _maybe_convert_setitem_value(self, value): - """Maybe convert value to be pyarrow compatible.""" - if value is None: - return value - if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): - return value + @classmethod + def _box_pa(cls, value, pa_type=None): if is_list_like(value): - pa_box = pa.array + return cls._box_pa_array(value, pa_type) + return cls._box_pa_scalar(value, pa_type) + + @classmethod + def _box_pa_scalar(cls, value, pa_type=None): + if isinstance(value, pa.Scalar): + pa_scalar = value + elif isna(value): + pa_scalar = pa.scalar(None, type=pa_type) else: - pa_box = pa.scalar + # GH#####: pyarrow does not yet handle pandas non-nano correctly + # see https://github.com/apache/arrow/issues/33321 + from pandas import ( + Timedelta, + Timestamp, + ) + + if isinstance(value, Timedelta): + if pa_type is None: + pa_type = pa.duration(value.unit) + elif value.unit != pa_type.unit: + value = value.as_unit(pa_type.unit) + value = value._value + elif isinstance(value, Timestamp): + if pa_type is None: + pa_type = pa.timestamp(value.unit, tz=value.tz) + elif value.unit != pa_type.unit: + value = value.as_unit(pa_type.unit) + value = value._value + + pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) + + if pa_type is not None and pa_scalar.type != pa_type: + pa_scalar = pa_scalar.cast(pa_type) + + return pa_scalar + + @classmethod + def _box_pa_array(cls, value, pa_type=None, copy: bool = False): + if isinstance(value, cls): + pa_array = value._pa_array + elif isinstance(value, (pa.Array, pa.ChunkedArray)): + pa_array = value + elif isinstance(value, BaseMaskedArray): + # GH 52625 + pa_array = value.__arrow_array__() + else: + if ( + isinstance(value, np.ndarray) + and pa_type is not None + and ( + pa.types.is_large_binary(pa_type) + or pa.types.is_large_string(pa_type) + ) + ): + # See https://github.com/apache/arrow/issues/35289 + value = value.tolist() + elif copy and is_array_like(value): + # pa array should not get updated when numpy array is updated + value = value.copy() + + if ( + pa_type is not None + and pa.types.is_duration(pa_type) + and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi") + ): + # GH#####: pyarrow does not yet handle pandas non-nano correctly + # see https://github.com/apache/arrow/issues/33321 + from pandas import to_timedelta + + value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) + value = value.to_numpy() + + try: + pa_array = pa.array(value, type=pa_type, from_pandas=True) + except pa.ArrowInvalid: + # GH50430: let pyarrow infer type, then cast + pa_array = pa.array(value, from_pandas=True) + + if pa_type is None and pa.types.is_duration(pa_array.type): + # GH#####: pyarrow does not yet handle pandas non-nano correctly + # see https://github.com/apache/arrow/issues/33321 + from pandas import to_timedelta + + value = to_timedelta(value) + value = value.to_numpy() + pa_array = pa.array(value, type=pa_type, from_pandas=True) + + if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0: + # GH52843: upstream bug for duration types when originally + # constructed with data containing numpy NaT. + # https://github.com/apache/arrow/issues/35088 + arr = cls(pa_array) + arr = arr.fillna(arr.dtype.na_value) + pa_array = arr._pa_array + + if pa_type is not None and pa_array.type != pa_type: + pa_array = pa_array.cast(pa_type) + + return pa_array + + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" try: - value = pa_box(value, type=self._pa_array.type, from_pandas=True) + value = self._box_pa(value, self._pa_array.type) except pa.ArrowTypeError as err: msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" raise TypeError(msg) from err diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5078a4e8078f8..a5a2400d434f5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1719,8 +1719,9 @@ def test_setitem_null_slice(data): result = orig.copy() result[:] = data[0] - expected = ArrowExtensionArray( - pa.array([data[0]] * len(data), type=data._pa_array.type) + expected = ArrowExtensionArray._from_sequence( + [data[0]] * len(data), + dtype=data._pa_array.type, ) tm.assert_extension_array_equal(result, expected) @@ -2919,3 +2920,70 @@ def test_infer_dtype_pyarrow_dtype(data, request): request.node.add_marker(mark) assert res == lib.infer_dtype(list(data), skipna=True) + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_from_sequence_temporal(pa_type): + # GH##### + val = 3 + unit = pa_type.unit + if pa.types.is_duration(pa_type): + seq = [pd.Timedelta(val, unit=unit).as_unit(unit)] + else: + seq = [pd.Timestamp(val, unit=unit, tz=pa_type.tz).as_unit(unit)] + + result = ArrowExtensionArray._from_sequence(seq, dtype=pa_type) + expected = ArrowExtensionArray(pa.array([val], type=pa_type)) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_setitem_temporal(pa_type): + # GH##### + unit = pa_type.unit + if pa.types.is_duration(pa_type): + val = pd.Timedelta(1, unit=unit).as_unit(unit) + else: + val = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit) + + arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) + + result = arr.copy() + result[:] = val + expected = ArrowExtensionArray(pa.array([1, 1, 1], type=pa_type)) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_arithmetic_temporal(pa_type): + # GH##### + arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) + unit = pa_type.unit + + result = arr - pd.Timedelta(1, unit=unit).as_unit(unit) + expected = ArrowExtensionArray(pa.array([0, 1, 2], type=pa_type)) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_comparison_temporal(pa_type): + # GH##### + unit = pa_type.unit + if pa.types.is_duration(pa_type): + val = pd.Timedelta(1, unit=unit).as_unit(unit) + else: + val = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit) + + arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) + + result = arr > val + expected = ArrowExtensionArray(pa.array([False, True, True], type=pa.bool_())) + tm.assert_extension_array_equal(result, expected) From e4919316e57997a6ec280a9e987a9251cc83a669 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 10 May 2023 17:59:23 -0400 Subject: [PATCH 02/10] mypy --- pandas/core/arrays/arrow/array.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6cb839fe5ada4..39dc4d4b0e662 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1560,13 +1560,15 @@ def _mode(self, dropna: bool = True) -> Self: return type(self)(most_common) @classmethod - def _box_pa(cls, value, pa_type=None): + def _box_pa( + cls, value, pa_type: pa.DataType | None = None + ) -> pa.Array | pa.ChunkedArray | pa.Scalar: if is_list_like(value): return cls._box_pa_array(value, pa_type) return cls._box_pa_scalar(value, pa_type) @classmethod - def _box_pa_scalar(cls, value, pa_type=None): + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: if isinstance(value, pa.Scalar): pa_scalar = value elif isna(value): @@ -1600,7 +1602,9 @@ def _box_pa_scalar(cls, value, pa_type=None): return pa_scalar @classmethod - def _box_pa_array(cls, value, pa_type=None, copy: bool = False): + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: if isinstance(value, cls): pa_array = value._pa_array elif isinstance(value, (pa.Array, pa.ChunkedArray)): From 961b4b9d8cf021cad9fd69d3ee7e79888f5c7fc9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 10 May 2023 18:04:55 -0400 Subject: [PATCH 03/10] gh refs --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/arrays/arrow/array.py | 6 +++--- pandas/tests/extension/test_arrow.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 779ca50e1ac41..e3ede4d0dffa6 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -431,10 +431,10 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ +- Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) -- Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`#####`) - Styler diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 39dc4d4b0e662..96417b8eb1898 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1574,7 +1574,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: elif isna(value): pa_scalar = pa.scalar(None, type=pa_type) else: - # GH#####: pyarrow does not yet handle pandas non-nano correctly + # GH 53171: pyarrow does not yet handle pandas non-nano correctly # see https://github.com/apache/arrow/issues/33321 from pandas import ( Timedelta, @@ -1632,7 +1632,7 @@ def _box_pa_array( and pa.types.is_duration(pa_type) and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi") ): - # GH#####: pyarrow does not yet handle pandas non-nano correctly + # GH 53171: pyarrow does not yet handle pandas non-nano correctly # see https://github.com/apache/arrow/issues/33321 from pandas import to_timedelta @@ -1646,7 +1646,7 @@ def _box_pa_array( pa_array = pa.array(value, from_pandas=True) if pa_type is None and pa.types.is_duration(pa_array.type): - # GH#####: pyarrow does not yet handle pandas non-nano correctly + # GH 53171: pyarrow does not yet handle pandas non-nano correctly # see https://github.com/apache/arrow/issues/33321 from pandas import to_timedelta diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a5a2400d434f5..8772cdfd1a256 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2926,7 +2926,7 @@ def test_infer_dtype_pyarrow_dtype(data, request): "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) def test_from_sequence_temporal(pa_type): - # GH##### + # GH 53171 val = 3 unit = pa_type.unit if pa.types.is_duration(pa_type): @@ -2943,7 +2943,7 @@ def test_from_sequence_temporal(pa_type): "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) def test_setitem_temporal(pa_type): - # GH##### + # GH 53171 unit = pa_type.unit if pa.types.is_duration(pa_type): val = pd.Timedelta(1, unit=unit).as_unit(unit) @@ -2962,7 +2962,7 @@ def test_setitem_temporal(pa_type): "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) def test_arithmetic_temporal(pa_type): - # GH##### + # GH 53171 arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) unit = pa_type.unit @@ -2975,7 +2975,7 @@ def test_arithmetic_temporal(pa_type): "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) def test_comparison_temporal(pa_type): - # GH##### + # GH 53171 unit = pa_type.unit if pa.types.is_duration(pa_type): val = pd.Timedelta(1, unit=unit).as_unit(unit) From 299b20d9300ad15c87d54434a601f17b646052b8 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 10 May 2023 19:55:12 -0400 Subject: [PATCH 04/10] fixes --- pandas/core/arrays/arrow/array.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 96417b8eb1898..8af8dd1ce6479 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -453,7 +453,7 @@ def _cmp_method(self, other, op): raise NotImplementedError( f"{op.__name__} not implemented for {type(other)}" ) - return type(self)(result) + return ArrowExtensionArray(result) def _evaluate_op_method(self, other, op, arrow_funcs): pa_type = self._pa_array.type @@ -1611,6 +1611,8 @@ def _box_pa_array( pa_array = value elif isinstance(value, BaseMaskedArray): # GH 52625 + if copy: + value = value.copy() pa_array = value.__arrow_array__() else: if ( From 47ec5ef3fab2eecdeb7be89d30de86fd7d4b0ab3 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 10 May 2023 22:18:57 -0400 Subject: [PATCH 05/10] xfail min versions --- pandas/tests/extension/test_arrow.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8772cdfd1a256..183445c360c6a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2958,6 +2958,11 @@ def test_setitem_temporal(pa_type): tm.assert_extension_array_equal(result, expected) +@pytest.mark.xfail( + pa_version_under8p0, + reason="Function 'add_checked' has no kernel matching input types", + raises=pa.ArrowNotImplementedError, +) @pytest.mark.parametrize( "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) From 2a031ae1492bdcc88c0a0377bab97a5d999bc865 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 10 May 2023 22:20:30 -0400 Subject: [PATCH 06/10] docstrings --- pandas/core/arrays/arrow/array.py | 256 +++++++++++++++++------------- 1 file changed, 146 insertions(+), 110 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8af8dd1ce6479..6ae9c23bbf920 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -321,6 +321,152 @@ def _from_sequence_of_strings( ) return cls._from_sequence(scalars, dtype=pa_type, copy=copy) + @classmethod + def _box_pa( + cls, value, pa_type: pa.DataType | None = None + ) -> pa.Array | pa.ChunkedArray | pa.Scalar: + """ + Box value into a pyarrow Array, ChunkedArray or Scalar. + + Parameters + ---------- + value : any + pa_type : pa.DataType | None + + Returns + ------- + pa.Array or pa.ChunkedArray or pa.Scalar + """ + if is_list_like(value): + return cls._box_pa_array(value, pa_type) + return cls._box_pa_scalar(value, pa_type) + + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + """ + Box value into a pyarrow Scalar. + + Parameters + ---------- + value : any + pa_type : pa.DataType | None + + Returns + ------- + pa.Scalar + """ + if isinstance(value, pa.Scalar): + pa_scalar = value + elif isna(value): + pa_scalar = pa.scalar(None, type=pa_type) + else: + # GH 53171: pyarrow does not yet handle pandas non-nano correctly + # see https://github.com/apache/arrow/issues/33321 + from pandas import ( + Timedelta, + Timestamp, + ) + + if isinstance(value, Timedelta): + if pa_type is None: + pa_type = pa.duration(value.unit) + elif value.unit != pa_type.unit: + value = value.as_unit(pa_type.unit) + value = value._value + elif isinstance(value, Timestamp): + if pa_type is None: + pa_type = pa.timestamp(value.unit, tz=value.tz) + elif value.unit != pa_type.unit: + value = value.as_unit(pa_type.unit) + value = value._value + + pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) + + if pa_type is not None and pa_scalar.type != pa_type: + pa_scalar = pa_scalar.cast(pa_type) + + return pa_scalar + + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + """ + Box value into a pyarrow Array or ChunkedArray. + + Parameters + ---------- + value : Sequence + pa_type : pa.DataType | None + + Returns + ------- + pa.Array or pa.ChunkedArray + """ + if isinstance(value, cls): + pa_array = value._pa_array + elif isinstance(value, (pa.Array, pa.ChunkedArray)): + pa_array = value + elif isinstance(value, BaseMaskedArray): + # GH 52625 + if copy: + value = value.copy() + pa_array = value.__arrow_array__() + else: + if ( + isinstance(value, np.ndarray) + and pa_type is not None + and ( + pa.types.is_large_binary(pa_type) + or pa.types.is_large_string(pa_type) + ) + ): + # See https://github.com/apache/arrow/issues/35289 + value = value.tolist() + elif copy and is_array_like(value): + # pa array should not get updated when numpy array is updated + value = value.copy() + + if ( + pa_type is not None + and pa.types.is_duration(pa_type) + and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi") + ): + # GH 53171: pyarrow does not yet handle pandas non-nano correctly + # see https://github.com/apache/arrow/issues/33321 + from pandas import to_timedelta + + value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) + value = value.to_numpy() + + try: + pa_array = pa.array(value, type=pa_type, from_pandas=True) + except pa.ArrowInvalid: + # GH50430: let pyarrow infer type, then cast + pa_array = pa.array(value, from_pandas=True) + + if pa_type is None and pa.types.is_duration(pa_array.type): + # GH 53171: pyarrow does not yet handle pandas non-nano correctly + # see https://github.com/apache/arrow/issues/33321 + from pandas import to_timedelta + + value = to_timedelta(value) + value = value.to_numpy() + pa_array = pa.array(value, type=pa_type, from_pandas=True) + + if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0: + # GH52843: upstream bug for duration types when originally + # constructed with data containing numpy NaT. + # https://github.com/apache/arrow/issues/35088 + arr = cls(pa_array) + arr = arr.fillna(arr.dtype.na_value) + pa_array = arr._pa_array + + if pa_type is not None and pa_array.type != pa_type: + pa_array = pa_array.cast(pa_type) + + return pa_array + def __getitem__(self, item: PositionalIndexer): """Select a subset of self. @@ -1559,116 +1705,6 @@ def _mode(self, dropna: bool = True) -> Self: return type(self)(most_common) - @classmethod - def _box_pa( - cls, value, pa_type: pa.DataType | None = None - ) -> pa.Array | pa.ChunkedArray | pa.Scalar: - if is_list_like(value): - return cls._box_pa_array(value, pa_type) - return cls._box_pa_scalar(value, pa_type) - - @classmethod - def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: - if isinstance(value, pa.Scalar): - pa_scalar = value - elif isna(value): - pa_scalar = pa.scalar(None, type=pa_type) - else: - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 - from pandas import ( - Timedelta, - Timestamp, - ) - - if isinstance(value, Timedelta): - if pa_type is None: - pa_type = pa.duration(value.unit) - elif value.unit != pa_type.unit: - value = value.as_unit(pa_type.unit) - value = value._value - elif isinstance(value, Timestamp): - if pa_type is None: - pa_type = pa.timestamp(value.unit, tz=value.tz) - elif value.unit != pa_type.unit: - value = value.as_unit(pa_type.unit) - value = value._value - - pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) - - if pa_type is not None and pa_scalar.type != pa_type: - pa_scalar = pa_scalar.cast(pa_type) - - return pa_scalar - - @classmethod - def _box_pa_array( - cls, value, pa_type: pa.DataType | None = None, copy: bool = False - ) -> pa.Array | pa.ChunkedArray: - if isinstance(value, cls): - pa_array = value._pa_array - elif isinstance(value, (pa.Array, pa.ChunkedArray)): - pa_array = value - elif isinstance(value, BaseMaskedArray): - # GH 52625 - if copy: - value = value.copy() - pa_array = value.__arrow_array__() - else: - if ( - isinstance(value, np.ndarray) - and pa_type is not None - and ( - pa.types.is_large_binary(pa_type) - or pa.types.is_large_string(pa_type) - ) - ): - # See https://github.com/apache/arrow/issues/35289 - value = value.tolist() - elif copy and is_array_like(value): - # pa array should not get updated when numpy array is updated - value = value.copy() - - if ( - pa_type is not None - and pa.types.is_duration(pa_type) - and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi") - ): - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 - from pandas import to_timedelta - - value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) - value = value.to_numpy() - - try: - pa_array = pa.array(value, type=pa_type, from_pandas=True) - except pa.ArrowInvalid: - # GH50430: let pyarrow infer type, then cast - pa_array = pa.array(value, from_pandas=True) - - if pa_type is None and pa.types.is_duration(pa_array.type): - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 - from pandas import to_timedelta - - value = to_timedelta(value) - value = value.to_numpy() - pa_array = pa.array(value, type=pa_type, from_pandas=True) - - if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0: - # GH52843: upstream bug for duration types when originally - # constructed with data containing numpy NaT. - # https://github.com/apache/arrow/issues/35088 - arr = cls(pa_array) - arr = arr.fillna(arr.dtype.na_value) - pa_array = arr._pa_array - - if pa_type is not None and pa_array.type != pa_type: - pa_array = pa_array.cast(pa_type) - - return pa_array - def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" try: From 9c48d6a91e5c14ce335632f4c85cb8c64ab20503 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 11 May 2023 03:19:12 -0400 Subject: [PATCH 07/10] fix test --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 183445c360c6a..4cdf26a25d6a3 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2960,7 +2960,7 @@ def test_setitem_temporal(pa_type): @pytest.mark.xfail( pa_version_under8p0, - reason="Function 'add_checked' has no kernel matching input types", + reason="Function 'subtract_checked' has no kernel matching input types", raises=pa.ArrowNotImplementedError, ) @pytest.mark.parametrize( From b460e6ecd6fb6d47ab93236f1d8e2e8d8b24ee5a Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 11 May 2023 06:15:41 -0400 Subject: [PATCH 08/10] fix test --- pandas/tests/extension/test_arrow.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4cdf26a25d6a3..6718b2d288466 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2958,19 +2958,20 @@ def test_setitem_temporal(pa_type): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail( - pa_version_under8p0, - reason="Function 'subtract_checked' has no kernel matching input types", - raises=pa.ArrowNotImplementedError, -) @pytest.mark.parametrize( "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) -def test_arithmetic_temporal(pa_type): +def test_arithmetic_temporal(pa_type, request): # GH 53171 + if pa_version_under8p0 and pa.types.is_duration(pa_type): + mark = pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason="Function 'subtract_checked' has no kernel matching input types", + ) + request.node.add_marker(mark) + arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) unit = pa_type.unit - result = arr - pd.Timedelta(1, unit=unit).as_unit(unit) expected = ArrowExtensionArray(pa.array([0, 1, 2], type=pa_type)) tm.assert_extension_array_equal(result, expected) From 8e55b2645a5f339fbb6e8b6fd2d3bc621c3064e3 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 12 May 2023 05:44:27 -0400 Subject: [PATCH 09/10] update imports --- pandas/core/arrays/arrow/array.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6ae9c23bbf920..7942d2cbc83d5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -362,7 +362,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: else: # GH 53171: pyarrow does not yet handle pandas non-nano correctly # see https://github.com/apache/arrow/issues/33321 - from pandas import ( + from pandas._libs.tslibs import ( Timedelta, Timestamp, ) @@ -434,7 +434,7 @@ def _box_pa_array( ): # GH 53171: pyarrow does not yet handle pandas non-nano correctly # see https://github.com/apache/arrow/issues/33321 - from pandas import to_timedelta + from pandas.core.tools.timedeltas import to_timedelta value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) value = value.to_numpy() @@ -448,7 +448,7 @@ def _box_pa_array( if pa_type is None and pa.types.is_duration(pa_array.type): # GH 53171: pyarrow does not yet handle pandas non-nano correctly # see https://github.com/apache/arrow/issues/33321 - from pandas import to_timedelta + from pandas.core.tools.timedeltas import to_timedelta value = to_timedelta(value) value = value.to_numpy() From 04bce55373ae2c6251944fa2f33b9b7e4874325b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 12 May 2023 22:25:45 -0400 Subject: [PATCH 10/10] move imports --- pandas/core/arrays/arrow/array.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7942d2cbc83d5..d201a9ba3a1d9 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,6 +18,10 @@ import numpy as np from pandas._libs import lib +from pandas._libs.tslibs import ( + Timedelta, + Timestamp, +) from pandas.compat import ( pa_version_under7p0, pa_version_under8p0, @@ -362,11 +366,6 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: else: # GH 53171: pyarrow does not yet handle pandas non-nano correctly # see https://github.com/apache/arrow/issues/33321 - from pandas._libs.tslibs import ( - Timedelta, - Timestamp, - ) - if isinstance(value, Timedelta): if pa_type is None: pa_type = pa.duration(value.unit)