From 04da6eb76c89a93023d24d73109c739f226c10e8 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 24 Dec 2022 10:52:27 -0500 Subject: [PATCH 1/3] BUG: Series(strings).astype("float64[pyarrow]") raising --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 22 ++++++++++++---------- pandas/tests/extension/test_arrow.py | 8 ++++++++ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 75ba169600962..c3201115dfbe1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -830,6 +830,7 @@ Conversion - Bug in :meth:`Series.convert_dtypes` not converting dtype to nullable dtype when :class:`Series` contains ``NA`` and has dtype ``object`` (:issue:`48791`) - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) +- Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`#####`) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6250c298f291f..3cb15e026a52f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -206,17 +206,19 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal Construct a new ExtensionArray from a sequence of scalars. """ pa_dtype = to_pyarrow_type(dtype) - is_cls = isinstance(scalars, cls) - if is_cls or isinstance(scalars, (pa.Array, pa.ChunkedArray)): - if is_cls: - scalars = scalars._data - if pa_dtype: - scalars = scalars.cast(pa_dtype) - return cls(scalars) + if isinstance(scalars, cls): + scalars = scalars._data + elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): + pass else: - return cls( - pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True)) - ) + try: + scalars = pa.array(scalars, type=pa_dtype, from_pandas=True) + except pa.ArrowInvalid: + # GH#####: let pyarrow infer type, then cast + scalars = pa.array(scalars, from_pandas=True) + if pa_dtype: + scalars = scalars.cast(pa_dtype) + return cls(scalars) @classmethod def _from_sequence_of_strings( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9b42b86efd0d0..3df695e036eb7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1471,6 +1471,14 @@ def test_astype_from_non_pyarrow(data): tm.assert_extension_array_equal(result, data) +def test_astype_float_from_non_pyarrow_str(): + # GH##### + ser = pd.Series(["1.0"]) + result = ser.astype("float64[pyarrow]") + expected = pd.Series([1.0], dtype="float64[pyarrow]") + tm.assert_series_equal(result, expected) + + def test_to_numpy_with_defaults(data): # GH49973 result = data.to_numpy() From e708cbd1037584c469eeb6d81f0c067f3abb2d27 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 24 Dec 2022 10:57:56 -0500 Subject: [PATCH 2/3] gh refs --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/arrays/arrow/array.py | 2 +- pandas/tests/extension/test_arrow.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c3201115dfbe1..12b0d90e68ab9 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -830,7 +830,7 @@ Conversion - Bug in :meth:`Series.convert_dtypes` not converting dtype to nullable dtype when :class:`Series` contains ``NA`` and has dtype ``object`` (:issue:`48791`) - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) -- Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`#####`) +- Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3cb15e026a52f..dc17fde6f21fd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -214,7 +214,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal try: scalars = pa.array(scalars, type=pa_dtype, from_pandas=True) except pa.ArrowInvalid: - # GH#####: let pyarrow infer type, then cast + # GH50430: let pyarrow infer type, then cast scalars = pa.array(scalars, from_pandas=True) if pa_dtype: scalars = scalars.cast(pa_dtype) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3df695e036eb7..37abdefa25f6e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1472,7 +1472,7 @@ def test_astype_from_non_pyarrow(data): def test_astype_float_from_non_pyarrow_str(): - # GH##### + # GH50430 ser = pd.Series(["1.0"]) result = ser.astype("float64[pyarrow]") expected = pd.Series([1.0], dtype="float64[pyarrow]") From 166db8beadab2a4f0603d5f9cf4dab846ff8e80b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 24 Dec 2022 11:27:32 -0500 Subject: [PATCH 3/3] simplify --- pandas/core/arrays/arrow/array.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index dc17fde6f21fd..7e954b3d1d1ec 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -208,9 +208,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal pa_dtype = to_pyarrow_type(dtype) if isinstance(scalars, cls): scalars = scalars._data - elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): - pass - else: + elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): try: scalars = pa.array(scalars, type=pa_dtype, from_pandas=True) except pa.ArrowInvalid: