From b501de77c74b99d7acbadd38277efc3a88384d8f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 21 Apr 2023 18:04:33 +0200 Subject: [PATCH 1/6] Use actual isinstance check for pyarrow.Array instead of hasattr(.. 'type') duck typing --- pandas/compat/pyarrow.py | 14 ++++++++++++++ pandas/core/arrays/string_.py | 7 +++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 020ec346490ff..d1ebceb86c7d4 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -4,6 +4,8 @@ from pandas.util.version import Version +PYARROW_INSTALLED = None + try: import pyarrow as pa @@ -14,9 +16,21 @@ pa_version_under9p0 = _palv < Version("9.0.0") pa_version_under10p0 = _palv < Version("10.0.0") pa_version_under11p0 = _palv < Version("11.0.0") + + PYARROW_INSTALLED = True except ImportError: + pa = None + pa_version_under7p0 = True pa_version_under8p0 = True pa_version_under9p0 = True pa_version_under10p0 = True pa_version_under11p0 = True + + PYARROW_INSTALLED = False + + +def is_pyarrow_array(obj) -> bool: + if PYARROW_INSTALLED: + return isinstance(obj, (pa.Array, pa.ChunkedArray)) + return False diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e1ad812ac10bf..612752064a0c9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -14,7 +14,10 @@ missing as libmissing, ) from pandas._libs.arrays import NDArrayBacked -from pandas.compat import pa_version_under7p0 +from pandas.compat import ( + is_pyarrow_array, + pa_version_under7p0, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -356,7 +359,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal result[na_values] = libmissing.NA else: - if hasattr(scalars, "type"): + if is_pyarrow_array(scalars): # pyarrow array; we cannot rely on the "to_numpy" check in # ensure_string_array because calling scalars.to_numpy would set # zero_copy_only to True which caused problems see GH#52076 From 6ec3d3445ab92221de8d8dbabfa23ae6e6069c74 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 21 Apr 2023 20:01:46 +0200 Subject: [PATCH 2/6] fix import --- pandas/compat/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 745b20dc4e764..cf25b3babaf76 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -24,6 +24,7 @@ import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( + is_pyarrow_array, pa_version_under7p0, pa_version_under8p0, pa_version_under9p0, @@ -156,6 +157,7 @@ def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: __all__ = [ "is_numpy_dev", + "is_pyarrow_array", "pa_version_under7p0", "pa_version_under8p0", "pa_version_under9p0", From 3c888334b0472d22a01afecda04963a84376178a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 28 Apr 2023 15:06:21 +0200 Subject: [PATCH 3/6] move to cython --- pandas/_libs/lib.pyi | 1 + pandas/_libs/lib.pyx | 16 ++++++++++++++++ pandas/compat/__init__.py | 2 -- pandas/compat/pyarrow.py | 14 -------------- pandas/core/arrays/string_.py | 7 ++----- 5 files changed, 19 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 05d569f0e58eb..179268a9303c8 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -42,6 +42,7 @@ def infer_dtype(value: object, skipna: bool = ...) -> str: ... def is_iterator(obj: object) -> bool: ... def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... +def is_pyarrow_array(obj: object) -> bool: ... def is_period(val: object) -> TypeGuard[Period]: ... def is_interval(val: object) -> TypeGuard[Interval]: ... def is_decimal(val: object) -> TypeGuard[Decimal]: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 92f1dc2d4ea3b..18106b3ac50e5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -145,6 +145,16 @@ i8max = INT64_MAX u8max = UINT64_MAX +cdef bint PYARROW_INSTALLED = False + +try: + import pyarrow as pa + + PYARROW_INSTALLED = True +except ImportError: + pa = None + + @cython.wraparound(False) @cython.boundscheck(False) def memory_usage_of_objects(arr: object[:]) -> int64_t: @@ -1173,6 +1183,12 @@ cdef bint c_is_list_like(object obj, bint allow_sets) except -1: ) +def is_pyarrow_array(obj): + if PYARROW_INSTALLED: + return isinstance(obj, (pa.Array, pa.ChunkedArray)) + return False + + _TYPE_MAP = { "categorical": "categorical", "category": "categorical", diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index c7244356510e2..3d7589bf67ee2 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -23,7 +23,6 @@ import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( - is_pyarrow_array, pa_version_under7p0, pa_version_under8p0, pa_version_under9p0, @@ -156,7 +155,6 @@ def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: __all__ = [ "is_numpy_dev", - "is_pyarrow_array", "pa_version_under7p0", "pa_version_under8p0", "pa_version_under9p0", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index d1ebceb86c7d4..020ec346490ff 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -4,8 +4,6 @@ from pandas.util.version import Version -PYARROW_INSTALLED = None - try: import pyarrow as pa @@ -16,21 +14,9 @@ pa_version_under9p0 = _palv < Version("9.0.0") pa_version_under10p0 = _palv < Version("10.0.0") pa_version_under11p0 = _palv < Version("11.0.0") - - PYARROW_INSTALLED = True except ImportError: - pa = None - pa_version_under7p0 = True pa_version_under8p0 = True pa_version_under9p0 = True pa_version_under10p0 = True pa_version_under11p0 = True - - PYARROW_INSTALLED = False - - -def is_pyarrow_array(obj) -> bool: - if PYARROW_INSTALLED: - return isinstance(obj, (pa.Array, pa.ChunkedArray)) - return False diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 612752064a0c9..d400b0c55a279 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -14,10 +14,7 @@ missing as libmissing, ) from pandas._libs.arrays import NDArrayBacked -from pandas.compat import ( - is_pyarrow_array, - pa_version_under7p0, -) +from pandas.compat import pa_version_under7p0 from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -359,7 +356,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal result[na_values] = libmissing.NA else: - if is_pyarrow_array(scalars): + if lib.is_pyarrow_array(scalars): # pyarrow array; we cannot rely on the "to_numpy" check in # ensure_string_array because calling scalars.to_numpy would set # zero_copy_only to True which caused problems see GH#52076 From 2448fd69dd1e7c403a9e6ccd806b84403784a1bb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 28 Apr 2023 15:08:44 +0200 Subject: [PATCH 4/6] add docstring --- pandas/_libs/lib.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 18106b3ac50e5..9ff7b78e552b5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1184,6 +1184,13 @@ cdef bint c_is_list_like(object obj, bint allow_sets) except -1: def is_pyarrow_array(obj): + """ + Return True if given object is a pyarrow Array or ChunkedArray. + + Returns + ------- + bool + """ if PYARROW_INSTALLED: return isinstance(obj, (pa.Array, pa.ChunkedArray)) return False From 564c912088509842871dfa45acd8e4d50c0b8ee3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 28 Apr 2023 15:21:26 +0200 Subject: [PATCH 5/6] try TypeGuard --- pandas/_libs/lib.pyi | 3 ++- pandas/_typing.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 179268a9303c8..6994fd5333ab7 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -17,6 +17,7 @@ from pandas._libs.interval import Interval from pandas._libs.tslibs import Period from pandas._typing import ( ArrayLike, + ArrowArrayTypeGuard, DtypeObj, TypeGuard, npt, @@ -42,7 +43,7 @@ def infer_dtype(value: object, skipna: bool = ...) -> str: ... def is_iterator(obj: object) -> bool: ... def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... -def is_pyarrow_array(obj: object) -> bool: ... +def is_pyarrow_array(obj: object) -> ArrowArrayTypeGuard: ... def is_period(val: object) -> TypeGuard[Period]: ... def is_interval(val: object) -> TypeGuard[Interval]: ... def is_decimal(val: object) -> TypeGuard[Decimal]: ... diff --git a/pandas/_typing.py b/pandas/_typing.py index dc3f2f54a54ca..c0cce04e768cb 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -93,10 +93,21 @@ from typing import Self else: from typing_extensions import Self # pyright: reportUnusedImport = false + + try: + from pyarrow import ( + Array, + ChunkedArray, + ) + + ArrowArrayTypeGuard = TypeGuard[Union[Array, ChunkedArray]] + except ImportError: + ArrowArrayTypeGuard = bool else: npt: Any = None Self: Any = None TypeGuard: Any = None + ArrowArrayTypeGuard = bool HashableT = TypeVar("HashableT", bound=Hashable) From c8aa9526a2ebf52070714c0775327ec310a02f90 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 May 2023 10:14:42 +0200 Subject: [PATCH 6/6] remove ArrowArrayTypeGuard --- pandas/_libs/lib.pyi | 3 +-- pandas/_typing.py | 11 ----------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 6a494c9e5bc51..e9d4e45c07925 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -17,7 +17,6 @@ from pandas._libs.interval import Interval from pandas._libs.tslibs import Period from pandas._typing import ( ArrayLike, - ArrowArrayTypeGuard, DtypeObj, TypeGuard, npt, @@ -43,7 +42,7 @@ def infer_dtype(value: object, skipna: bool = ...) -> str: ... def is_iterator(obj: object) -> bool: ... def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... -def is_pyarrow_array(obj: object) -> ArrowArrayTypeGuard: ... +def is_pyarrow_array(obj: object) -> bool: ... def is_period(val: object) -> TypeGuard[Period]: ... def is_interval(val: object) -> TypeGuard[Interval]: ... def is_decimal(val: object) -> TypeGuard[Decimal]: ... diff --git a/pandas/_typing.py b/pandas/_typing.py index eb5a87d45e348..9d4acbe76ba15 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -93,21 +93,10 @@ from typing import Self else: from typing_extensions import Self # pyright: reportUnusedImport = false - - try: - from pyarrow import ( - Array, - ChunkedArray, - ) - - ArrowArrayTypeGuard = TypeGuard[Union[Array, ChunkedArray]] - except ImportError: - ArrowArrayTypeGuard = bool else: npt: Any = None Self: Any = None TypeGuard: Any = None - ArrowArrayTypeGuard = bool HashableT = TypeVar("HashableT", bound=Hashable)