From f6fcdf8363e11544d18f6264b6052141bc73ddb4 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Tue, 10 May 2022 18:37:35 -0400 Subject: [PATCH 01/19] Update __init__.py --- pandas/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/__init__.py b/pandas/__init__.py index 3645e8744d8af..7f1d3732ed008 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -57,6 +57,8 @@ UInt64Dtype, Float32Dtype, Float64Dtype, + Float32ArrowDtype, + Float64ArrowDtype, CategoricalDtype, PeriodDtype, IntervalDtype, From 8c39928246cec320ad379858c1de888e3e9f6b6d Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Tue, 10 May 2022 18:38:36 -0400 Subject: [PATCH 02/19] Update api.py --- pandas/core/api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/api.py b/pandas/core/api.py index cf082d2013d3b..1c72f2fb6f214 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -31,6 +31,8 @@ from pandas.core.arrays.floating import ( Float32Dtype, Float64Dtype, + Float32ArrowDtype, + Float64ArrowDtype, ) from pandas.core.arrays.integer import ( Int8Dtype, From c7add0b6fe53d5ce5316d196a294376fab7cb70d Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Wed, 11 May 2022 00:20:41 -0400 Subject: [PATCH 03/19] added floatingdtype and lfloatingarrowarray --- pandas/core/api.py | 2 + pandas/core/arrays/arrow/floating.py | 69 ++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 pandas/core/arrays/arrow/floating.py diff --git a/pandas/core/api.py b/pandas/core/api.py index 1c72f2fb6f214..cbea384b7ad8e 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -31,6 +31,8 @@ from pandas.core.arrays.floating import ( Float32Dtype, Float64Dtype, +) +from pandas.core.arrays.arrow.floating import ( Float32ArrowDtype, Float64ArrowDtype, ) diff --git a/pandas/core/arrays/arrow/floating.py b/pandas/core/arrays/arrow/floating.py new file mode 100644 index 0000000000000..dc3d8c4131724 --- /dev/null +++ b/pandas/core/arrays/arrow/floating.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from typing import ( + Any, + Callable, + TypeVar, +) + +import pyarrow as pa + +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly + +from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.arrays.arrow.dtype import ArrowDtype + +T = TypeVar("T", bound="FloatingArrowArray") + + +class FloatingArrowDtype(ArrowDtype): + _default_pa_dtype: pa.null() + _dtype_checker: Callable[[Any], bool] # pa.types.is_ + + @property + def _is_numeric(self) -> bool: + return False + + @property + def _is_float(self) -> bool: + return True + + @classmethod + def _str_to_dtype_mapping(cls): + raise AbstractMethodError(cls) + + +class FloatingArrowArray(ArrowExtensionArray): + """ + Base class for Floating dtypes. + """ + + _dtype_cls: type[FloatingArrowDtype] + + def __init__(self, values: pa.ChunkedArray) -> None: + checker = self._dtype_cls._dtype_checker + if not (isinstance(values, pa.ChunkedArray) and checker(values.type)): + descr = ( + "floating" + ) + raise TypeError(f"values should be {descr} arrow array.") + super().__init__(values) + + @cache_readonly + def dtype(self) -> FloatingArrowDtype: + mapping = self._dtype_cls._str_to_dtype_mapping() + return mapping[str(self._data.type)] + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): + if dtype is None: + dtype = cls._dtype_cls._default_pa_dtype + return cls(pa.chunked_array([scalars], type=dtype.type)) + + @classmethod + def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False): + from pandas.core.tools.numeric import to_numeric + + scalars = to_numeric(strings, errors="raise") + return cls._from_sequence(scalars, dtype=dtype, copy=copy) \ No newline at end of file From 1b38b5fed4d5cfc728e33f456a73d137f0a3f152 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Wed, 11 May 2022 00:28:12 -0400 Subject: [PATCH 04/19] use numeric array --- pandas/core/arrays/arrow/floating.py | 124 +++++++++++++++++---------- pandas/core/arrays/arrow/numeric.py | 69 +++++++++++++++ 2 files changed, 146 insertions(+), 47 deletions(-) create mode 100644 pandas/core/arrays/arrow/numeric.py diff --git a/pandas/core/arrays/arrow/floating.py b/pandas/core/arrays/arrow/floating.py index dc3d8c4131724..aa1e82eeaa79b 100644 --- a/pandas/core/arrays/arrow/floating.py +++ b/pandas/core/arrays/arrow/floating.py @@ -1,69 +1,99 @@ from __future__ import annotations -from typing import ( - Any, - Callable, - TypeVar, -) - import pyarrow as pa -from pandas.errors import AbstractMethodError -from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.base import register_extension_dtype -from pandas.core.arrays.arrow.array import ArrowExtensionArray -from pandas.core.arrays.arrow.dtype import ArrowDtype - -T = TypeVar("T", bound="FloatingArrowArray") +from pandas.core.arrays.arrow.numeric import ( + FloatingArrowArray, + FloatingArrowDtype, +) -class FloatingArrowDtype(ArrowDtype): - _default_pa_dtype: pa.null() - _dtype_checker: Callable[[Any], bool] # pa.types.is_ +class FloatingArrowDtype(NumericArrowDtype): + """ + An ExtensionDtype to hold a single size & kind of integer Arrow dtype. + These specific implementations are subclasses of the non-public + FloatingArrowDtype. + """ - @property - def _is_numeric(self) -> bool: - return False + _default_pa_dtype = pa.float64() + _dtype_checker = pa.types.is_floating - @property - def _is_float(self) -> bool: - return True + @classmethod + def construct_array_type(cls) -> type[FloatingArrowArray]: + """ + Return the array type associated with this dtype. + Returns + ------- + type + """ + return FloatingArrowArray @classmethod def _str_to_dtype_mapping(cls): - raise AbstractMethodError(cls) + return INT_STR_TO_DTYPE -class FloatingArrowArray(ArrowExtensionArray): +class FloatingArrowArray(NumericArrowArray): """ - Base class for Floating dtypes. + Array of pyarrow integer values. + To construct an IntegerArray from generic array-like ipaut, use + :func:`pandas.array` with one of the integer dtypes (see examples). + Parameters + ---------- + values : pa.ChunkedArray + A 1-d integer-dtype array. + Attributes + ---------- + None + Methods + ------- + None + Returns + ------- + FloatingArrowArray """ - _dtype_cls: type[FloatingArrowDtype] + _dtype_cls = FloatingArrowDtype - def __init__(self, values: pa.ChunkedArray) -> None: - checker = self._dtype_cls._dtype_checker - if not (isinstance(values, pa.ChunkedArray) and checker(values.type)): - descr = ( - "floating" - ) - raise TypeError(f"values should be {descr} arrow array.") - super().__init__(values) - @cache_readonly - def dtype(self) -> FloatingArrowDtype: - mapping = self._dtype_cls._str_to_dtype_mapping() - return mapping[str(self._data.type)] +_dtype_docstring = """ +An ExtensionDtype for {dtype} integer pyarrow data. +Attributes +---------- +None +Methods +------- +None +""" - @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): - if dtype is None: - dtype = cls._dtype_cls._default_pa_dtype - return cls(pa.chunked_array([scalars], type=dtype.type)) +# create the Dtype + + +@register_extension_dtype +class Float16ArrowDtype(FloatingArrowDtype): + type = pa.float16() + name = "float16" + __doc__ = _dtype_docstring.format(dtype="float16") + + +@register_extension_dtype +class Float32ArrowDtype(FloatingArrowDtype): + type = pa.float32() + name = "float32" + __doc__ = _dtype_docstring.format(dtype="float32") + + +@register_extension_dtype +class Float64ArrowDtype(FloatingArrowDtype): + type = pa.float64() + name = "float64" + __doc__ = _dtype_docstring.format(dtype="float64") - @classmethod - def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False): - from pandas.core.tools.numeric import to_numeric - scalars = to_numeric(strings, errors="raise") - return cls._from_sequence(scalars, dtype=dtype, copy=copy) \ No newline at end of file +INT_STR_TO_DTYPE: dict[str, FloatingArrowDtype] = { + "float16": Float16ArrowDtype(), + "float32": Float32ArrowDtype(), + "float64": Float64ArrowDtype(), +} \ No newline at end of file diff --git a/pandas/core/arrays/arrow/numeric.py b/pandas/core/arrays/arrow/numeric.py new file mode 100644 index 0000000000000..dc3d8c4131724 --- /dev/null +++ b/pandas/core/arrays/arrow/numeric.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from typing import ( + Any, + Callable, + TypeVar, +) + +import pyarrow as pa + +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly + +from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.arrays.arrow.dtype import ArrowDtype + +T = TypeVar("T", bound="FloatingArrowArray") + + +class FloatingArrowDtype(ArrowDtype): + _default_pa_dtype: pa.null() + _dtype_checker: Callable[[Any], bool] # pa.types.is_ + + @property + def _is_numeric(self) -> bool: + return False + + @property + def _is_float(self) -> bool: + return True + + @classmethod + def _str_to_dtype_mapping(cls): + raise AbstractMethodError(cls) + + +class FloatingArrowArray(ArrowExtensionArray): + """ + Base class for Floating dtypes. + """ + + _dtype_cls: type[FloatingArrowDtype] + + def __init__(self, values: pa.ChunkedArray) -> None: + checker = self._dtype_cls._dtype_checker + if not (isinstance(values, pa.ChunkedArray) and checker(values.type)): + descr = ( + "floating" + ) + raise TypeError(f"values should be {descr} arrow array.") + super().__init__(values) + + @cache_readonly + def dtype(self) -> FloatingArrowDtype: + mapping = self._dtype_cls._str_to_dtype_mapping() + return mapping[str(self._data.type)] + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): + if dtype is None: + dtype = cls._dtype_cls._default_pa_dtype + return cls(pa.chunked_array([scalars], type=dtype.type)) + + @classmethod + def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False): + from pandas.core.tools.numeric import to_numeric + + scalars = to_numeric(strings, errors="raise") + return cls._from_sequence(scalars, dtype=dtype, copy=copy) \ No newline at end of file From f8e5551312334ba67244c135fcae647e5a3376e2 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Wed, 11 May 2022 22:09:57 -0400 Subject: [PATCH 05/19] more dtype checking --- pandas/core/arrays/arrow/array.py | 28 +++++++++++++++++++++++++++- pandas/core/arrays/arrow/dtype.py | 31 ++++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fdd505e259dd9..b805e82660f3d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -39,6 +39,7 @@ import pyarrow.compute as pc from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning + from pandas.core.arrays.arrow.dtype import ArrowDtype if TYPE_CHECKING: from pandas import Series @@ -53,8 +54,11 @@ class ArrowExtensionArray(ExtensionArray): _data: pa.ChunkedArray - def __init__(self, values: pa.ChunkedArray) -> None: + def __init__(self, values: pa.ChunkedArray, pa_dtype: pa.DataType) -> None: self._data = values + self.storage = storage + self._dtype = ArrowDtype(storage="pyarrow", pa_dtype=pa_dtype) + def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray.""" @@ -468,3 +472,25 @@ def _replace_with_indices( return pc.if_else(mask, None, chunk) return pc.replace_with_mask(chunk, mask, value) + + @cache_readonly + def dtype(self) -> NumericArrowDtype: + return self._dtype + + @classmethod + def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False): + if self.dtype._is_numeric: + from pandas.core.tools.numeric import to_numeric + + scalars = to_numeric(strings, errors="raise") + elif self.dtype._is_temporal: + from pandas.core.tools.datetimes import to_datetime + + scalars = to_datetime(strings, error="raise") + return cls._from_sequence(scalars, dtype=dtype, copy=copy) + + def mean(self, skipna=True): + if self.dtype._is_numeric: + return pa.compute.mean(self._data, skip_nulls=skipna) + else: + raise TypeError(f"Cannot compute mean from '{string}'") diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index c0ecb0856f27f..ca75f33efc725 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -23,9 +23,38 @@ class ArrowDtype(StorageExtensionDtype): na_value = pa.NA - def __init__(self, storage="pyarrow") -> None: + def __init__(self, pa_dtype, storage="pyarrow") -> None: + self.pa_dtype = pa_dtype + self.storage = storage super().__init__(storage) + def _is_numeric(self): + return pa.types.is_integer(self.pa_dtype) or pa.types.is_float(self.pa_dtype) + + def _is_integer(self): + return pa.types.is_integer(self.pa_dtype) + + def _is_boolean(self): + return pa.types.is_boolean(self.pa_dtype) + + def _is_floating(self): + return pa.types.is_floating(self.pa_dtype) + + def _is_temporal(self): + return pa.types.is_temporal(self.pa_dtype) + + def _is_floating(self): + return pa.types.is_floating(self.pa_dtype) + + def _is_date(self): + return pa.types.is_date(self.pa_dtype) + + def _is_time(self): + return pa.types.is_time(self.pa_dtype) + + def _is_string(self): + return pa.types.is_string(self.pa_dtype) + @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" From f35674420fb820d55df2ff48002a702dad5a8a53 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Wed, 11 May 2022 22:25:23 -0400 Subject: [PATCH 06/19] change imports: --- pandas/core/arrays/arrow/floating.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/floating.py b/pandas/core/arrays/arrow/floating.py index aa1e82eeaa79b..ac2d60a4e0316 100644 --- a/pandas/core/arrays/arrow/floating.py +++ b/pandas/core/arrays/arrow/floating.py @@ -4,21 +4,20 @@ from pandas.core.dtypes.base import register_extension_dtype -from pandas.core.arrays.arrow.numeric import ( +from pandas.core.arrays.arrow.dtype import ( + FloatingArrowArray, FloatingArrowArray, - FloatingArrowDtype, ) -class FloatingArrowDtype(NumericArrowDtype): +class FloatingArrowDtype(FloatingArrowArray): """ An ExtensionDtype to hold a single size & kind of integer Arrow dtype. These specific implementations are subclasses of the non-public FloatingArrowDtype. """ - _default_pa_dtype = pa.float64() - _dtype_checker = pa.types.is_floating + self.pa_dtype = pa.float64() @classmethod def construct_array_type(cls) -> type[FloatingArrowArray]: From d414a9e46e58aab501ffa94f42ffa0d56ebfb264 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Wed, 11 May 2022 22:28:40 -0400 Subject: [PATCH 07/19] add float 16 arrow dtype --- pandas/__init__.py | 1 + pandas/core/api.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/__init__.py b/pandas/__init__.py index 7f1d3732ed008..f31d021119598 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -57,6 +57,7 @@ UInt64Dtype, Float32Dtype, Float64Dtype, + Float16ArrowDtype, Float32ArrowDtype, Float64ArrowDtype, CategoricalDtype, diff --git a/pandas/core/api.py b/pandas/core/api.py index cbea384b7ad8e..4f7f726bace85 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -33,6 +33,7 @@ Float64Dtype, ) from pandas.core.arrays.arrow.floating import ( + Float16ArrowDtype, Float32ArrowDtype, Float64ArrowDtype, ) From ad867311a5771b878a1484f3ef33fe56547c2eec Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Wed, 11 May 2022 22:36:48 -0400 Subject: [PATCH 08/19] fix return --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b805e82660f3d..4ba82dc8a9853 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -474,7 +474,7 @@ def _replace_with_indices( return pc.replace_with_mask(chunk, mask, value) @cache_readonly - def dtype(self) -> NumericArrowDtype: + def dtype(self) -> ArrowDtype: return self._dtype @classmethod From 39c01ac3eda61336fe38c57efb5ca21daeb300ec Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Wed, 11 May 2022 22:42:05 -0400 Subject: [PATCH 09/19] add more methods --- pandas/core/arrays/arrow/array.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4ba82dc8a9853..0844225a3618b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -494,3 +494,27 @@ def mean(self, skipna=True): return pa.compute.mean(self._data, skip_nulls=skipna) else: raise TypeError(f"Cannot compute mean from '{string}'") + + def max(self, skipna=True): + if self.dtype._is_numeric: + return pa.compute.max(self._data, skip_nulls=skipna) + else: + raise TypeError(f"Cannot compute max from '{string}'") + + def min(self, skipna=True): + if self.dtype._is_numeric: + return pa.compute.min(self._data, skip_nulls=skipna) + else: + raise TypeError(f"Cannot compute min from '{string}'") + + def mode(self, skipna=True): + if self.dtype._is_numeric: + return pa.compute.mode(self._data, skip_nulls=skipna) + else: + raise TypeError(f"Cannot compute mode from '{string}'") + + def quantile(self, q=0.5, interpolation='linear'): + if self.dtype._is_numeric: + return pa.compute.quantile(self._data, q=q, interpolation=interpolation) + else: + raise TypeError(f"Cannot compute quantile from '{string}'") From 604f05acbde781181bf94abbceff009a11c447bf Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Wed, 11 May 2022 23:35:30 -0400 Subject: [PATCH 10/19] update docstrings --- pandas/core/arrays/arrow/floating.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/floating.py b/pandas/core/arrays/arrow/floating.py index ac2d60a4e0316..1d7dcbc0731d0 100644 --- a/pandas/core/arrays/arrow/floating.py +++ b/pandas/core/arrays/arrow/floating.py @@ -12,7 +12,7 @@ class FloatingArrowDtype(FloatingArrowArray): """ - An ExtensionDtype to hold a single size & kind of integer Arrow dtype. + An ExtensionDtype to hold a single size & kind of floating Arrow dtype. These specific implementations are subclasses of the non-public FloatingArrowDtype. """ @@ -36,13 +36,13 @@ def _str_to_dtype_mapping(cls): class FloatingArrowArray(NumericArrowArray): """ - Array of pyarrow integer values. - To construct an IntegerArray from generic array-like ipaut, use - :func:`pandas.array` with one of the integer dtypes (see examples). + Array of pyarrow floating values. + To construct an FloatingArray from generic array-like ipaut, use + :func:`pandas.array` with one of the floating dtypes (see examples). Parameters ---------- values : pa.ChunkedArray - A 1-d integer-dtype array. + A 1-d floating-dtype array. Attributes ---------- None From e09dfb8b0f1a7287fa4aec1c2f85fb686b351206 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Sat, 14 May 2022 22:23:34 -0400 Subject: [PATCH 11/19] remove floating.py --- pandas/core/arrays/arrow/floating.py | 98 ---------------------------- 1 file changed, 98 deletions(-) delete mode 100644 pandas/core/arrays/arrow/floating.py diff --git a/pandas/core/arrays/arrow/floating.py b/pandas/core/arrays/arrow/floating.py deleted file mode 100644 index 1d7dcbc0731d0..0000000000000 --- a/pandas/core/arrays/arrow/floating.py +++ /dev/null @@ -1,98 +0,0 @@ -from __future__ import annotations - -import pyarrow as pa - -from pandas.core.dtypes.base import register_extension_dtype - -from pandas.core.arrays.arrow.dtype import ( - FloatingArrowArray, - FloatingArrowArray, -) - - -class FloatingArrowDtype(FloatingArrowArray): - """ - An ExtensionDtype to hold a single size & kind of floating Arrow dtype. - These specific implementations are subclasses of the non-public - FloatingArrowDtype. - """ - - self.pa_dtype = pa.float64() - - @classmethod - def construct_array_type(cls) -> type[FloatingArrowArray]: - """ - Return the array type associated with this dtype. - Returns - ------- - type - """ - return FloatingArrowArray - - @classmethod - def _str_to_dtype_mapping(cls): - return INT_STR_TO_DTYPE - - -class FloatingArrowArray(NumericArrowArray): - """ - Array of pyarrow floating values. - To construct an FloatingArray from generic array-like ipaut, use - :func:`pandas.array` with one of the floating dtypes (see examples). - Parameters - ---------- - values : pa.ChunkedArray - A 1-d floating-dtype array. - Attributes - ---------- - None - Methods - ------- - None - Returns - ------- - FloatingArrowArray - """ - - _dtype_cls = FloatingArrowDtype - - -_dtype_docstring = """ -An ExtensionDtype for {dtype} integer pyarrow data. -Attributes ----------- -None -Methods -------- -None -""" - -# create the Dtype - - -@register_extension_dtype -class Float16ArrowDtype(FloatingArrowDtype): - type = pa.float16() - name = "float16" - __doc__ = _dtype_docstring.format(dtype="float16") - - -@register_extension_dtype -class Float32ArrowDtype(FloatingArrowDtype): - type = pa.float32() - name = "float32" - __doc__ = _dtype_docstring.format(dtype="float32") - - -@register_extension_dtype -class Float64ArrowDtype(FloatingArrowDtype): - type = pa.float64() - name = "float64" - __doc__ = _dtype_docstring.format(dtype="float64") - - -INT_STR_TO_DTYPE: dict[str, FloatingArrowDtype] = { - "float16": Float16ArrowDtype(), - "float32": Float32ArrowDtype(), - "float64": Float64ArrowDtype(), -} \ No newline at end of file From 8fcf3a2745a4d0e66475e1d86301888690214915 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Sat, 14 May 2022 22:24:14 -0400 Subject: [PATCH 12/19] fix attr --- pandas/core/arrays/arrow/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/numeric.py b/pandas/core/arrays/arrow/numeric.py index dc3d8c4131724..66d85d8099112 100644 --- a/pandas/core/arrays/arrow/numeric.py +++ b/pandas/core/arrays/arrow/numeric.py @@ -23,7 +23,7 @@ class FloatingArrowDtype(ArrowDtype): @property def _is_numeric(self) -> bool: - return False + return True @property def _is_float(self) -> bool: From 6047ed07370b20a372294f9f1238a76e2e4c8c9f Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Sat, 14 May 2022 22:29:27 -0400 Subject: [PATCH 13/19] lint --- pandas/core/arrays/arrow/array.py | 5 ++--- pandas/core/arrays/arrow/numeric.py | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0844225a3618b..b680b5af091bd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -59,7 +59,6 @@ def __init__(self, values: pa.ChunkedArray, pa_dtype: pa.DataType) -> None: self.storage = storage self._dtype = ArrowDtype(storage="pyarrow", pa_dtype=pa_dtype) - def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray.""" return self._data @@ -494,7 +493,7 @@ def mean(self, skipna=True): return pa.compute.mean(self._data, skip_nulls=skipna) else: raise TypeError(f"Cannot compute mean from '{string}'") - + def max(self, skipna=True): if self.dtype._is_numeric: return pa.compute.max(self._data, skip_nulls=skipna) @@ -513,7 +512,7 @@ def mode(self, skipna=True): else: raise TypeError(f"Cannot compute mode from '{string}'") - def quantile(self, q=0.5, interpolation='linear'): + def quantile(self, q=0.5, interpolation="linear"): if self.dtype._is_numeric: return pa.compute.quantile(self._data, q=q, interpolation=interpolation) else: diff --git a/pandas/core/arrays/arrow/numeric.py b/pandas/core/arrays/arrow/numeric.py index 66d85d8099112..c02b6cefec6b3 100644 --- a/pandas/core/arrays/arrow/numeric.py +++ b/pandas/core/arrays/arrow/numeric.py @@ -44,9 +44,7 @@ class FloatingArrowArray(ArrowExtensionArray): def __init__(self, values: pa.ChunkedArray) -> None: checker = self._dtype_cls._dtype_checker if not (isinstance(values, pa.ChunkedArray) and checker(values.type)): - descr = ( - "floating" - ) + descr = "floating" raise TypeError(f"values should be {descr} arrow array.") super().__init__(values) @@ -66,4 +64,4 @@ def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False): from pandas.core.tools.numeric import to_numeric scalars = to_numeric(strings, errors="raise") - return cls._from_sequence(scalars, dtype=dtype, copy=copy) \ No newline at end of file + return cls._from_sequence(scalars, dtype=dtype, copy=copy) From 3bd3a9f28b31b37b446ceed944afad45f19d891f Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Sat, 14 May 2022 23:21:24 -0400 Subject: [PATCH 14/19] lint --- doc/source/user_guide/style.ipynb | 634 +++++++++++++++++------------- pandas/__init__.py | 7 +- pandas/core/api.py | 10 +- pandas/core/arrays/arrow/array.py | 30 +- pandas/core/arrays/arrow/dtype.py | 4 +- 5 files changed, 395 insertions(+), 290 deletions(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 85c552a7d596f..1b54ec9e13b7c 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -37,6 +37,7 @@ "outputs": [], "source": [ "import matplotlib.pyplot\n", + "\n", "# We have this here to trigger matplotlib's font cache stuff.\n", "# This cell is hidden from the output" ] @@ -51,9 +52,16 @@ "import numpy as np\n", "import matplotlib as mpl\n", "\n", - "df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan],[19, 439, 6, 452, 226,232]], \n", - " index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'), \n", - " columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'],['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))\n", + "df = pd.DataFrame(\n", + " [[38.0, 2.0, 18.0, 22.0, 21, np.nan], [19, 439, 6, 452, 226, 232]],\n", + " index=pd.Index(\n", + " [\"Tumour (Positive)\", \"Non-Tumour (Negative)\"], name=\"Actual Label:\"\n", + " ),\n", + " columns=pd.MultiIndex.from_product(\n", + " [[\"Decision Tree\", \"Regression\", \"Random\"], [\"Tumour\", \"Non-Tumour\"]],\n", + " names=[\"Model:\", \"Predicted:\"],\n", + " ),\n", + ")\n", "df.style" ] }, @@ -75,63 +83,66 @@ "outputs": [], "source": [ "# Hidden cell to just create the below example: code is covered throughout the guide.\n", - "s = df.style\\\n", - " .hide_columns([('Random', 'Tumour'), ('Random', 'Non-Tumour')])\\\n", - " .format('{:.0f}')\\\n", - " .set_table_styles([{\n", - " 'selector': '',\n", - " 'props': 'border-collapse: separate;'\n", - " },{\n", - " 'selector': 'caption',\n", - " 'props': 'caption-side: bottom; font-size:1.3em;'\n", - " },{\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", - " },{\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", - " },{\n", - " 'selector': 'th.col_heading',\n", - " 'props': 'text-align: center;'\n", - " },{\n", - " 'selector': 'th.col_heading.level0',\n", - " 'props': 'font-size: 1.5em;'\n", - " },{\n", - " 'selector': 'th.col2',\n", - " 'props': 'border-left: 1px solid white;'\n", - " },{\n", - " 'selector': '.col2',\n", - " 'props': 'border-left: 1px solid #000066;'\n", - " },{\n", - " 'selector': 'td',\n", - " 'props': 'text-align: center; font-weight:bold;'\n", - " },{\n", - " 'selector': '.true',\n", - " 'props': 'background-color: #e6ffe6;'\n", - " },{\n", - " 'selector': '.false',\n", - " 'props': 'background-color: #ffe6e6;'\n", - " },{\n", - " 'selector': '.border-red',\n", - " 'props': 'border: 2px dashed red;'\n", - " },{\n", - " 'selector': '.border-green',\n", - " 'props': 'border: 2px dashed green;'\n", - " },{\n", - " 'selector': 'td:hover',\n", - " 'props': 'background-color: #ffffb3;'\n", - " }])\\\n", - " .set_td_classes(pd.DataFrame([['true border-green', 'false', 'true', 'false border-red', '', ''],\n", - " ['false', 'true', 'false', 'true', '', '']], \n", - " index=df.index, columns=df.columns))\\\n", - " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\\\n", - " .set_tooltips(pd.DataFrame([['This model has a very strong true positive rate', '', '', \"This model's total number of false negatives is too high\", '', ''],\n", - " ['', '', '', '', '', '']], \n", - " index=df.index, columns=df.columns),\n", - " css_class='pd-tt', props=\n", - " 'visibility: hidden; position: absolute; z-index: 1; border: 1px solid #000066;'\n", - " 'background-color: white; color: #000066; font-size: 0.8em;' \n", - " 'transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;')\n" + "s = (\n", + " df.style.hide_columns([(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")])\n", + " .format(\"{:.0f}\")\n", + " .set_table_styles(\n", + " [\n", + " {\"selector\": \"\", \"props\": \"border-collapse: separate;\"},\n", + " {\"selector\": \"caption\", \"props\": \"caption-side: bottom; font-size:1.3em;\"},\n", + " {\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", + " },\n", + " {\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", + " },\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"th.col2\", \"props\": \"border-left: 1px solid white;\"},\n", + " {\"selector\": \".col2\", \"props\": \"border-left: 1px solid #000066;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight:bold;\"},\n", + " {\"selector\": \".true\", \"props\": \"background-color: #e6ffe6;\"},\n", + " {\"selector\": \".false\", \"props\": \"background-color: #ffe6e6;\"},\n", + " {\"selector\": \".border-red\", \"props\": \"border: 2px dashed red;\"},\n", + " {\"selector\": \".border-green\", \"props\": \"border: 2px dashed green;\"},\n", + " {\"selector\": \"td:hover\", \"props\": \"background-color: #ffffb3;\"},\n", + " ]\n", + " )\n", + " .set_td_classes(\n", + " pd.DataFrame(\n", + " [\n", + " [\"true border-green\", \"false\", \"true\", \"false border-red\", \"\", \"\"],\n", + " [\"false\", \"true\", \"false\", \"true\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " )\n", + " )\n", + " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\n", + " .set_tooltips(\n", + " pd.DataFrame(\n", + " [\n", + " [\n", + " \"This model has a very strong true positive rate\",\n", + " \"\",\n", + " \"\",\n", + " \"This model's total number of false negatives is too high\",\n", + " \"\",\n", + " \"\",\n", + " ],\n", + " [\"\", \"\", \"\", \"\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " ),\n", + " css_class=\"pd-tt\",\n", + " props=\"visibility: hidden; position: absolute; z-index: 1; border: 1px solid #000066;\"\n", + " \"background-color: white; color: #000066; font-size: 0.8em;\"\n", + " \"transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;\",\n", + " )\n", + ")" ] }, { @@ -167,10 +178,15 @@ "metadata": {}, "outputs": [], "source": [ - "df.style.format(precision=0, na_rep='MISSING', thousands=\" \",\n", - " formatter={('Decision Tree', 'Tumour'): \"{:.2f}\",\n", - " ('Regression', 'Non-Tumour'): lambda x: \"$ {:,.1f}\".format(x*-1e6)\n", - " })" + "df.style.format(\n", + " precision=0,\n", + " na_rep=\"MISSING\",\n", + " thousands=\" \",\n", + " formatter={\n", + " (\"Decision Tree\", \"Tumour\"): \"{:.2f}\",\n", + " (\"Regression\", \"Non-Tumour\"): lambda x: \"$ {:,.1f}\".format(x * -1e6),\n", + " },\n", + ")" ] }, { @@ -186,17 +202,21 @@ "metadata": {}, "outputs": [], "source": [ - "weather_df = pd.DataFrame(np.random.rand(10,2)*5, \n", - " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", - " columns=[\"Tokyo\", \"Beijing\"])\n", + "weather_df = pd.DataFrame(\n", + " np.random.rand(10, 2) * 5,\n", + " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", + " columns=[\"Tokyo\", \"Beijing\"],\n", + ")\n", + "\n", "\n", - "def rain_condition(v): \n", + "def rain_condition(v):\n", " if v < 1.75:\n", " return \"Dry\"\n", " elif v < 2.75:\n", " return \"Rain\"\n", " return \"Heavy Rain\"\n", "\n", + "\n", "def make_pretty(styler):\n", " styler.set_caption(\"Weather Conditions\")\n", " styler.format(rain_condition)\n", @@ -204,6 +224,7 @@ " styler.background_gradient(axis=None, vmin=1, vmax=5, cmap=\"YlGnBu\")\n", " return styler\n", "\n", + "\n", "weather_df" ] }, @@ -241,7 +262,9 @@ "metadata": {}, "outputs": [], "source": [ - "s = df.style.format('{:.0f}').hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis=\"columns\")\n", + "s = df.style.format(\"{:.0f}\").hide(\n", + " [(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\"\n", + ")\n", "s" ] }, @@ -253,8 +276,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_hide')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_hide\")" ] }, { @@ -311,16 +334,16 @@ "outputs": [], "source": [ "cell_hover = { # for row hover use instead of \n", - " 'selector': 'td:hover',\n", - " 'props': [('background-color', '#ffffb3')]\n", + " \"selector\": \"td:hover\",\n", + " \"props\": [(\"background-color\", \"#ffffb3\")],\n", "}\n", "index_names = {\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", "}\n", "headers = {\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", "}\n", "s.set_table_styles([cell_hover, index_names, headers])" ] @@ -333,8 +356,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles1')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles1\")" ] }, { @@ -350,11 +373,14 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles([\n", - " {'selector': 'th.col_heading', 'props': 'text-align: center;'},\n", - " {'selector': 'th.col_heading.level0', 'props': 'font-size: 1.5em;'},\n", - " {'selector': 'td', 'props': 'text-align: center; font-weight: bold;'},\n", - "], overwrite=False)" + "s.set_table_styles(\n", + " [\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight: bold;\"},\n", + " ],\n", + " overwrite=False,\n", + ")" ] }, { @@ -365,8 +391,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles2')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles2\")" ] }, { @@ -384,10 +410,16 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles({\n", - " ('Regression', 'Tumour'): [{'selector': 'th', 'props': 'border-left: 1px solid white'},\n", - " {'selector': 'td', 'props': 'border-left: 1px solid #000066'}]\n", - "}, overwrite=False, axis=0)" + "s.set_table_styles(\n", + " {\n", + " (\"Regression\", \"Tumour\"): [\n", + " {\"selector\": \"th\", \"props\": \"border-left: 1px solid white\"},\n", + " {\"selector\": \"td\", \"props\": \"border-left: 1px solid #000066\"},\n", + " ]\n", + " },\n", + " overwrite=False,\n", + " axis=0,\n", + ")" ] }, { @@ -398,8 +430,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('xyz01')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"xyz01\")" ] }, { @@ -424,7 +456,7 @@ "outputs": [], "source": [ "out = s.set_table_attributes('class=\"my-table-cls\"').to_html()\n", - "print(out[out.find('