From 967c8d05471b08a01cbfc81a153407b7329669ce Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Mar 2021 16:25:24 -0700 Subject: [PATCH 1/5] ENH: Categorical.empty --- pandas/core/arrays/_mixins.py | 22 ++++++ pandas/core/arrays/categorical.py | 25 ++++++ pandas/tests/arrays/test_ndarray_backed.py | 89 ++++++++++++++++++++++ 3 files changed, 136 insertions(+) create mode 100644 pandas/tests/arrays/test_ndarray_backed.py diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 678e532f05772..8a7a857824a7f 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -27,6 +27,7 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.missing import array_equivalent from pandas.core import missing @@ -447,3 +448,24 @@ def value_counts(self, dropna: bool = True): index_arr = self._from_backing_data(np.asarray(result.index._data)) index = Index(index_arr, name=result.index.name) return Series(result._values, index=index, name=result.name) + + # ------------------------------------------------------------------------ + # numpy-like methods + + @classmethod + def empty( + cls: Type[NDArrayBackedExtensionArrayT], shape: Shape, dtype: ExtensionDtype + ) -> NDArrayBackedExtensionArrayT: + """ + Analogous to np.empty(shape, dtype=dtype) + + Parameters + ---------- + shape : tuple[int] + dtype : ExtensionDtype + """ + # The base implementation uses a naive approach to find the dtype + # for the backing ndarray + arr = cls._from_sequence([], dtype=dtype) + backing = np.empty(shape, dtype=arr._ndarray.dtype) + return arr._from_backing_data(backing) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3c88590991d77..4a4064f1530fe 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -34,6 +34,7 @@ NpDtype, Ordered, Scalar, + Shape, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -1527,6 +1528,30 @@ def value_counts(self, dropna: bool = True): return Series(count, index=CategoricalIndex(ix), dtype="int64") + # error: Argument 2 of "empty" is incompatible with supertype + # "NDArrayBackedExtensionArray"; supertype defines the argument type as + # "ExtensionDtype" + @classmethod + def empty( # type: ignore[override] + cls: Type[Categorical], shape: Shape, dtype: CategoricalDtype + ) -> Categorical: + """ + Analogous to np.empty(shape, dtype=dtype) + + Parameters + ---------- + shape : tuple[int] + dtype : CategoricalDtype + """ + arr = cls._from_sequence([], dtype=dtype) + + # We have to use np.zeros instead of np.empty otherwise the resulting + # ndarray may contain codes not supported by this dtype, in which + # case repr(result) could segfault. + backing = np.zeros(shape, dtype=arr._ndarray.dtype) + + return arr._from_backing_data(backing) + def _internal_get_values(self): """ Return the values. diff --git a/pandas/tests/arrays/test_ndarray_backed.py b/pandas/tests/arrays/test_ndarray_backed.py new file mode 100644 index 0000000000000..6ef97191d2ca2 --- /dev/null +++ b/pandas/tests/arrays/test_ndarray_backed.py @@ -0,0 +1,89 @@ +""" +Tests for subclasses of NDArrayBackedExtensionArray +""" +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + date_range, +) +from pandas.core.arrays import ( + Categorical, + DatetimeArray, + PandasArray, + PeriodArray, + TimedeltaArray, +) + + +@pytest.fixture( + params=[Categorical, DatetimeArray, TimedeltaArray, PeriodArray, PandasArray] +) +def ea_subclass(request): + """ + Fixture for subclasses of NDArrayBackedExtensionArray. + """ + return request.param + + +class TestEmpty: + # def test_empty(self, ea_subclass): + + def test_empty_categorical(self): + ci = CategoricalIndex(["a", "b", "c"], ordered=True) + dtype = ci.dtype + + # case with int8 codes + shape = (4,) + result = Categorical.empty(shape, dtype=dtype) + assert isinstance(result, Categorical) + assert result.shape == shape + assert result._ndarray.dtype == np.int8 + + # case where repr would segfault if we didn't override base implementation + result = Categorical.empty((4096,), dtype=dtype) + assert isinstance(result, Categorical) + assert result.shape == (4096,) + assert result._ndarray.dtype == np.int8 + repr(result) + + # case with int16 codes + ci = CategoricalIndex(list(range(512)) * 4, ordered=False) + dtype = ci.dtype + result = Categorical.empty(shape, dtype=dtype) + assert isinstance(result, Categorical) + assert result.shape == shape + assert result._ndarray.dtype == np.int16 + + def test_empty_dt64tz(self): + dti = date_range("2016-01-01", periods=2, tz="Asia/Tokyo") + dtype = dti.dtype + + shape = (0,) + result = DatetimeArray.empty(shape, dtype=dtype) + assert result.dtype == dtype + assert isinstance(result, DatetimeArray) + assert result.shape == shape + + def test_empty_dt64(self): + shape = (3, 9) + result = DatetimeArray.empty(shape, dtype="datetime64[ns]") + assert isinstance(result, DatetimeArray) + assert result.shape == shape + + def test_empty_td64(self): + shape = (3, 9) + result = TimedeltaArray.empty(shape, dtype="m8[ns]") + assert isinstance(result, TimedeltaArray) + assert result.shape == shape + + def test_empty_pandas_array(self): + arr = PandasArray(np.array([1, 2])) + dtype = arr.dtype + + shape = (3, 9) + result = PandasArray.empty(shape, dtype=dtype) + assert isinstance(result, PandasArray) + assert result.dtype == dtype + assert result.shape == shape From 9565f8fce43a1bbc7a2dba8f69a6fd4ca6e87c16 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 29 Mar 2021 09:18:52 -0700 Subject: [PATCH 2/5] remove unused --- pandas/tests/arrays/test_ndarray_backed.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pandas/tests/arrays/test_ndarray_backed.py b/pandas/tests/arrays/test_ndarray_backed.py index 6ef97191d2ca2..dc957316f2ee5 100644 --- a/pandas/tests/arrays/test_ndarray_backed.py +++ b/pandas/tests/arrays/test_ndarray_backed.py @@ -2,7 +2,6 @@ Tests for subclasses of NDArrayBackedExtensionArray """ import numpy as np -import pytest from pandas import ( CategoricalIndex, @@ -12,24 +11,11 @@ Categorical, DatetimeArray, PandasArray, - PeriodArray, TimedeltaArray, ) -@pytest.fixture( - params=[Categorical, DatetimeArray, TimedeltaArray, PeriodArray, PandasArray] -) -def ea_subclass(request): - """ - Fixture for subclasses of NDArrayBackedExtensionArray. - """ - return request.param - - class TestEmpty: - # def test_empty(self, ea_subclass): - def test_empty_categorical(self): ci = CategoricalIndex(["a", "b", "c"], ordered=True) dtype = ci.dtype From 0fc821524c5a411444b7cee252c5568dc3f2569e Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 5 Apr 2021 15:48:31 -0700 Subject: [PATCH 3/5] ENH: implement EA.empty --- pandas/core/arrays/base.py | 15 +++++++++++++++ pandas/core/arrays/string_.py | 6 ++++++ pandas/tests/extension/arrow/test_bool.py | 4 ++++ pandas/tests/extension/base/constructors.py | 7 +++++++ pandas/tests/extension/test_categorical.py | 9 ++++++++- 5 files changed, 40 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c45528d657404..69d429c9a6bc1 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1310,6 +1310,21 @@ def delete(self: ExtensionArrayT, loc) -> ExtensionArrayT: indexer = np.delete(np.arange(len(self)), loc) return self.take(indexer) + @classmethod + def empty(cls, shape: Shape, dtype: ExtensionDtype): + """ + Create an ExtensionArray with the given shape and dtype. + """ + obj = cls._from_sequence([], dtype=dtype) + + taker = np.broadcast_to(np.intp(-1), shape) + result = obj.take(taker, allow_fill=True) + if not isinstance(result, cls) or dtype != result.dtype: + raise NotImplementedError( + f"Default 'empty' implementation is invalid for dtype='{dtype}'" + ) + return result + class ExtensionOpsMixin: """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 666afb65e19ff..dc3667a83be79 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -258,6 +258,12 @@ def _from_sequence_of_strings( ): return cls._from_sequence(strings, dtype=dtype, copy=copy) + @classmethod + def empty(cls, shape, dtype) -> StringArray: + values = np.empty(shape, dtype=object) + values[:] = libmissing.NA + return cls(values).astype(dtype, copy=False) + def __arrow_array__(self, type=None): """ Convert myself into a pyarrow Array. diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 829be279b45d3..6a16433aa0a32 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -82,6 +82,10 @@ def test_series_constructor_scalar_na_with_index(self, dtype, na_value): def test_construct_empty_dataframe(self, dtype): super().test_construct_empty_dataframe(dtype) + @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword") + def test_empty(self, dtype): + super().test_empty(dtype) + class TestReduce(base.BaseNoReduceTests): def test_reduce_series_boolean(self): diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index e2323620daa0e..472971cd5eddb 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -122,3 +122,10 @@ def test_construct_empty_dataframe(self, dtype): {"a": pd.array([], dtype=dtype)}, index=pd.Index([], dtype="object") ) self.assert_frame_equal(result, expected) + + def test_empty(self, dtype): + cls = dtype.construct_array_type() + result = cls.empty((4,), dtype=dtype) + + assert isinstance(result, cls) + assert result.dtype == dtype diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 3f1f2c02c79f7..43f0136bce906 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -117,7 +117,14 @@ def test_contains(self, data, data_missing): class TestConstructors(base.BaseConstructorsTests): - pass + def test_empty(self, dtype): + cls = dtype.construct_array_type() + result = cls.empty((4,), dtype=dtype) + + assert isinstance(result, cls) + # the dtype we passed is not initialized, so will not match the + # dtype on our result. + assert result.dtype == CategoricalDtype([]) class TestReshaping(base.BaseReshapingTests): From ec77a141de08da6d5bd84ec0667b3c733394f772 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 6 Apr 2021 10:31:40 -0700 Subject: [PATCH 4/5] post-merge fixup --- pandas/core/arrays/_mixins.py | 3 ++- pandas/core/arrays/categorical.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index f03d1609278b2..1f66270dcf40f 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -14,6 +14,7 @@ from pandas._typing import ( F, Shape, + type_t, ) from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -470,7 +471,7 @@ def value_counts(self, dropna: bool = True): @classmethod def empty( - cls: Type[NDArrayBackedExtensionArrayT], shape: Shape, dtype: ExtensionDtype + cls: type_t[NDArrayBackedExtensionArrayT], shape: Shape, dtype: ExtensionDtype ) -> NDArrayBackedExtensionArrayT: """ Analogous to np.empty(shape, dtype=dtype) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1a2641bf22056..e1bea15cba762 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -31,6 +31,7 @@ Ordered, Scalar, Shape, + type_t, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -1531,7 +1532,7 @@ def value_counts(self, dropna: bool = True): # "ExtensionDtype" @classmethod def empty( # type: ignore[override] - cls: Type[Categorical], shape: Shape, dtype: CategoricalDtype + cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype ) -> Categorical: """ Analogous to np.empty(shape, dtype=dtype) From 7d8328bec4285f5d6527bbee6755e4feb80dfe0c Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 11 Apr 2021 16:01:15 -0700 Subject: [PATCH 5/5] empty -> _empty --- pandas/core/arrays/_mixins.py | 2 +- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/categorical.py | 4 ++-- pandas/core/arrays/string_.py | 2 +- pandas/tests/arrays/test_ndarray_backed.py | 14 +++++++------- pandas/tests/extension/base/constructors.py | 2 +- pandas/tests/extension/test_categorical.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index add55406c1298..e97687de34273 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -472,7 +472,7 @@ def value_counts(self, dropna: bool = True): # numpy-like methods @classmethod - def empty( + def _empty( cls: type_t[NDArrayBackedExtensionArrayT], shape: Shape, dtype: ExtensionDtype ) -> NDArrayBackedExtensionArrayT: """ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index df8d8f5bcd6d7..02731bd4fbbc1 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1305,7 +1305,7 @@ def delete(self: ExtensionArrayT, loc) -> ExtensionArrayT: return self.take(indexer) @classmethod - def empty(cls, shape: Shape, dtype: ExtensionDtype): + def _empty(cls, shape: Shape, dtype: ExtensionDtype): """ Create an ExtensionArray with the given shape and dtype. """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4cfae56855133..f2b5ad447a0cf 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1527,11 +1527,11 @@ def value_counts(self, dropna: bool = True): return Series(count, index=CategoricalIndex(ix), dtype="int64") - # error: Argument 2 of "empty" is incompatible with supertype + # error: Argument 2 of "_empty" is incompatible with supertype # "NDArrayBackedExtensionArray"; supertype defines the argument type as # "ExtensionDtype" @classmethod - def empty( # type: ignore[override] + def _empty( # type: ignore[override] cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype ) -> Categorical: """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7ebf174a111d6..600aacec9c87a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -255,7 +255,7 @@ def _from_sequence_of_strings( return cls._from_sequence(strings, dtype=dtype, copy=copy) @classmethod - def empty(cls, shape, dtype) -> StringArray: + def _empty(cls, shape, dtype) -> StringArray: values = np.empty(shape, dtype=object) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) diff --git a/pandas/tests/arrays/test_ndarray_backed.py b/pandas/tests/arrays/test_ndarray_backed.py index dc957316f2ee5..c48fb7e78d45b 100644 --- a/pandas/tests/arrays/test_ndarray_backed.py +++ b/pandas/tests/arrays/test_ndarray_backed.py @@ -22,13 +22,13 @@ def test_empty_categorical(self): # case with int8 codes shape = (4,) - result = Categorical.empty(shape, dtype=dtype) + result = Categorical._empty(shape, dtype=dtype) assert isinstance(result, Categorical) assert result.shape == shape assert result._ndarray.dtype == np.int8 # case where repr would segfault if we didn't override base implementation - result = Categorical.empty((4096,), dtype=dtype) + result = Categorical._empty((4096,), dtype=dtype) assert isinstance(result, Categorical) assert result.shape == (4096,) assert result._ndarray.dtype == np.int8 @@ -37,7 +37,7 @@ def test_empty_categorical(self): # case with int16 codes ci = CategoricalIndex(list(range(512)) * 4, ordered=False) dtype = ci.dtype - result = Categorical.empty(shape, dtype=dtype) + result = Categorical._empty(shape, dtype=dtype) assert isinstance(result, Categorical) assert result.shape == shape assert result._ndarray.dtype == np.int16 @@ -47,20 +47,20 @@ def test_empty_dt64tz(self): dtype = dti.dtype shape = (0,) - result = DatetimeArray.empty(shape, dtype=dtype) + result = DatetimeArray._empty(shape, dtype=dtype) assert result.dtype == dtype assert isinstance(result, DatetimeArray) assert result.shape == shape def test_empty_dt64(self): shape = (3, 9) - result = DatetimeArray.empty(shape, dtype="datetime64[ns]") + result = DatetimeArray._empty(shape, dtype="datetime64[ns]") assert isinstance(result, DatetimeArray) assert result.shape == shape def test_empty_td64(self): shape = (3, 9) - result = TimedeltaArray.empty(shape, dtype="m8[ns]") + result = TimedeltaArray._empty(shape, dtype="m8[ns]") assert isinstance(result, TimedeltaArray) assert result.shape == shape @@ -69,7 +69,7 @@ def test_empty_pandas_array(self): dtype = arr.dtype shape = (3, 9) - result = PandasArray.empty(shape, dtype=dtype) + result = PandasArray._empty(shape, dtype=dtype) assert isinstance(result, PandasArray) assert result.dtype == dtype assert result.shape == shape diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 472971cd5eddb..56c3f8216f033 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -125,7 +125,7 @@ def test_construct_empty_dataframe(self, dtype): def test_empty(self, dtype): cls = dtype.construct_array_type() - result = cls.empty((4,), dtype=dtype) + result = cls._empty((4,), dtype=dtype) assert isinstance(result, cls) assert result.dtype == dtype diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 43f0136bce906..ea8b1cfb738f5 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -119,7 +119,7 @@ def test_contains(self, data, data_missing): class TestConstructors(base.BaseConstructorsTests): def test_empty(self, dtype): cls = dtype.construct_array_type() - result = cls.empty((4,), dtype=dtype) + result = cls._empty((4,), dtype=dtype) assert isinstance(result, cls) # the dtype we passed is not initialized, so will not match the