pandas-dev · jreback · Apr 13, 2021 · Mar 23, 2021 · Mar 24, 2021 · Mar 25, 2021
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
@@ -15,6 +15,7 @@
     F,
     PositionalIndexer2D,
     Shape,
+    type_t,
 )
 from pandas.compat.numpy import function as nv
 from pandas.errors import AbstractMethodError
@@ -28,6 +29,7 @@
 )
 
 from pandas.core.dtypes.common import is_dtype_equal
+from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.missing import array_equivalent
 
 from pandas.core import missing
@@ -465,3 +467,24 @@ def value_counts(self, dropna: bool = True):
         index_arr = self._from_backing_data(np.asarray(result.index._data))
         index = Index(index_arr, name=result.index.name)
         return Series(result._values, index=index, name=result.name)
+
+    # ------------------------------------------------------------------------
+    # numpy-like methods
+
+    @classmethod
+    def _empty(
+        cls: type_t[NDArrayBackedExtensionArrayT], shape: Shape, dtype: ExtensionDtype
+    ) -> NDArrayBackedExtensionArrayT:
+        """
+        Allocate an uninitialized array, like ``np.empty(shape, dtype=dtype)``.
+
+        Parameters
+        ----------
+        shape : tuple[int]
+        dtype : ExtensionDtype
+        """
+        # Naive lookup of the backing ndarray's dtype: build a zero-length
+        #  array with the requested dtype and read it off its ``_ndarray``.
+        template = cls._from_sequence([], dtype=dtype)
+        ndarray_dtype = template._ndarray.dtype
+        return template._from_backing_data(np.empty(shape, dtype=ndarray_dtype))
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1304,6 +1304,21 @@ def delete(self: ExtensionArrayT, loc) -> ExtensionArrayT:
         indexer = np.delete(np.arange(len(self)), loc)
         return self.take(indexer)
 
+    @classmethod
+    def _empty(cls, shape: Shape, dtype: ExtensionDtype):
+        """
+        Construct an ExtensionArray of the given shape and dtype.
+        """
+        # Generic fallback: start from a zero-length array of the requested
+        #  dtype and take position -1 with allow_fill=True for every slot.
+        template = cls._from_sequence([], dtype=dtype)
+        indexer = np.broadcast_to(np.intp(-1), shape)
+        filled = template.take(indexer, allow_fill=True)
+        if not isinstance(filled, cls) or dtype != filled.dtype:
+            msg = f"Default 'empty' implementation is invalid for dtype='{dtype}'"
+            raise NotImplementedError(msg)
+        return filled
+
 
 class ExtensionOpsMixin:
     """

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -30,6 +30,8 @@
     NpDtype,
     Ordered,
     Scalar,
+    Shape,
+    type_t,
 )
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import (
@@ -1525,6 +1527,30 @@ def value_counts(self, dropna: bool = True):
 
         return Series(count, index=CategoricalIndex(ix), dtype="int64")
 
+    # error: Argument 2 of "_empty" is incompatible with supertype
+    # "NDArrayBackedExtensionArray"; supertype defines the argument type as
+    # "ExtensionDtype"
+    @classmethod
+    def _empty(  # type: ignore[override]
+        cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype
+    ) -> Categorical:
+        """
+        Build a Categorical of the given shape, mirroring np.empty(shape, dtype).
+
+        Parameters
+        ----------
+        shape : tuple[int]
+        dtype : CategoricalDtype
+        """
+        # A zero-length Categorical tells us the dtype of the backing ndarray.
+        template = cls._from_sequence([], dtype=dtype)
+        codes_dtype = template._ndarray.dtype
+
+        # np.empty could leave codes in the ndarray that this dtype does not
+        #  support, in which case repr(result) could segfault; np.zeros is
+        #  used for the backing data instead.
+        return template._from_backing_data(np.zeros(shape, dtype=codes_dtype))
+
     def _internal_get_values(self):
         """
         Return the values.

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -254,6 +254,12 @@ def _from_sequence_of_strings(
     ):
         return cls._from_sequence(strings, dtype=dtype, copy=copy)
 
+    @classmethod
+    def _empty(cls, shape, dtype) -> StringArray:
+        """Return an array of ``shape`` built from an NA-filled object ndarray."""
+        na_filled = np.full(shape, libmissing.NA, dtype=object)
+        return cls(na_filled).astype(dtype, copy=False)
+
     def __arrow_array__(self, type=None):
         """
         Convert myself into a pyarrow Array.

diff --git a/pandas/tests/arrays/test_ndarray_backed.py b/pandas/tests/arrays/test_ndarray_backed.py
@@ -0,0 +1,75 @@
+"""
+Tests for subclasses of NDArrayBackedExtensionArray
+"""
+import numpy as np
+
+from pandas import (
+    CategoricalIndex,
+    date_range,
+)
+from pandas.core.arrays import (
+    Categorical,
+    DatetimeArray,
+    PandasArray,
+    TimedeltaArray,
+)
+
+
+class TestEmpty:
+    """Checks of ``_empty`` on NDArrayBackedExtensionArray subclasses."""
+
+    def test_empty_categorical(self):
+        shape = (4,)
+        int8_dtype = CategoricalIndex(["a", "b", "c"], ordered=True).dtype
+
+        # case with int8 codes
+        res = Categorical._empty(shape, dtype=int8_dtype)
+        assert isinstance(res, Categorical)
+        assert res.shape == shape
+        assert res._ndarray.dtype == np.int8
+
+        # case where repr would segfault if we didn't override base implementation
+        big_shape = (4096,)
+        res = Categorical._empty(big_shape, dtype=int8_dtype)
+        assert isinstance(res, Categorical)
+        assert res.shape == big_shape
+        assert res._ndarray.dtype == np.int8
+        repr(res)
+
+        # case with int16 codes
+        int16_dtype = CategoricalIndex(list(range(512)) * 4, ordered=False).dtype
+        res = Categorical._empty(shape, dtype=int16_dtype)
+        assert isinstance(res, Categorical)
+        assert res.shape == shape
+        assert res._ndarray.dtype == np.int16
+
+    def test_empty_dt64tz(self):
+        # tz-aware dtype combined with a zero-length shape
+        shape = (0,)
+        tz_dtype = date_range("2016-01-01", periods=2, tz="Asia/Tokyo").dtype
+
+        res = DatetimeArray._empty(shape, dtype=tz_dtype)
+        assert res.dtype == tz_dtype
+        assert isinstance(res, DatetimeArray)
+        assert res.shape == shape
+
+    def test_empty_dt64(self):
+        shape = (3, 9)
+        res = DatetimeArray._empty(shape, dtype="datetime64[ns]")
+        assert isinstance(res, DatetimeArray)
+        assert res.shape == shape
+
+    def test_empty_td64(self):
+        shape = (3, 9)
+        res = TimedeltaArray._empty(shape, dtype="m8[ns]")
+        assert isinstance(res, TimedeltaArray)
+        assert res.shape == shape
+
+    def test_empty_pandas_array(self):
+        shape = (3, 9)
+        np_dtype = PandasArray(np.array([1, 2])).dtype
+
+        res = PandasArray._empty(shape, dtype=np_dtype)
+        assert isinstance(res, PandasArray)
+        assert res.dtype == np_dtype
+        assert res.shape == shape
diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py
@@ -82,6 +82,10 @@ def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
     def test_construct_empty_dataframe(self, dtype):
         super().test_construct_empty_dataframe(dtype)
 
+    @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword")
+    def test_empty(self, dtype):
+        super().test_empty(dtype)  # parent test as-is; override only adds the xfail
+
 
 class TestReduce(base.BaseNoReduceTests):
     def test_reduce_series_boolean(self):

diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py
@@ -122,3 +122,10 @@ def test_construct_empty_dataframe(self, dtype):
             {"a": pd.array([], dtype=dtype)}, index=pd.Index([], dtype="object")
         )
         self.assert_frame_equal(result, expected)
+
+    def test_empty(self, dtype):
+        # _empty must give back the dtype's own array type, carrying that dtype
+        array_type = dtype.construct_array_type()
+        empty_arr = array_type._empty((4,), dtype=dtype)
+        assert isinstance(empty_arr, array_type)
+        assert empty_arr.dtype == dtype
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
@@ -117,7 +117,14 @@ def test_contains(self, data, data_missing):
 
 
 class TestConstructors(base.BaseConstructorsTests):
-    pass
+    def test_empty(self, dtype):
+        array_type = dtype.construct_array_type()
+        empty_arr = array_type._empty((4,), dtype=dtype)
+        assert isinstance(empty_arr, array_type)
+
+        # The dtype passed in is not initialized, hence it does not match the
+        #  dtype of the result; expect a dtype with an empty category list.
+        assert empty_arr.dtype == CategoricalDtype([])
 
 
 class TestReshaping(base.BaseReshapingTests):