From 11eff3e1c87eb73329673e4c83bb26146d00b67f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 21 Jan 2020 11:27:47 -0800 Subject: [PATCH 1/8] Make Series._values match Index._values --- pandas/core/apply.py | 1 + pandas/core/arrays/numpy_.py | 3 +-- pandas/core/base.py | 17 ++++------------- pandas/core/construction.py | 5 ++++- pandas/core/internals/blocks.py | 10 ++++++++++ pandas/tests/indexes/datetimes/test_tools.py | 2 +- pandas/tests/reductions/test_reductions.py | 15 +++++++++++++-- 7 files changed, 34 insertions(+), 19 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index ca1be3154757a..2bbe5cdd91bcf 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -280,6 +280,7 @@ def apply_standard(self): and not self.dtypes.apply(is_extension_array_dtype).any() # Disallow complex_internals since libreduction shortcut # cannot handle MultiIndex + and not self.dtypes.apply(lambda x: x.kind in ["m", "M"]).any() and not isinstance(self.agg_axis, ABCMultiIndex) ): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 4db3d3010adaf..075096f6cfb54 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -43,7 +43,6 @@ class PandasDtype(ExtensionDtype): def __init__(self, dtype): dtype = np.dtype(dtype) self._dtype = dtype - self._name = dtype.name self._type = dtype.type def __repr__(self) -> str: @@ -56,7 +55,7 @@ def numpy_dtype(self): @property def name(self): - return self._name + return self._dtype.name @property def type(self): diff --git a/pandas/core/base.py b/pandas/core/base.py index c6800d282700f..5275b157a86b3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -18,13 +18,11 @@ from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( is_categorical_dtype, - is_datetime64_ns_dtype, is_dict_like, is_extension_array_dtype, is_list_like, is_object_dtype, is_scalar, - is_timedelta64_ns_dtype, needs_i8_conversion, ) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries @@ -749,17 +747,7 @@ def array(self) -> ExtensionArray: # Special mixin syntax may be developed in the future: # https://github.com/python/typing/issues/246 result = self._values # type: ignore - - if is_datetime64_ns_dtype(result.dtype): - from pandas.arrays import DatetimeArray - - result = DatetimeArray(result) - elif is_timedelta64_ns_dtype(result.dtype): - from pandas.arrays import TimedeltaArray - - result = TimedeltaArray(result) - - elif not is_extension_array_dtype(result.dtype): + if isinstance(result, np.ndarray): from pandas.core.arrays.numpy_ import PandasArray result = PandasArray(result) @@ -1270,6 +1258,9 @@ def unique(self): if hasattr(values, "unique"): result = values.unique() + if self.dtype.kind in ["m", "M"]: + if getattr(self.dtype, "tz", None) is None: + result = np.asarray(result) else: result = unique1d(values) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f947a1fda49f1..e3b15b06d7bd6 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -377,7 +377,10 @@ def extract_array(obj, extract_numpy: bool = False): array([1, 2, 3]) """ if isinstance(obj, (ABCIndexClass, ABCSeries)): - obj = obj.array + arr = obj._values + if not extract_numpy and isinstance(arr, np.ndarray): + return obj.array + return arr if extract_numpy and isinstance(obj, ABCPandasArray): obj = obj.to_numpy() diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cb702a81d2bde..570f8c6ae1c27 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -12,6 +12,7 @@ import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare +from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -2112,6 +2113,13 @@ def get_values(self, dtype=None): return result.reshape(self.values.shape) return self.values + def internal_values(self): + return self._ea_values + + @cache_readonly + def _ea_values(self): + return self._holder(self.values) + class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () @@ -2148,6 +2156,7 @@ def _maybe_coerce_values(self, values): values = values._data assert isinstance(values, np.ndarray), type(values) + assert values.dtype == _NS_DTYPE, values.dtype return values def astype(self, dtype, copy: bool = False, errors: str = "raise"): @@ -2242,6 +2251,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): is_datetimetz = True is_extension = True + internal_values = Block.internal_values _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types fill_value = np.datetime64("NaT", "ns") diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index a5332eaea0432..7abf810e6bcfc 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1583,7 +1583,7 @@ def test_string_na_nat_conversion(self, cache): for i in range(5): x = series[i] if isna(x): - expected[i] = iNaT + expected[i] = pd.NaT else: expected[i] = to_datetime(x, cache=cache) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 7400b049961d5..8d2058ffab643 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -316,7 +316,12 @@ def test_invalid_td64_reductions(self, opname): ) td = s.diff() - msg = "reduction operation '{op}' not allowed for this dtype" + msg = "|".join( + [ + "reduction operation '{op}' not allowed for this dtype", + r"cannot perform {op} with type timedelta64\[ns\]", + ] + ) msg = msg.format(op=opname) with pytest.raises(TypeError, match=msg): @@ -648,7 +653,13 @@ def test_ops_consistency_on_empty(self, method): # timedelta64[ns] tdser = Series([], dtype="m8[ns]") if method == "var": - with pytest.raises(TypeError, match="operation 'var' not allowed"): + msg = "|".join( + [ + "operation 'var' not allowed", + r"cannot perform var with type timedelta64\[ns\]", + ] + ) + with pytest.raises(TypeError, match=msg): getattr(tdser, method)() else: result = getattr(tdser, method)() From 3b22d7e66dd4169d84cf702907d875794c9a1494 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 23 Jan 2020 17:24:52 -0800 Subject: [PATCH 2/8] rebase --- pandas/core/internals/blocks.py | 24 +----------------------- pandas/core/series.py | 5 ----- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f49037afe8e69..00b45bc776e73 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -67,14 +67,7 @@ ) import pandas.core.algorithms as algos -from pandas.core.arrays import ( - Categorical, - DatetimeArray, - ExtensionArray, - PandasArray, - PandasDtype, - TimedeltaArray, -) +from pandas.core.arrays import Categorical, DatetimeArray, PandasDtype, TimedeltaArray from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import extract_array @@ -217,12 +210,6 @@ def internal_values(self): """ return self.values - def array_values(self) -> ExtensionArray: - """ - The array that Series.array returns. Always an ExtensionArray. - """ - return PandasArray(self.values) - def get_values(self, dtype=None): """ return an internal format, currently just the ndarray @@ -1795,9 +1782,6 @@ def get_values(self, dtype=None): values = values.reshape((1,) + values.shape) return values - def array_values(self) -> ExtensionArray: - return self.values - def to_dense(self): return np.asarray(self.values) @@ -2269,9 +2253,6 @@ def set(self, locs, values): def external_values(self): return np.asarray(self.values.astype("datetime64[ns]", copy=False)) - def array_values(self) -> ExtensionArray: - return DatetimeArray._simple_new(self.values) - class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ @@ -2530,9 +2511,6 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): def external_values(self): return np.asarray(self.values.astype("timedelta64[ns]", copy=False)) - def array_values(self) -> ExtensionArray: - return TimedeltaArray._simple_new(self.values) - class BoolBlock(NumericBlock): __slots__ = () diff --git a/pandas/core/series.py b/pandas/core/series.py index d8eb98d06afd7..7e144efb4bcb5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -526,11 +526,6 @@ def _values(self): """ return self._data.internal_values() - @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore - @property - def array(self) -> ExtensionArray: - return self._data._block.array_values() - def _internal_get_values(self): """ Same as values (but handles sparseness conversions); is a view. From 91af8065fb39f9c8a6186a7718a686d42e462dfd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 24 Jan 2020 18:24:31 -0800 Subject: [PATCH 3/8] docstring, comments --- pandas/core/apply.py | 5 +++-- pandas/core/base.py | 3 ++- pandas/core/series.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2bbe5cdd91bcf..d768c859f049e 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -278,9 +278,10 @@ def apply_standard(self): if ( self.result_type in ["reduce", None] and not self.dtypes.apply(is_extension_array_dtype).any() - # Disallow complex_internals since libreduction shortcut - # cannot handle MultiIndex + # Disallow dtypes that have blocks backed by EAs and not self.dtypes.apply(lambda x: x.kind in ["m", "M"]).any() + # Disallow MultiIndex since libreduction shortcut + # cannot handle MultiIndex and not isinstance(self.agg_axis, ABCMultiIndex) ): diff --git a/pandas/core/base.py b/pandas/core/base.py index 8d16513a6c663..7a7416d3bb96b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1274,7 +1274,8 @@ def unique(self): if hasattr(values, "unique"): result = values.unique() - if self.dtype.kind in ["m", "M"]: + if self.dtype.kind in ["m", "M"] and isinstance(self, ABCSeries): + # GH#31182 Series._values returns EA, unpack for backward-compat if getattr(self.dtype, "tz", None) is None: result = np.asarray(result) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index 819a6ceb79cb1..ab0c12a54eb98 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -523,8 +523,9 @@ def _values(self): ----------- | ------------- | ------------- | ------------- | --------------- | Numeric | ndarray | ndarray | PandasArray | ndarray | Category | Categorical | Categorical | Categorical | ndarray[int] | - dt64[ns] | ndarray[M8ns] | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | + dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | + td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8bs] | ndarray[m8ns] | Period | ndarray[obj] | PeriodArray | PeriodArray | ndarray[int] | Nullable | EA | EA | EA | ndarray | From 5fd52c49eda5bb913dfaf08e74246006130e6e8d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 25 Jan 2020 11:47:56 -0800 Subject: [PATCH 4/8] restore array_values for perf --- pandas/core/base.py | 11 +---------- pandas/core/construction.py | 5 +---- pandas/core/internals/blocks.py | 28 +++++++++++++++++++++++++--- pandas/core/series.py | 5 +++++ 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 7a7416d3bb96b..05e3302abddbe 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -752,16 +752,7 @@ def array(self) -> ExtensionArray: [a, b, a] Categories (2, object): [a, b] """ - # As a mixin, we depend on the mixing class having _values. - # Special mixin syntax may be developed in the future: - # https://github.com/python/typing/issues/246 - result = self._values # type: ignore - if isinstance(result, np.ndarray): - from pandas.core.arrays.numpy_ import PandasArray - - result = PandasArray(result) - - return result + raise AbstractMethodError(self) def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): """ diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e3b15b06d7bd6..f947a1fda49f1 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -377,10 +377,7 @@ def extract_array(obj, extract_numpy: bool = False): array([1, 2, 3]) """ if isinstance(obj, (ABCIndexClass, ABCSeries)): - arr = obj._values - if not extract_numpy and isinstance(arr, np.ndarray): - return obj.array - return arr + obj = obj.array if extract_numpy and isinstance(obj, ABCPandasArray): obj = obj.to_numpy() diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 00b45bc776e73..322536642dd47 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -67,7 +67,14 @@ ) import pandas.core.algorithms as algos -from pandas.core.arrays import Categorical, DatetimeArray, PandasDtype, TimedeltaArray +from pandas.core.arrays import ( + Categorical, + DatetimeArray, + ExtensionArray, + PandasArray, + PandasDtype, + TimedeltaArray, +) from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import extract_array @@ -210,6 +217,16 @@ def internal_values(self): """ return self.values + def array_values(self) -> ExtensionArray: + """ + The array that Series.array returns. Always an ExtensionArray. + """ + return self._ea_values + + @cache_readonly + def _ea_values(self) -> ExtensionArray: + return PandasArray(self.values) + def get_values(self, dtype=None): """ return an internal format, currently just the ndarray @@ -1782,6 +1799,9 @@ def get_values(self, dtype=None): values = values.reshape((1,) + values.shape) return values + def array_values(self) -> ExtensionArray: + return self.values + def to_dense(self): return np.asarray(self.values) @@ -2123,12 +2143,15 @@ def get_values(self, dtype=None): return result.reshape(self.values.shape) return self.values + def array_values(self): + return self._ea_values + def internal_values(self): return self._ea_values @cache_readonly def _ea_values(self): - return self._holder(self.values) + return self._holder._simple_new(self.values) class DatetimeBlock(DatetimeLikeBlockMixin, Block): @@ -2166,7 +2189,6 @@ def _maybe_coerce_values(self, values): values = values._data assert isinstance(values, np.ndarray), type(values) - assert values.dtype == _NS_DTYPE, values.dtype return values def astype(self, dtype, copy: bool = False, errors: str = "raise"): diff --git a/pandas/core/series.py b/pandas/core/series.py index 1786a6f3f64e1..e14cb4953851f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -524,6 +524,11 @@ def _values(self): """ return self._data.internal_values() + @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore + @property + def array(self) -> ExtensionArray: + return self._data._block.array_values() + def _internal_get_values(self): """ Same as values (but handles sparseness conversions); is a view. From 75cb3f4b2924492c148f89ba40270a98bb6cd9e7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 25 Jan 2020 15:36:38 -0800 Subject: [PATCH 5/8] just do internal_values --- pandas/core/internals/blocks.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 322536642dd47..b2baee6acece3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -12,7 +12,6 @@ import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare -from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -221,10 +220,6 @@ def array_values(self) -> ExtensionArray: """ The array that Series.array returns. Always an ExtensionArray. """ - return self._ea_values - - @cache_readonly - def _ea_values(self) -> ExtensionArray: return PandasArray(self.values) def get_values(self, dtype=None): @@ -2143,15 +2138,9 @@ def get_values(self, dtype=None): return result.reshape(self.values.shape) return self.values - def array_values(self): - return self._ea_values - def internal_values(self): - return self._ea_values - - @cache_readonly - def _ea_values(self): - return self._holder._simple_new(self.values) + # Override to return DatetimeArray and TimedeltaArray + return self.array_values() class DatetimeBlock(DatetimeLikeBlockMixin, Block): @@ -2275,6 +2264,9 @@ def set(self, locs, values): def external_values(self): return np.asarray(self.values.astype("datetime64[ns]", copy=False)) + def array_values(self) -> ExtensionArray: + return DatetimeArray._simple_new(self.values) + class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ @@ -2533,6 +2525,9 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): def external_values(self): return np.asarray(self.values.astype("timedelta64[ns]", copy=False)) + def array_values(self) -> ExtensionArray: + return TimedeltaArray._simple_new(self.values) + class BoolBlock(NumericBlock): __slots__ = () From c270efa433288f87e07fa0f58db562bacf9c3592 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 26 Jan 2020 15:53:27 -0800 Subject: [PATCH 6/8] docstring update --- pandas/core/series.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index e14cb4953851f..09ed42f3bf3d7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -502,8 +502,9 @@ def _values(self): cases). Differs from ``.array`` in that this still returns the numpy array if - the Block is backed by a numpy array, while ``.array`` ensures to always - return an ExtensionArray. + the Block is backed by a numpy array (except for datetime64 and + timedelta64 dtypes), while ``.array`` ensures to always return an + ExtensionArray. Differs from ``._ndarray_values``, as that ensures to always return a numpy array (it will call ``_ndarray_values`` on the ExtensionArray, if @@ -517,7 +518,7 @@ def _values(self): Category | Categorical | Categorical | Categorical | ndarray[int] | dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | - td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8bs] | ndarray[m8ns] | + td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] | ndarray[m8ns] | Period | ndarray[obj] | PeriodArray | PeriodArray | ndarray[int] | Nullable | EA | EA | EA | ndarray | From 713bac24e55d0d014139c3b8b73d7c7cb916258b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 26 Jan 2020 16:46:49 -0800 Subject: [PATCH 7/8] better comment --- pandas/core/apply.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d768c859f049e..ed9f10803dec3 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -278,7 +278,8 @@ def apply_standard(self): if ( self.result_type in ["reduce", None] and not self.dtypes.apply(is_extension_array_dtype).any() - # Disallow dtypes that have blocks backed by EAs + # Disallow dtypes where setting _index_data will break + # ExtensionArray values, see GH#31182 and not self.dtypes.apply(lambda x: x.kind in ["m", "M"]).any() # Disallow MultiIndex since libreduction shortcut # cannot handle MultiIndex From 5c305ff0b21e0a5bef1de951cdeeda1729391d1e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 27 Jan 2020 20:26:39 -0800 Subject: [PATCH 8/8] update docstring --- pandas/core/series.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index d986478067ff8..0aaa583885bc3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -493,7 +493,8 @@ def _values(self): """ Return the internal repr of this data (defined by Block.interval_values). This are the values as stored in the Block (ndarray or ExtensionArray - depending on the Block class). + depending on the Block class), with datetime64[ns] and timedelta64[ns] + wrapped in ExtensionArrays to match Index._values behavior. Differs from the public ``.values`` for certain data types, because of historical backwards compatibility of the public attribute (e.g. period