From f25bcffc633cddf129458437f0efb20fce44878e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 28 Nov 2020 19:31:23 +0000 Subject: [PATCH 1/4] test-backportability-of-38120 --- pandas/core/algorithms.py | 32 +++++++++- pandas/core/arrays/datetimelike.py | 18 ++++++ .../tests/indexes/datetimes/test_datetime.py | 61 ++++++++++++------- .../indexes/timedeltas/test_timedelta.py | 11 +++- .../indexing/multiindex/test_multiindex.py | 10 +++ pandas/tests/window/common.py | 1 - 6 files changed, 108 insertions(+), 25 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 48d4fe65942fe..d8276e330f4bd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -46,11 +46,14 @@ pandas_dtype, ) from pandas.core.dtypes.generic import ( + ABCDatetimeArray, ABCExtensionArray, ABCIndex, ABCIndexClass, ABCMultiIndex, + ABCRangeIndex, ABCSeries, + ABCTimedeltaArray, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -191,8 +194,15 @@ def _reconstruct_data( ------- ExtensionArray or np.ndarray """ - if is_extension_array_dtype(dtype): - values = dtype.construct_array_type()._from_sequence(values) + if isinstance(values, ABCExtensionArray) and values.dtype == dtype: + # Catch DatetimeArray/TimedeltaArray + return values + elif is_extension_array_dtype(dtype): + cls = dtype.construct_array_type() + if isinstance(values, cls) and values.dtype == dtype: + return values + + values = cls._from_sequence(values) elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) @@ -652,8 +662,13 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. + if isinstance(values, ABCRangeIndex): + return values.factorize(sort=sort) + values = _ensure_arraylike(values) original = values + if not isinstance(values, ABCMultiIndex): + values = extract_array(values, extract_numpy=True) # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques # of values, assign na_sentinel=-1 to replace code value for NaN. @@ -662,6 +677,19 @@ def factorize( na_sentinel = -1 dropna = False + if ( + isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) + and values.freq is not None + ): + codes, uniques = values.factorize(sort=sort) + if isinstance(original, ABCIndexClass): + uniques = original._shallow_copy(uniques, name=None) + elif isinstance(original, ABCSeries): + from pandas import Index + + uniques = Index(uniques) + return codes, uniques + if is_extension_array_dtype(values.dtype): values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a10912aa45baa..0c5ac56ffe10a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1660,6 +1660,24 @@ def mean(self, skipna=True): # Don't have to worry about NA `result`, since no NA went in. return self._box_func(result) + # -------------------------------------------------------------- + + def factorize(self, na_sentinel=-1, sort: bool = False): + if self.freq is not None: + # We must be unique, so can short-circuit (and retain freq) + codes = np.arange(len(self), dtype=np.intp) + uniques = self.copy() # TODO: copy or view? + if sort and self.freq.n < 0: + codes = codes[::-1] + # TODO: overload __getitem__, a slice indexer returns same type as self + # error: Incompatible types in assignment (expression has type + # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable + # has type "TimelikeOps") [assignment] + uniques = uniques[::-1] # type: ignore[assignment] + return codes, uniques + # FIXME: shouldn't get here; we are ignoring sort + return super().factorize(na_sentinel=na_sentinel) + DatetimeLikeArrayMixin._add_comparison_ops() diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 7bb1d98086a91..1b20aad542084 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -271,10 +271,12 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq # tz must be preserved idx1 = idx1.tz_localize("Asia/Tokyo") @@ -283,6 +285,7 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq idx2 = pd.DatetimeIndex( ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"] @@ -293,21 +296,31 @@ def test_factorize(self): arr, idx = idx2.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"]) arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#38120 freq should be preserved idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq - def test_factorize_tz(self, tz_naive_fixture): + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + def test_factorize_tz(self, tz_naive_fixture, index_or_series): tz = tz_naive_fixture # GH#13750 base = pd.date_range("2016-11-05", freq="H", periods=100, tz=tz) @@ -315,27 +328,33 @@ def test_factorize_tz(self, tz_naive_fixture): exp_arr = np.arange(100, dtype=np.intp).repeat(5) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, exp_arr) - expected = base._with_freq(None) - tm.assert_index_equal(res, expected) + obj = index_or_series(idx) - def test_factorize_dst(self): - # GH 13750 - idx = pd.date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") - - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) - - idx = pd.date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + expected = base._with_freq(None) + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) + def test_factorize_dst(self, index_or_series): + # GH 13750 + idx = date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) + + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq + + idx = date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) + + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq @pytest.mark.parametrize( "arr, expected", diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 4a1749ff734c1..ef1e599d13221 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -75,17 +75,26 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#38120 freq should be preserved idx3 = timedelta_range("1 day", periods=4, freq="s") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq def test_sort_values(self): diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 4565d79c632de..162be4e0740d6 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -91,3 +91,13 @@ def test_multiindex_get_loc_list_raises(self): msg = "unhashable type" with pytest.raises(TypeError, match=msg): idx.get_loc([]) + + def test_multiindex_with_datatime_level_preserves_freq(self): + # https://github.com/pandas-dev/pandas/issues/35563 + idx = Index(range(2), name="A") + dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B") + mi = MultiIndex.from_product([idx, dti]) + df = DataFrame(np.random.randn(14, 2), index=mi) + result = df.loc[0].index + tm.assert_index_equal(result, dti) + assert result.freq == dti.freq diff --git a/pandas/tests/window/common.py b/pandas/tests/window/common.py index 7e0be331ec8d5..d6b80a803a88d 100644 --- a/pandas/tests/window/common.py +++ b/pandas/tests/window/common.py @@ -12,7 +12,6 @@ def get_result(obj, obj2=None): result = result.loc[(slice(None), 1), 5] result.index = result.index.droplevel(1) expected = get_result(frame[1], frame[5]) - expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected, check_names=False) From 2cd19f4a4075b31f45061d5e495f70349d7eef40 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 29 Nov 2020 11:36:17 +0000 Subject: [PATCH 2/4] fix recursion error --- pandas/core/algorithms.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d8276e330f4bd..1f9ba48fb4ef7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -51,7 +51,6 @@ ABCIndex, ABCIndexClass, ABCMultiIndex, - ABCRangeIndex, ABCSeries, ABCTimedeltaArray, ) @@ -662,9 +661,6 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. - if isinstance(values, ABCRangeIndex): - return values.factorize(sort=sort) - values = _ensure_arraylike(values) original = values if not isinstance(values, ABCMultiIndex): From 9f9491d258404af6e2967410ca4818808a81157d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 29 Nov 2020 11:37:40 +0000 Subject: [PATCH 3/4] remove ignore --- pandas/core/arrays/datetimelike.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0c5ac56ffe10a..a9fe95c0892e6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1669,11 +1669,7 @@ def factorize(self, na_sentinel=-1, sort: bool = False): uniques = self.copy() # TODO: copy or view? if sort and self.freq.n < 0: codes = codes[::-1] - # TODO: overload __getitem__, a slice indexer returns same type as self - # error: Incompatible types in assignment (expression has type - # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable - # has type "TimelikeOps") [assignment] - uniques = uniques[::-1] # type: ignore[assignment] + uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort return super().factorize(na_sentinel=na_sentinel) From be0f4d75a0746902bd131b8513f1b61860864276 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 29 Nov 2020 13:10:23 +0000 Subject: [PATCH 4/4] dispatch back to base class for PeriodArray --- pandas/core/arrays/period.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index fe78481d99d30..4d117a31255da 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -48,6 +48,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays.base import ExtensionArray import pandas.core.common as com @@ -766,6 +767,9 @@ def _check_timedeltalike_freq_compat(self, other): raise raise_on_incompatible(self, other) + def factorize(self, na_sentinel=-1): + return ExtensionArray.factorize(self, na_sentinel=na_sentinel) + def raise_on_incompatible(left, right): """