From 9ca42b80f9f806c31e5f24ef77b9ce9f4a7b81e2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Jun 2018 17:20:11 +0200 Subject: [PATCH 1/5] BUG: fix reindexing MultiIndex with categorical datetime-like level --- pandas/core/indexes/multi.py | 10 ++++++++-- pandas/tests/groupby/test_categorical.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f9f3041bef073..7b29bb27e6606 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -819,11 +819,17 @@ def values(self): return self._tuples values = [] - for lev, lab in zip(self.levels, self.labels): + for i, (lev, lab) in enumerate(zip(self.levels, self.labels)): # Need to box timestamps, etc. box = hasattr(lev, '_box_values') + if is_categorical_dtype(lev): + # TODO GH-21390 + taken = self.get_level_values(i)._values.get_values() + if not isinstance(taken, np.ndarray): + # Datetime/Period index + taken = np.array(taken.astype(object)) # Try to minimize boxing. 
- if box and len(lev) > len(lab): + elif box and len(lev) > len(lab): taken = lev._box_values(algos.take_1d(lev._ndarray_values, lab)) elif box: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0fec6a8f96a24..ce1166cff4c71 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -854,3 +854,23 @@ def test_empty_prod(): result = df.groupby("A", observed=False).B.prod(min_count=1) expected = pd.Series([2, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) + + +def test_groupby_multiindex_categorical_datetime(): + # https://github.com/pandas-dev/pandas/issues/21390 + + df = pd.DataFrame({ + 'key1': pd.Categorical(list('abcbabcba')), + 'key2': pd.Categorical( + list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3), + 'values': np.arange(9), + }) + result = df.groupby(['key1', 'key2']).mean() + + idx = pd.MultiIndex.from_product( + [pd.Categorical(['a', 'b', 'c']), + pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], + names=['key1', 'key2']) + expected = pd.DataFrame( + {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) + assert_frame_equal(result, expected) From 3e46afa94b2c5ede8ff94fe304be49b478659371 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Jun 2018 17:37:39 +0200 Subject: [PATCH 2/5] also add test for reindex/get_indexer --- pandas/tests/frame/test_axis_select_reindex.py | 15 ++++++++++++++- pandas/tests/indexes/test_multi.py | 12 ++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 0e0d6598f5101..004fb4eb0c128 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -10,7 +10,7 @@ import numpy as np from pandas.compat import lrange, lzip, u -from pandas import (compat, DataFrame, Series, Index, 
MultiIndex, +from pandas import (compat, DataFrame, Series, Index, MultiIndex, Categorical, date_range, isna) import pandas as pd @@ -1129,6 +1129,19 @@ def test_reindex_multi(self): assert_frame_equal(result, expected) + def test_reindex_multi_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = pd.MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(date_range("2012-01-01", periods=3, freq='H'))]) + df = pd.DataFrame({'a': range(len(midx))}, index=midx) + df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] + + result = df2.reindex(midx) + expected = pd.DataFrame( + {'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) + assert_frame_equal(result, expected) + data = [[1, 2, 3], [1, 2, 3]] @pytest.mark.parametrize('actual', [ diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index c925c4c403960..07c69d6ea2af0 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -12,8 +12,8 @@ import pandas as pd -from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex, - compat, date_range, period_range) +from pandas import (CategoricalIndex, Categorical, DataFrame, Index, + MultiIndex, compat, date_range, period_range) from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.core.dtypes.dtypes import CategoricalDtype @@ -1591,6 +1591,14 @@ def test_get_indexer_nearest(self): with pytest.raises(NotImplementedError): midx.get_indexer(['a'], method='pad', tolerance=2) + def test_get_indexer_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(date_range("2012-01-01", periods=3, freq='H'))]) + result = midx.get_indexer(midx) + tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp)) + def test_hash_collisions(self): # non-smoke test that we don't get hash collisions 
From c867e41c48017de74db596515f9a923e1c9c26f8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Jun 2018 17:45:35 +0200 Subject: [PATCH 3/5] add whatsnew --- doc/source/whatsnew/v0.23.2.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 9c4b408a1d24b..3d5c3300cf48b 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -54,7 +54,8 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) -- +- Fixed regression in :meth:`~DataFrame.reindex` and :meth:`~DataFrame.groupby` + with a MultiIndex or multiple keys that contain categorical datetime-like values (:issue:`21390`). .. _whatsnew_0232.performance: From 16dc32af9a179459e6ac59fc10f53982f2deefbc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Jun 2018 17:55:05 +0200 Subject: [PATCH 4/5] fix flake8 --- pandas/tests/indexes/test_multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 07c69d6ea2af0..5c46627cd2366 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -13,7 +13,7 @@ import pandas as pd from pandas import (CategoricalIndex, Categorical, DataFrame, Index, - MultiIndex, compat, date_range, period_range) + MultiIndex, compat, date_range, period_range) from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.core.dtypes.dtypes import CategoricalDtype From 819057f8169fd8a15f57480ab68310bc0d5a85a2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 28 Jun 2018 12:25:49 +0200 Subject: [PATCH 5/5] possible simpler
approach --- pandas/core/indexes/multi.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7b29bb27e6606..162c8141cb334 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -11,6 +11,8 @@ from pandas.compat.numpy import function as nv from pandas import compat +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, PandasExtensionDtype) from pandas.core.dtypes.common import ( _ensure_int64, _ensure_platform_int, @@ -819,26 +821,16 @@ def values(self): return self._tuples values = [] - for i, (lev, lab) in enumerate(zip(self.levels, self.labels)): - # Need to box timestamps, etc. - box = hasattr(lev, '_box_values') - if is_categorical_dtype(lev): - # TODO GH-21390 - taken = self.get_level_values(i)._values.get_values() - if not isinstance(taken, np.ndarray): - # Datetime/Period index - taken = np.array(taken.astype(object)) - # Try to minimize boxing. - elif box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._ndarray_values, - lab)) - elif box: - taken = algos.take_1d(lev._box_values(lev._ndarray_values), - lab, - fill_value=lev._na_value) - else: - taken = algos.take_1d(np.asarray(lev._values), lab) - values.append(taken) + + for i in range(self.nlevels): + vals = self._get_level_values(i) + if is_categorical_dtype(vals): + vals = vals.get_values() + if (isinstance(vals.dtype, (PandasExtensionDtype, ExtensionDtype)) + or hasattr(vals, '_box_values')): + vals = vals.astype(object) + vals = np.array(vals, copy=False) + values.append(vals) self._tuples = lib.fast_zip(values) return self._tuples