From 4836a29ceb342f14c856d42fa7a18cb29eaa6d51 Mon Sep 17 00:00:00 2001 From: fjetter Date: Sun, 13 May 2018 19:44:15 +0200 Subject: [PATCH 1/5] PERF: implement __contains__ for Categorical --- asv_bench/benchmarks/categoricals.py | 44 ++++++++++++++++++++++++++++ doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/arrays/categorical.py | 7 +++++ 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 5464e7cba22c3..f5b08310d9d04 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -193,3 +193,47 @@ def time_categorical_series_is_monotonic_increasing(self): def time_categorical_series_is_monotonic_decreasing(self): self.s.is_monotonic_decreasing + + +class Contains(object): + + params = (["a", "c", "d", "z", np.nan], [True, False]) + param_names = ["value", "has_nan"] + + def setup(self, value, has_nan): + n = 1 * 10 ** 4 + obj_values = list("a" * n + "b" * n + "c" * n) + if has_nan: + obj_values = [np.nan] + obj_values[:-2] + [np.nan] + + self.ci = pd.CategoricalIndex(obj_values, categories=list("abcd")) + self.cat = pd.Categorical(obj_values, categories=list("abcd")) + + def time_contains_index(self, value, has_nan): + value in self.ci + + def time_cat_isin(self, value, has_nan): + value in self.cat + + +class Indexing(object): + + params = (["a", "c"], [True, False]) + param_names = ["value", "has_nan"] + + def setup(self, value, has_nan): + n = 1 * 10 ** 4 + obj_values = list("a" * n + "b" * n + "c" * n) + if has_nan: + obj_values = [np.nan] + obj_values[:-2] + [np.nan] + + self.ci = pd.CategoricalIndex(obj_values, categories=list("abcd")) + self.cat = pd.Categorical(obj_values, categories=list("abcd")) + self.df = pd.DataFrame(dict(A=range(n * 3)), index=self.ci) + self.ser = pd.Series(range(n * 3), index=self.ci) + + def time_loc_df(self, value, has_nan): + self.df.loc[value] + + def time_loc_ser(self, value, has_nan): + 
self.ser.loc[value] diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c69de149a0f35..40f9ab5b24506 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -64,7 +64,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) -- +- Improved performance of indexing on a Series/DataFrame with a CategoricalIndex .. _whatsnew_0240.docs: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 30f9c56d24f02..34dbebd758f4e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1847,6 +1847,13 @@ def __iter__(self): """Returns an Iterator over the values of this Categorical.""" return iter(self.get_values().tolist()) + def __contains__(self, key): + """Returns True if `key` is in this Categorical.""" + if key in self.categories: + return self.categories.get_loc(key) in self.codes + else: + return False + def _tidy_repr(self, max_vals=10, footer=True): """ a short repr displaying only max_vals and an optional (but default footer) From 40f1cac79f820f3a3d2c9f6eb2fc9c65c41061c2 Mon Sep 17 00:00:00 2001 From: fjetter Date: Wed, 6 Jun 2018 23:34:51 +0200 Subject: [PATCH 2/5] Add case for nan in categorical.__contains__ --- pandas/core/arrays/categorical.py | 2 ++ pandas/tests/indexes/test_category.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 34dbebd758f4e..cc27f5e4b9e75 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1851,6 +1851,8 @@ def __contains__(self, key): """Returns True if `key` is in this Categorical.""" if key in self.categories: return self.categories.get_loc(key) in self.codes + elif isna(key) and self.isna().any(): + return True else: return False diff --git a/pandas/tests/indexes/test_category.py 
b/pandas/tests/indexes/test_category.py index a2a4170256088..68c6d63b32f62 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -244,6 +244,10 @@ def test_contains(self): list('aabbca') + [np.nan], categories=list('cabdef')) assert np.nan in ci + ci = CategoricalIndex( + list('aaa'), categories=list('cabdef')) + assert 'f' not in ci + def test_min_max(self): ci = self.create_index(ordered=False) From 5f8b5e075dc48c184eb310db23481c2c81e67acb Mon Sep 17 00:00:00 2001 From: fjetter Date: Fri, 8 Jun 2018 15:43:22 +0200 Subject: [PATCH 3/5] review comments --- asv_bench/benchmarks/categoricals.py | 16 +++++++++++----- doc/source/whatsnew/v0.24.0.txt | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index f5b08310d9d04..41460eaf47699 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -197,7 +197,14 @@ def time_categorical_series_is_monotonic_decreasing(self): class Contains(object): - params = (["a", "c", "d", "z", np.nan], [True, False]) + params = ([ + "b", # in array + "d", # in categories but not in codes + "z", # nowhere + np.nan, + ], + [True, False], + ) param_names = ["value", "has_nan"] def setup(self, value, has_nan): @@ -227,10 +234,9 @@ def setup(self, value, has_nan): if has_nan: obj_values = [np.nan] + obj_values[:-2] + [np.nan] - self.ci = pd.CategoricalIndex(obj_values, categories=list("abcd")) - self.cat = pd.Categorical(obj_values, categories=list("abcd")) - self.df = pd.DataFrame(dict(A=range(n * 3)), index=self.ci) - self.ser = pd.Series(range(n * 3), index=self.ci) + ci = pd.CategoricalIndex(obj_values, categories=list("abcd")) + self.df = pd.DataFrame(dict(A=range(n * 3)), index=ci) + self.ser = pd.Series(range(n * 3), index=ci) def time_loc_df(self, value, has_nan): self.df.loc[value] diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt 
index 40f9ab5b24506..8854cfd5d9e4a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -64,7 +64,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) -- Improved performance of indexing on a Series/DataFrame with a CategoricalIndex +- Improved performance of indexing on a Series/DataFrame with a ``CategoricalIndex`` (:issue:`21022`) .. _whatsnew_0240.docs: From c5387a71482bfe7ccc1e730d24765ea812f617c7 Mon Sep 17 00:00:00 2001 From: fjetter Date: Sat, 9 Jun 2018 11:51:10 +0200 Subject: [PATCH 4/5] PERF: __contains__ of Categorical improved --- pandas/core/arrays/categorical.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index cc27f5e4b9e75..3791f854ea7c2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1849,10 +1849,14 @@ def __iter__(self): def __contains__(self, key): """Returns True if `key` is in this Categorical.""" - if key in self.categories: - return self.categories.get_loc(key) in self.codes - elif isna(key) and self.isna().any(): - return True + hash(key) + if isna(key): + return self.isna().any() + elif self.categories._defer_to_indexing: # e.g. 
Interval values + loc = self.categories.get_loc(key) + return np.isin(self.codes, loc).any() + elif key in self.categories: + return self.categories.get_loc(key) in self._codes else: return False From a3550e8e87935d3e215ed9a3a4d82d3aedb3b72b Mon Sep 17 00:00:00 2001 From: fjetter Date: Sat, 9 Jun 2018 11:51:40 +0200 Subject: [PATCH 5/5] BUG: Fix membership check for interval categoricals --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/indexes/category.py | 10 ---------- pandas/tests/indexes/test_category.py | 7 +++++++ 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 8854cfd5d9e4a..4158edf683c89 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -83,7 +83,7 @@ Bug Fixes Categorical ^^^^^^^^^^^ -- +- Fixed an issue where membership checks on ``CategoricalIndex`` with interval values may return false positives (:issue:`21022`) - - diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 150eca32e229d..8722170ac41d4 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -323,20 +323,10 @@ def _reverse_indexer(self): @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): - hash(key) - - if self.categories._defer_to_indexing: - return key in self.categories - return key in self.values @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): - hash(key) - - if self.categories._defer_to_indexing: - return self.categories.contains(key) - return key in self.values def __array__(self, dtype=None): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 68c6d63b32f62..88d76210da8ba 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -248,6 +248,13 @@ def test_contains(self): list('aaa'), categories=list('cabdef')) assert 'f' not in ci + 
def test_containst_defer_to_indexing(self): + intervals = pd.interval_range(1, 4) + cat = pd.CategoricalIndex(list(intervals[:-1]), categories=intervals) + assert intervals[0] in cat + assert intervals[1] in cat + assert intervals[2] not in cat + def test_min_max(self): ci = self.create_index(ordered=False)