From d0861e8e7f6b2630f8dc441700d7ff7827ff8725 Mon Sep 17 00:00:00 2001
From: behzad nouri
Date: Thu, 9 Oct 2014 19:38:01 -0400
Subject: [PATCH] index into multi-index past the lexsort depth

---
 doc/source/whatsnew/v0.15.2.txt |  22 +++++++
 pandas/core/index.py            | 103 ++++++++++++++++++++++++--------
 pandas/core/internals.py        |   4 +-
 pandas/tests/test_indexing.py   |  85 ++++++++++++++++++++++++--
 4 files changed, 185 insertions(+), 29 deletions(-)

diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
index 944a78ad3691e..6688f106f922e 100644
--- a/doc/source/whatsnew/v0.15.2.txt
+++ b/doc/source/whatsnew/v0.15.2.txt
@@ -19,6 +19,26 @@ users upgrade to this version.
 
 API changes
 ~~~~~~~~~~~
 
+- Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though
+  a lexically sorted index will have better performance. (:issue:`2646`)
+
+  .. ipython:: python
+
+     df = pd.DataFrame({'jim':[0, 0, 1, 1],
+                        'joe':['x', 'x', 'z', 'y'],
+                        'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
+     df
+     df.index.lexsort_depth
+
+     # in prior versions this would raise a KeyError
+     # it will now show a PerformanceWarning
+     df.loc[(1, 'z')]
+
+     # lexically sorting
+     df2 = df.sortlevel()
+     df2
+     df2.index.lexsort_depth
+     df2.loc[(1,'z')]
 
 - Bug in concat of Series with ``category`` dtype which were coercing to ``object``. (:issue:`8641`)
@@ -129,3 +149,5 @@ Bug Fixes
 
 - Bugs when trying to stack multiple columns, when some (or all) of the level
   names are numbers (:issue:`8584`).
+- Bug in ``MultiIndex`` where ``__contains__`` returns the wrong result if the
+  index is not lexically sorted or unique (:issue:`7724`)
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 3f0b45ae10988..7d9f772126483 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -21,6 +21,7 @@
 from pandas.core.common import (_values_from_object, is_float, is_integer,
                                 ABCSeries, _ensure_object, _ensure_int64)
 from pandas.core.config import get_option
+from pandas.io.common import PerformanceWarning
 
 # simplify
 default_pprint = lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'),
@@ -4027,7 +4028,9 @@ def _partial_tup_index(self, tup, side='left'):
 
     def get_loc(self, key):
         """
-        Get integer location slice for requested label or tuple
+        Get integer location, slice or boolean mask for requested label or tuple.
+        If the key is past the lexsort depth, the return may be a boolean mask
+        array; otherwise it is always a slice or int.
 
         Parameters
         ----------
@@ -4035,22 +4038,73 @@ def get_loc(self, key):
 
         Returns
         -------
-        loc : int or slice object
-        """
-        if isinstance(key, tuple):
-            if len(key) == self.nlevels:
-                if self.is_unique:
-                    return self._engine.get_loc(_values_from_object(key))
-                else:
-                    return slice(*self.slice_locs(key, key))
-            else:
-                # partial selection
-                result = slice(*self.slice_locs(key, key))
-                if result.start == result.stop:
-                    raise KeyError(key)
-                return result
-        else:
-            return self._get_level_indexer(key, level=0)
+        loc : int, slice object or boolean mask
+        """
+        def _maybe_to_slice(loc):
+            '''convert integer indexer to boolean mask or slice if possible'''
+            if not isinstance(loc, np.ndarray) or loc.dtype != 'int64':
+                return loc
+
+            loc = lib.maybe_indices_to_slice(loc)
+            if isinstance(loc, slice):
+                return loc
+
+            mask = np.empty(len(self), dtype='bool')
+            mask.fill(False)
+            mask[loc] = True
+            return mask
+
+        if not isinstance(key, tuple):
+            loc = self._get_level_indexer(key, level=0)
+            return _maybe_to_slice(loc)
+
+        keylen = len(key)
+        if self.nlevels < keylen:
+            raise KeyError('Key length ({0}) exceeds index depth ({1})'
+                           ''.format(keylen, self.nlevels))
+
+        if keylen == self.nlevels and self.is_unique:
+            def _maybe_str_to_time_stamp(key, lev):
+                if lev.is_all_dates and not isinstance(key, Timestamp):
+                    try:
+                        return Timestamp(key, tz=getattr(lev, 'tz', None))
+                    except Exception:
+                        pass
+                return key
+            key = _values_from_object(key)
+            key = tuple(map(_maybe_str_to_time_stamp, key, self.levels))
+            return self._engine.get_loc(key)
+
+        # -- partial selection or non-unique index
+        # break the key into 2 parts based on the lexsort_depth of the index;
+        # the first part returns a continuous slice of the index; the 2nd part
+        # needs linear search within the slice
+        i = self.lexsort_depth
+        lead_key, follow_key = key[:i], key[i:]
+        start, stop = self.slice_locs(lead_key, lead_key) \
+            if lead_key else (0, len(self))
+
+        if start == stop:
+            raise KeyError(key)
+
+        if not follow_key:
+            return slice(start, stop)
+
+        warnings.warn('indexing past lexsort depth may impact performance.',
+                      PerformanceWarning)
+
+        loc = np.arange(start, stop, dtype='int64')
+
+        for i, k in enumerate(follow_key, len(lead_key)):
+            mask = self.labels[i][loc] == self.levels[i].get_loc(k)
+            if not mask.all():
+                loc = loc[mask]
+            if not len(loc):
+                raise KeyError(key)
+
+        return _maybe_to_slice(loc) \
+            if len(loc) != stop - start \
+            else slice(start, stop)
 
     def get_loc_level(self, key, level=0, drop_level=True):
         """
@@ -4115,10 +4169,10 @@ def _maybe_drop_levels(indexer, levels, drop_level):
             if not any(isinstance(k, slice) for k in key):
 
                 # partial selection
-                def partial_selection(key):
-                    indexer = slice(*self.slice_locs(key, key))
-                    if indexer.start == indexer.stop:
-                        raise KeyError(key)
+                # optionally get indexer to avoid re-calculation
+                def partial_selection(key, indexer=None):
+                    if indexer is None:
+                        indexer = self.get_loc(key)
                     ilevels = [i for i in range(len(key))
                                if key[i] != slice(None, None)]
                     return indexer, _maybe_drop_levels(indexer, ilevels,
@@ -4139,11 +4193,12 @@ def partial_selection(key):
 
                 if any([ l.is_all_dates
                          for k, l in zip(key, self.levels) ]) and not can_index_exactly:
-                    indexer = slice(*self.slice_locs(key, key))
+                    indexer = self.get_loc(key)
 
                     # we have a multiple selection here
-                    if not indexer.stop - indexer.start == 1:
-                        return partial_selection(key)
+                    if not isinstance(indexer, slice) \
+                            or indexer.stop - indexer.start != 1:
+                        return partial_selection(key, indexer)
 
                     key = tuple(self[indexer].tolist()[0])
 
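The new ``get_loc`` path above splits a tuple key at ``lexsort_depth``, resolves the lexsorted leading levels with binary search (``slice_locs``), then linearly filters the remaining levels inside that slice, falling back to a boolean mask only when the surviving positions are not contiguous. The standalone sketch below illustrates that two-phase strategy on a toy representation of the index (per-level integer label arrays); the names ``labels``, ``lexsort_depth`` and ``two_phase_get_loc`` are illustrative only and are not the pandas internals.

import numpy as np


def two_phase_get_loc(labels, lexsort_depth, key):
    """Return a slice for a contiguous hit, else a boolean mask."""
    n = len(labels[0])
    lead, follow = key[:lexsort_depth], key[lexsort_depth:]

    # phase 1: binary search over the lexsorted leading levels
    start, stop = 0, n
    for level, k in zip(labels, lead):
        seg = level[start:stop]
        left = np.searchsorted(seg, k, side='left')
        right = np.searchsorted(seg, k, side='right')
        start, stop = start + left, start + right
    if start == stop:
        raise KeyError(key)
    if not follow:
        return slice(start, stop)

    # phase 2: linear scan of the unsorted trailing levels within the slice
    loc = np.arange(start, stop)
    for level, k in zip(labels[lexsort_depth:], follow):
        loc = loc[level[loc] == k]
        if not len(loc):
            raise KeyError(key)

    if len(loc) == stop - start:      # every row in the slice matched
        return slice(start, stop)
    mask = np.zeros(n, dtype=bool)    # scattered hits -> boolean mask
    mask[loc] = True
    return mask


# toy index: first two levels lexsorted, third is not
labels = [np.array([0, 0, 0, 1, 1]),
          np.array([0, 1, 1, 0, 0]),
          np.array([2, 0, 2, 1, 0])]
print(two_phase_get_loc(labels, 2, (0, 1)))     # slice(1, 3, None)
print(two_phase_get_loc(labels, 2, (0, 1, 2)))  # boolean mask, True only at row 2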
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 14c4fb17c2b34..ef33e27d861fd 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -3257,7 +3257,9 @@ def take(self, indexer, axis=1, verify=True, convert=True):
         Take items along any axis.
         """
         self._consolidate_inplace()
-        indexer = np.asanyarray(indexer, dtype=np.int_)
+        indexer = np.arange(indexer.start, indexer.stop, indexer.step,
+                            dtype='int64') if isinstance(indexer, slice) \
+            else np.asanyarray(indexer, dtype='int64')
 
         n = self.shape[axis]
         if convert:
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 76be2e64de8d0..e710ef5ed0a41 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -1488,6 +1488,86 @@ def test_loc_multiindex(self):
         result = s.loc[2:4:2, 'a':'c']
         assert_series_equal(result, expected)
 
+    def test_multiindex_perf_warn(self):
+        import sys
+        from pandas.io.common import PerformanceWarning
+
+        if sys.version_info < (2, 7):
+            raise nose.SkipTest('python version < 2.7')
+
+        df = DataFrame({'jim':[0, 0, 1, 1],
+                        'joe':['x', 'x', 'z', 'y'],
+                        'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
+
+        with tm.assert_produces_warning(PerformanceWarning):
+            _ = df.loc[(1, 'z')]
+
+        df = df.iloc[[2,1,3,0]]
+        with tm.assert_produces_warning(PerformanceWarning):
+            _ = df.loc[(0,)]
+
+    def test_multiindex_get_loc(self):  # GH7724, GH2646
+        # test indexing into a multi-index before & past the lexsort depth
+        from numpy.random import randint, choice, randn
+        cols = ['jim', 'joe', 'jolie', 'joline', 'jolia']
+
+        def validate(mi, df, key):
+            mask = np.ones(len(df)).astype('bool')
+
+            # test for all partials of this key
+            for i, k in enumerate(key):
+                mask &= df.iloc[:, i] == k
+
+                if not mask.any():
+                    self.assertNotIn(key[:i+1], mi.index)
+                    continue
+
+                self.assertIn(key[:i+1], mi.index)
+                right = df[mask].copy()
+
+                if i + 1 != len(key):  # partial key
+                    right.drop(cols[:i+1], axis=1, inplace=True)
+                    right.set_index(cols[i+1:-1], inplace=True)
+                    assert_frame_equal(mi.loc[key[:i+1]], right)
+
+                else:  # full key
+                    right.set_index(cols[:-1], inplace=True)
+                    if len(right) == 1:  # single hit
+                        right = Series(right['jolia'].values,
+                                       name=right.index[0], index=['jolia'])
+                        assert_series_equal(mi.loc[key[:i+1]], right)
+                    else:  # multi hit
+                        assert_frame_equal(mi.loc[key[:i+1]], right)
+
+        def loop(mi, df, keys):
+            for key in keys:
+                validate(mi, df, key)
+
+        n, m = 1000, 50
+
+        vals = [randint(0, 10, n), choice(list('abcdefghij'), n),
+                choice(pd.date_range('20141009', periods=10).tolist(), n),
+                choice(list('ZYXWVUTSRQ'), n), randn(n)]
+        vals = list(map(tuple, zip(*vals)))
+
+        # bunch of keys for testing
+        keys = [randint(0, 11, m), choice(list('abcdefghijk'), m),
+                choice(pd.date_range('20141009', periods=11).tolist(), m),
+                choice(list('ZYXWVUTSRQP'), m)]
+        keys = list(map(tuple, zip(*keys)))
+        keys += list(map(lambda t: t[:-1], vals[::n//m]))
+
+        # covers both unique index and non-unique index
+        df = pd.DataFrame(vals, columns=cols)
+        a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1])
+
+        for frame in a, b:
+            for i in range(5):  # lexsort depth
+                df = frame.copy() if i == 0 else frame.sort(columns=cols[:i])
+                mi = df.set_index(cols[:-1])
+                assert not mi.index.lexsort_depth < i
+                loop(mi, df, keys)
+
     def test_series_getitem_multiindex(self):
 
         # GH 6018
@@ -1541,10 +1621,7 @@ def test_ix_general(self):
                 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}}
         df = DataFrame(data).set_index(keys=['col', 'year'])
         key = 4.0, 2012
-
-        # this should raise correct error
-        with tm.assertRaises(KeyError):
-            df.ix[key]
+        tm.assert_frame_equal(df.ix[key], df.iloc[2:])
 
         # this is ok
         df.sortlevel(inplace=True)
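Because ``MultiIndex.get_loc`` can now hand a slice to ``BlockManager.take``, the internals change above normalizes the indexer to an int64 position array before the take. A minimal standalone version of that normalization is sketched below; the helper name ``normalize_indexer`` is illustrative, and ``slice.indices`` is used here simply to resolve defaults such as a ``None`` step.

import numpy as np


def normalize_indexer(indexer, length):
    """Expand a slice to explicit int64 positions; pass arrays through."""
    if isinstance(indexer, slice):
        start, stop, step = indexer.indices(length)
        return np.arange(start, stop, step, dtype='int64')
    return np.asanyarray(indexer, dtype='int64')


print(normalize_indexer(slice(1, 4), 6))    # [1 2 3]
print(normalize_indexer([5, 0, 2], 6))      # [5 0 2]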
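Finally, a quick end-to-end check of the user-visible behaviour, assuming a pandas build that includes this patch (0.15.2 or later); whether a given lookup comes back as a slice or a boolean mask depends on whether the matching rows are contiguous.

import numpy as np
import pandas as pd

df = pd.DataFrame({'jim': [0, 0, 1, 1],
                   'joe': ['x', 'x', 'z', 'y'],
                   'jolie': np.random.rand(4)}).set_index(['jim', 'joe'])

# only the first level is lexsorted ('z' sorts after 'y')
print(df.index.lexsort_depth)        # 1

# key within the lexsort depth -> contiguous slice
print(df.index.get_loc(1))           # slice covering positions 2:4

# key past the lexsort depth -> resolved by linear search within that slice;
# emits a PerformanceWarning and may come back as a slice or a boolean mask
print(df.index.get_loc((1, 'z')))

# after lexsorting, the same lookup is a plain slice and no warning is raised
df2 = df.sortlevel()
print(df2.index.get_loc((1, 'z')))   # slice(3, 4, None)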