Skip to content

index into multi-index past the lex-sort depth #8526

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 21, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions doc/source/whatsnew/v0.15.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,26 @@ users upgrade to this version.

API changes
~~~~~~~~~~~
- Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though
  a lexically sorted index will have better performance. (:issue:`2646`)

.. ipython:: python

df = pd.DataFrame({'jim':[0, 0, 1, 1],
'joe':['x', 'x', 'z', 'y'],
'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
df
df.index.lexsort_depth

# in prior versions this would raise a KeyError
# will now show a PerformanceWarning
df.loc[(1, 'z')]

# lexically sorting
df2 = df.sortlevel()
df2
df2.index.lexsort_depth
df2.loc[(1,'z')]

- Bug in concat of Series with ``category`` dtype which were coercing to ``object``. (:issue:`8641`)

Expand Down Expand Up @@ -129,3 +149,5 @@ Bug Fixes

- Bugs when trying to stack multiple columns, when some (or all)
of the level names are numbers (:issue:`8584`).
- Bug in ``MultiIndex`` where ``__contains__`` returned the wrong result when the
  index was not lexically sorted or not unique (:issue:`7724`)
103 changes: 79 additions & 24 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from pandas.core.common import (_values_from_object, is_float, is_integer,
ABCSeries, _ensure_object, _ensure_int64)
from pandas.core.config import get_option
from pandas.io.common import PerformanceWarning

# simplify
default_pprint = lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'),
Expand Down Expand Up @@ -4027,30 +4028,83 @@ def _partial_tup_index(self, tup, side='left'):

def get_loc(self, key):
    """
    Get integer location, slice or boolean mask for requested label or tuple.

    If the key is past the lexsort depth, the return may be a boolean mask
    array, otherwise it is always a slice or int.

    Parameters
    ----------
    key : label or tuple

    Returns
    -------
    loc : int, slice object or boolean mask

    Raises
    ------
    KeyError
        If the key is longer than the number of levels, or matches no rows.
    """
    def _maybe_to_slice(loc):
        """Convert an integer indexer to a boolean mask or slice if possible."""
        if not isinstance(loc, np.ndarray) or loc.dtype != 'int64':
            return loc

        # a run of consecutive positions compresses losslessly to a slice
        loc = lib.maybe_indices_to_slice(loc)
        if isinstance(loc, slice):
            return loc

        # otherwise return a boolean mask over the whole index
        mask = np.empty(len(self), dtype='bool')
        mask.fill(False)
        mask[loc] = True
        return mask

    if not isinstance(key, tuple):
        # scalar key: select on the first level only
        loc = self._get_level_indexer(key, level=0)
        return _maybe_to_slice(loc)

    keylen = len(key)
    if self.nlevels < keylen:
        raise KeyError('Key length ({0}) exceeds index depth ({1})'
                       ''.format(keylen, self.nlevels))

    if keylen == self.nlevels and self.is_unique:
        # full-depth key on a unique index: delegate to the hash engine
        def _maybe_str_to_time_stamp(key, lev):
            # the engine stores Timestamps for datetime levels, so coerce
            # string keys before the lookup; leave the key as-is on failure
            if lev.is_all_dates and not isinstance(key, Timestamp):
                try:
                    return Timestamp(key, tz=getattr(lev, 'tz', None))
                except Exception:
                    pass
            return key
        key = _values_from_object(key)
        key = tuple(map(_maybe_str_to_time_stamp, key, self.levels))
        return self._engine.get_loc(key)

    # -- partial selection or non-unique index
    # break the key into 2 parts based on the lexsort_depth of the index;
    # the first part returns a continuous slice of the index; the 2nd part
    # needs linear search within the slice
    i = self.lexsort_depth
    lead_key, follow_key = key[:i], key[i:]
    start, stop = self.slice_locs(lead_key, lead_key) \
        if lead_key else (0, len(self))

    if start == stop:
        raise KeyError(key)

    if not follow_key:
        return slice(start, stop)

    warnings.warn('indexing past lexsort depth may impact performance.',
                  PerformanceWarning)

    loc = np.arange(start, stop, dtype='int64')

    # narrow the candidate positions level by level, comparing the level
    # codes within the slice against the code of each remaining key part
    for i, k in enumerate(follow_key, len(lead_key)):
        mask = self.labels[i][loc] == self.levels[i].get_loc(k)
        if not mask.all():
            loc = loc[mask]
        if not len(loc):
            raise KeyError(key)

    # if nothing was filtered out, the original contiguous slice suffices
    return _maybe_to_slice(loc) \
        if len(loc) != stop - start \
        else slice(start, stop)

def get_loc_level(self, key, level=0, drop_level=True):
"""
Expand Down Expand Up @@ -4115,10 +4169,10 @@ def _maybe_drop_levels(indexer, levels, drop_level):
if not any(isinstance(k, slice) for k in key):

# partial selection
def partial_selection(key):
indexer = slice(*self.slice_locs(key, key))
if indexer.start == indexer.stop:
raise KeyError(key)
# optionally get indexer to avoid re-calculation
def partial_selection(key, indexer=None):
if indexer is None:
indexer = self.get_loc(key)
ilevels = [i for i in range(len(key))
if key[i] != slice(None, None)]
return indexer, _maybe_drop_levels(indexer, ilevels,
Expand All @@ -4139,11 +4193,12 @@ def partial_selection(key):
if any([
l.is_all_dates for k, l in zip(key, self.levels)
]) and not can_index_exactly:
indexer = slice(*self.slice_locs(key, key))
indexer = self.get_loc(key)

# we have a multiple selection here
if not indexer.stop - indexer.start == 1:
return partial_selection(key)
if not isinstance(indexer, slice) \
or indexer.stop - indexer.start != 1:
return partial_selection(key, indexer)

key = tuple(self[indexer].tolist()[0])

Expand Down
4 changes: 3 additions & 1 deletion pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -3257,7 +3257,9 @@ def take(self, indexer, axis=1, verify=True, convert=True):
Take items along any axis.
"""
self._consolidate_inplace()
indexer = np.asanyarray(indexer, dtype=np.int_)
indexer = np.arange(indexer.start, indexer.stop, indexer.step,
dtype='int64') if isinstance(indexer, slice) \
else np.asanyarray(indexer, dtype='int64')

n = self.shape[axis]
if convert:
Expand Down
85 changes: 81 additions & 4 deletions pandas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1488,6 +1488,86 @@ def test_loc_multiindex(self):
result = s.loc[2:4:2, 'a':'c']
assert_series_equal(result, expected)

def test_multiindex_perf_warn(self):
    # Indexing past the lexsort depth should emit a PerformanceWarning.
    import sys
    from pandas.io.common import PerformanceWarning

    if sys.version_info < (2, 7):
        raise nose.SkipTest('python version < 2.7')

    data = {'jim': [0, 0, 1, 1],
            'joe': ['x', 'x', 'z', 'y'],
            'jolie': np.random.rand(4)}
    df = DataFrame(data).set_index(['jim', 'joe'])

    # 'joe' is unsorted within jim == 1, so a full two-level key
    # reaches past the lexsort depth and warns
    with tm.assert_produces_warning(PerformanceWarning):
        _ = df.loc[(1, 'z')]

    # shuffling the rows drops the lexsort depth to zero,
    # so even a one-level key warns
    df = df.iloc[[2, 1, 3, 0]]
    with tm.assert_produces_warning(PerformanceWarning):
        _ = df.loc[(0,)]

def test_multiindex_get_loc(self): # GH7724, GH2646
    # test indexing into a multi-index before & past the lexsort depth
    # Randomized check that .loc on a MultiIndex agrees with an
    # equivalent boolean-mask selection on the flat frame.
    from numpy.random import randint, choice, randn
    cols = ['jim', 'joe', 'jolie', 'joline', 'jolia']

    def validate(mi, df, key):
        # cross-check mi.loc[key prefix] against masking df directly
        mask = np.ones(len(df)).astype('bool')

        # test for all partials of this key
        for i, k in enumerate(key):
            mask &= df.iloc[:, i] == k

            if not mask.any():
                # no row matches this prefix -> membership must be False
                self.assertNotIn(key[:i+1], mi.index)
                continue

            self.assertIn(key[:i+1], mi.index)
            right = df[mask].copy()

            if i + 1 != len(key): # partial key
                # expected result keeps the remaining key columns as index
                right.drop(cols[:i+1], axis=1, inplace=True)
                right.set_index(cols[i+1:-1], inplace=True)
                assert_frame_equal(mi.loc[key[:i+1]], right)

            else: # full key
                right.set_index(cols[:-1], inplace=True)
                if len(right) == 1: # single hit
                    # a unique full-depth hit comes back as a Series
                    right = Series(right['jolia'].values,
                                   name=right.index[0], index=['jolia'])
                    assert_series_equal(mi.loc[key[:i+1]], right)
                else: # multi hit
                    assert_frame_equal(mi.loc[key[:i+1]], right)

    def loop(mi, df, keys):
        # run validate over every candidate key
        for key in keys:
            validate(mi, df, key)

    n, m = 1000, 50

    vals = [randint(0, 10, n), choice(list('abcdefghij'), n),
            choice(pd.date_range('20141009', periods=10).tolist(), n),
            choice(list('ZYXWVUTSRQ'), n), randn(n)]
    vals = list(map(tuple, zip(*vals)))

    # bunch of keys for testing
    # drawn from slightly larger domains than vals, so some keys miss;
    # the appended slice of vals guarantees some full-depth hits
    keys = [randint(0, 11, m), choice(list('abcdefghijk'), m),
            choice(pd.date_range('20141009', periods=11).tolist(), m),
            choice(list('ZYXWVUTSRQP'), m)]
    keys = list(map(tuple, zip(*keys)))
    keys += list(map(lambda t: t[:-1], vals[::n//m]))

    # covers both unique index and non-unique index
    df = pd.DataFrame(vals, columns=cols)
    a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1])

    for frame in a, b:
        for i in range(5): # lexsort depth
            # sorting the first i columns yields lexsort depth >= i
            df = frame.copy() if i == 0 else frame.sort(columns=cols[:i])
            mi = df.set_index(cols[:-1])
            assert not mi.index.lexsort_depth < i
            loop(mi, df, keys)

def test_series_getitem_multiindex(self):

# GH 6018
Expand Down Expand Up @@ -1541,10 +1621,7 @@ def test_ix_general(self):
'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}}
df = DataFrame(data).set_index(keys=['col', 'year'])
key = 4.0, 2012

# this should raise correct error
with tm.assertRaises(KeyError):
df.ix[key]
tm.assert_frame_equal(df.ix[key], df.iloc[2:])

# this is ok
df.sortlevel(inplace=True)
Expand Down