diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 246eab386b2ab..efb4707649f08 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -380,6 +380,7 @@ Performance Improvements - Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) +- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) .. _whatsnew_0230.docs: @@ -476,7 +477,11 @@ MultiIndex - Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`) - Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`) - Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`) -- +- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`) +- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`) +- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`) +- Bug in :func:`MultiIndex.get_loc` which would fail in a large :class:`MultiIndex` when levels had different dtypes (:issue:`18520`) + I/O ^^^ diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 014da22df3382..d735b3c0673b2 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -31,15 +31,6 @@ cdef class PyObjectHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) -cdef class 
MultiIndexHashTable(HashTable): - cdef: - kh_uint64_t *table - object mi - - cpdef get_item(self, object val) - cpdef set_item(self, object key, Py_ssize_t val) - cdef inline void _check_for_collision(self, Py_ssize_t loc, object label) - cdef class StringHashTable(HashTable): cdef kh_str_t *table diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bd9dd1f9bae37..bca4e388f3279 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -899,139 +899,3 @@ cdef class PyObjectHashTable(HashTable): count += 1 return np.asarray(labels) - - -cdef class MultiIndexHashTable(HashTable): - - def __init__(self, size_hint=1): - self.table = kh_init_uint64() - self.mi = None - kh_resize_uint64(self.table, size_hint) - - def __dealloc__(self): - if self.table is not NULL: - kh_destroy_uint64(self.table) - self.table = NULL - - def __len__(self): - return self.table.size - - def sizeof(self, deep=False): - """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof(uint64_t) + # keys - sizeof(size_t) + # vals - sizeof(uint32_t)) # flags - - def _check_for_collisions(self, int64_t[:] locs, object mi): - # validate that the locs map to the actual values - # provided in the mi - # we can only check if we *don't* have any missing values - # :< - cdef: - ndarray[int64_t] alocs - - alocs = np.asarray(locs) - if (alocs != -1).all(): - - result = self.mi.take(locs) - if isinstance(mi, tuple): - from pandas import Index - mi = Index([mi]) - if not result.equals(mi): - raise AssertionError( - "hash collision\nlocs:\n{}\n" - "result:\n{}\nmi:\n{}".format(alocs, result, mi)) - - cdef inline void _check_for_collision(self, Py_ssize_t loc, object label): - # validate that the loc maps to the actual value - # version of _check_for_collisions above for single label (tuple) - - result = self.mi[loc] - - if not all(l == r or (is_null_datetimelike(l) - and 
is_null_datetimelike(r)) - for l, r in zip(result, label)): - raise AssertionError( - "hash collision\nloc:\n{}\n" - "result:\n{}\nmi:\n{}".format(loc, result, label)) - - def __contains__(self, object key): - try: - self.get_item(key) - return True - except (KeyError, ValueError, TypeError): - return False - - cpdef get_item(self, object key): - cdef: - khiter_t k - uint64_t value - int64_t[:] locs - Py_ssize_t loc - - value = self.mi._hashed_indexing_key(key) - k = kh_get_uint64(self.table, value) - if k != self.table.n_buckets: - loc = self.table.vals[k] - self._check_for_collision(loc, key) - return loc - else: - raise KeyError(key) - - cpdef set_item(self, object key, Py_ssize_t val): - raise NotImplementedError - - @cython.boundscheck(False) - def map_locations(self, object mi): - cdef: - Py_ssize_t i, n - ndarray[uint64_t] values - uint64_t val - int ret = 0 - khiter_t k - - self.mi = mi - n = len(mi) - values = mi._hashed_values - - with nogil: - for i in range(n): - val = values[i] - k = kh_put_uint64(self.table, val, &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, object mi): - # look up with a target mi - cdef: - Py_ssize_t i, n - ndarray[uint64_t] values - int ret = 0 - uint64_t val - khiter_t k - int64_t[:] locs - - n = len(mi) - values = mi._hashed_values - - locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_uint64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - self._check_for_collisions(locs, mi) - return np.asarray(locs) - - def unique(self, object mi): - raise NotImplementedError - - def get_labels(self, object mi, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - raise NotImplementedError diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index bfea4ff9915ac..6b23e487aad3a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -26,11 
+26,12 @@ from hashtable cimport HashTable from pandas._libs import algos, hashtable as _hash from pandas._libs.tslibs import period as periodlib from pandas._libs.tslib import Timestamp, Timedelta +from pandas._libs.missing import checknull cdef int64_t iNaT = util.get_nat() -cdef inline is_definitely_invalid_key(object val): +cdef inline bint is_definitely_invalid_key(object val): if PyTuple_Check(val): try: hash(val) @@ -585,70 +586,137 @@ cpdef convert_scalar(ndarray arr, object value): return value -cdef class MultiIndexObjectEngine(ObjectEngine): +cdef class BaseMultiIndexCodesEngine: """ - provide the same interface as the MultiIndexEngine - but use the IndexEngine for computation - - This provides good performance with samller MI's + Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which + represent each label in a MultiIndex as an integer, by juxtaposing the bits + encoding each level, with appropriate offsets. + + For instance: if 3 levels have respectively 3, 6 and 1 possible values, + then their labels can be represented using respectively 2, 3 and 1 bits, + as follows: + _ _ _ _____ _ __ __ __ + |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level) + — — — ————— — —— —— —— + |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level) + — — — ————— — —— —— —— + |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels) + ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ + and the resulting unsigned integer representation will be: + _ _ _ _____ _ __ __ __ __ __ __ + |0|0|0| ... |0|c0|b2|b1|b0|a1|a0| + ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ + + Offsets are calculated at initialization, labels are transformed by method + _codes_to_ints. + + Keys are located by first locating each component against the respective + level, then locating (the integer representation of) codes. 
""" - def get_indexer(self, values): - # convert a MI to an ndarray - if hasattr(values, 'values'): - values = values.values - return super(MultiIndexObjectEngine, self).get_indexer(values) + def __init__(self, object levels, object labels, + ndarray[uint64_t, ndim=1] offsets): + """ + Parameters + ---------- + levels : list-like of numpy arrays + Levels of the MultiIndex + labels : list-like of numpy arrays of integer dtype + Labels of the MultiIndex + offsets : numpy array of uint64 dtype + Pre-calculated offsets, one for each level of the index + """ - cpdef get_loc(self, object val): + self.levels = levels + self.offsets = offsets - # convert a MI to an ndarray - if hasattr(val, 'values'): - val = val.values - return super(MultiIndexObjectEngine, self).get_loc(val) + # Transform labels in a single array, and add 1 so that we are working + # with positive integers (-1 for NaN becomes 0): + codes = (np.array(labels, dtype='int64').T + 1).astype('uint64', + copy=False) + # Map each codes combination in the index to an integer unambiguously + # (no collisions possible), based on the "offsets", which describe the + # number of bits to switch labels for each level: + lab_ints = self._codes_to_ints(codes) -cdef class MultiIndexHashEngine(ObjectEngine): - """ - Use a hashing based MultiIndex impl - but use the IndexEngine for computation + # Initialize underlying index (e.g. libindex.UInt64Engine) with + # integers representing labels: we will use its get_loc and get_indexer + self._base.__init__(self, lambda: lab_ints, len(lab_ints)) - This provides good performance with larger MI's - """ + def _extract_level_codes(self, object target, object method=None): + """ + Map the requested list of (tuple) keys to their integer representations + for searching in the underlying integer index. + + Parameters + ---------- + target : list-like of keys + Each key is a tuple, with a label for each level of the index. 
+ + Returns + ------- + int_keys : 1-dimensional array of dtype uint64 or object + Integers representing one combination each + """ - def _call_monotonic(self, object mi): - # defer these back to the mi iteself - return (mi.is_monotonic_increasing, - mi.is_monotonic_decreasing, - mi.is_unique) + level_codes = [lev.get_indexer(codes) + 1 for lev, codes + in zip(self.levels, zip(*target))] + return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) + + def get_indexer(self, object target, object method=None, + object limit=None): + lab_ints = self._extract_level_codes(target) + + # All methods (exact, backfill, pad) directly map to the respective + # methods of the underlying (integers) index... + if method is not None: + # but underlying backfill and pad methods require index and keys + # to be sorted. The index already is (checked in + # Index._get_fill_indexer), sort (integer representations of) keys: + order = np.argsort(lab_ints) + lab_ints = lab_ints[order] + indexer = (getattr(self._base, 'get_{}_indexer'.format(method)) + (self, lab_ints, limit=limit)) + indexer = indexer[np.argsort(order)]  # undo the sort + else: + indexer = self._base.get_indexer(self, lab_ints) - def get_backfill_indexer(self, other, limit=None): - # we coerce to ndarray-of-tuples - values = np.array(self._get_index_values()) - return algos.backfill_object(values, other, limit=limit) + return indexer - def get_pad_indexer(self, other, limit=None): - # we coerce to ndarray-of-tuples - values = np.array(self._get_index_values()) - return algos.pad_object(values, other, limit=limit) + def get_loc(self, object key): + if is_definitely_invalid_key(key): + raise TypeError("'{key}' is an invalid key".format(key=key)) + if not PyTuple_Check(key): + raise KeyError(key) + try: + indices = [0 if checknull(v) else lev.get_loc(v) + 1 + for lev, v in zip(self.levels, key)] + except KeyError: + raise KeyError(key) - cpdef get_loc(self, object val): - if is_definitely_invalid_key(val): - raise TypeError("'{val}' is an 
invalid key".format(val=val)) + # Transform indices into single integer: + lab_int = self._codes_to_ints(np.array(indices, dtype='uint64')) - self._ensure_mapping_populated() - if not self.unique: - return self._get_loc_duplicates(val) + return self._base.get_loc(self, lab_int) - try: - return self.mapping.get_item(val) - except TypeError: - raise KeyError(val) + def get_indexer_non_unique(self, object target): + # This needs to be overridden just because the default one works on + # target._values, and target can be itself a MultiIndex. - def get_indexer(self, values): - self._ensure_mapping_populated() - return self.mapping.lookup(values) + lab_ints = self._extract_level_codes(target) + indexer = self._base.get_indexer_non_unique(self, lab_ints) + + return indexer + + def __contains__(self, object val): + # Default __contains__ looks in the underlying mapping, which in this + # case only contains integer representations. + try: + self.get_loc(val) + return True + except (KeyError, TypeError, ValueError): + return False - cdef _make_hash_table(self, n): - return _hash.MultiIndexHashTable(n) # Generated from template. include "index_class_helper.pxi" diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 797774832aaa5..510f7245cebd8 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -45,6 +45,87 @@ target_klass='MultiIndex or list of tuples')) +class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, + libindex.UInt64Engine): + """ + This class manages a MultiIndex by mapping label combinations to positive + integers. + """ + _base = libindex.UInt64Engine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 in one uint64 (each), in a strictly + monotonic way (i.e. respecting the lexicographic order of integer + combinations): see BaseMultiIndexCodesEngine documentation. 
+ + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) + + Returns + ------- + int_keys : scalar or 1-dimensional array, of dtype uint64 + Integer(s) representing one combination (each) + """ + # Shift the representation of each level by the pre-calculated number + # of bits (note: this modifies ``codes`` in place): + codes <<= self.offsets + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjoint) significant bits of each level (i.e. + # each column in "codes") in a single positive integer: + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) + + +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, + libindex.ObjectEngine): + """ + This class manages those (extreme) cases in which the number of possible + label combinations overflows 64-bit integers, and uses an ObjectEngine + containing Python integers. + """ + _base = libindex.ObjectEngine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 in one Python integer (each), in a + strictly monotonic way (i.e. respecting the lexicographic order of + integer combinations): see BaseMultiIndexCodesEngine documentation. + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) + + Returns + ------- + int_keys : int, or 1-dimensional array of dtype object + Integer(s) representing one combination (each) + """ + + # Shift the representation of each level by the pre-calculated number + # of bits. Since this can overflow uint64, first make sure we are + # working with Python integers: + codes = codes.astype('object') << self.offsets + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjoint) significant bits of each level (i.e. 
+ # each column in "codes") in a single positive integer (per row): + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) + + class MultiIndex(Index): """ A multi-level, or hierarchical, index object for pandas objects @@ -687,16 +768,25 @@ def _get_level_number(self, level): @cache_readonly def _engine(self): - - # choose our engine based on our size - # the hashing based MultiIndex for larger - # sizes, and the MultiIndexOjbect for smaller - # xref: https://github.com/pandas-dev/pandas/pull/16324 - l = len(self) - if l > 10000: - return libindex.MultiIndexHashEngine(lambda: self, l) - - return libindex.MultiIndexObjectEngine(lambda: self.values, l) + # Calculate the number of bits needed to represent labels in each + # level, as log2 of their sizes (including -1 for NaN): + sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels])) + + # Sum bit counts, starting from the _right_.... + lev_bits = np.cumsum(sizes[::-1])[::-1] + + # ... in order to obtain offsets such that sorting the combination of + # shifted codes (one for each level, resulting in a unique integer) is + # equivalent to sorting lexicographically the codes themselves. 
Notice + # that each level needs to be shifted by the number of bits needed to + # represent the _previous_ ones: + offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64') + + # Check the total number of bits needed for our representation: + if lev_bits[0] > 64: + # The levels would overflow a 64 bit uint - use Python integers: + return MultiIndexPyIntEngine(self.levels, self.labels, offsets) + return MultiIndexUIntEngine(self.levels, self.labels, offsets) @property def values(self): @@ -1885,16 +1975,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: raise NotImplementedError("tolerance not implemented yet " 'for MultiIndex') - indexer = self._get_fill_indexer(target, method, limit) + indexer = self._engine.get_indexer(target, method, limit) elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for MultiIndex; see GitHub issue 9365') else: - # we may not compare equally because of hashing if we - # don't have the same dtypes - if self._inferred_type_levels != target._inferred_type_levels: - return Index(self.values).get_indexer(target.values) - indexer = self._engine.get_indexer(target) return _ensure_platform_int(indexer) @@ -2131,17 +2216,6 @@ def _maybe_to_slice(loc): ''.format(keylen, self.nlevels)) if keylen == self.nlevels and self.is_unique: - - def _maybe_str_to_time_stamp(key, lev): - if lev.is_all_dates and not isinstance(key, Timestamp): - try: - return Timestamp(key, tz=getattr(lev, 'tz', None)) - except Exception: - pass - return key - - key = com._values_from_object(key) - key = tuple(map(_maybe_str_to_time_stamp, key, self.levels)) return self._engine.get_loc(key) # -- partial selection or non-unique index @@ -2274,34 +2348,9 @@ def partial_selection(key, indexer=None): return indexer, maybe_droplevels(indexer, ilevels, drop_level) - if len(key) == self.nlevels: - - if self.is_unique: - - # here we have a completely specified key, but are - # using 
some partial string matching here - # GH4758 - all_dates = ((l.is_all_dates and - not isinstance(k, compat.string_types)) - for k, l in zip(key, self.levels)) - can_index_exactly = any(all_dates) - if (any(l.is_all_dates - for k, l in zip(key, self.levels)) and - not can_index_exactly): - indexer = self.get_loc(key) - - # we have a multiple selection here - if (not isinstance(indexer, slice) or - indexer.stop - indexer.start != 1): - return partial_selection(key, indexer) - - key = tuple(self[indexer].tolist()[0]) - - return (self._engine.get_loc( - com._values_from_object(key)), None) - - else: - return partial_selection(key) + if len(key) == self.nlevels and self.is_unique: + # Complete key in unique index -> standard get_loc + return (self._engine.get_loc(key), None) else: return partial_selection(key) else: diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 9664d73651185..aedc957ec67da 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1258,6 +1258,17 @@ def test_get_loc_level(self): assert result == expected assert new_index.equals(index.droplevel(0)) + @pytest.mark.parametrize('level', [0, 1]) + @pytest.mark.parametrize('null_val', [np.nan, pd.NaT, None]) + def test_get_loc_nan(self, level, null_val): + # GH 18485 : NaN in MultiIndex + levels = [['a', 'b'], ['c', 'd']] + key = ['b', 'd'] + levels[level] = np.array([0, null_val], dtype=type(null_val)) + key[level] = null_val + idx = MultiIndex.from_product(levels) + assert idx.get_loc(tuple(key)) == 3 + def test_get_loc_missing_nan(self): # GH 8569 idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]]) @@ -1266,6 +1277,38 @@ def test_get_loc_missing_nan(self): pytest.raises(KeyError, idx.get_loc, np.nan) pytest.raises(KeyError, idx.get_loc, [np.nan]) + @pytest.mark.parametrize('dtype1', [int, float, bool, str]) + @pytest.mark.parametrize('dtype2', [int, float, bool, str]) + def test_get_loc_multiple_dtypes(self, dtype1, dtype2): 
+ # GH 18520 + levels = [np.array([0, 1]).astype(dtype1), + np.array([0, 1]).astype(dtype2)] + idx = pd.MultiIndex.from_product(levels) + assert idx.get_loc(idx[2]) == 2 + + @pytest.mark.parametrize('level', [0, 1]) + @pytest.mark.parametrize('dtypes', [[int, float], [float, int]]) + def test_get_loc_implicit_cast(self, level, dtypes): + # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa + levels = [['a', 'b'], ['c', 'd']] + key = ['b', 'd'] + lev_dtype, key_dtype = dtypes + levels[level] = np.array([0, 1], dtype=lev_dtype) + key[level] = key_dtype(1) + idx = MultiIndex.from_product(levels) + assert idx.get_loc(tuple(key)) == 3 + + def test_get_loc_cast_bool(self): + # GH 19086 : int is casted to bool, but not vice-versa + levels = [[False, True], np.arange(2, dtype='int64')] + idx = MultiIndex.from_product(levels) + + assert idx.get_loc((0, 1)) == 1 + assert idx.get_loc((1, 0)) == 2 + + pytest.raises(KeyError, idx.get_loc, (False, True)) + pytest.raises(KeyError, idx.get_loc, (True, False)) + def test_slice_locs(self): df = tm.makeTimeDataFrame() stacked = df.stack() diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 424ba6aab9a56..9582264a8c716 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1590,6 +1590,38 @@ def test_unstack_group_index_overflow(self): result = s.unstack(4) assert result.shape == (500, 2) + def test_pyint_engine(self): + # GH 18519 : when combinations of codes cannot be represented in 64 + # bits, the index underlying the MultiIndex engine works with Python + # integers, rather than uint64. + N = 5 + keys = [tuple(l) for l in [[0] * 10 * N, + [1] * 10 * N, + [2] * 10 * N, + [np.nan] * N + [2] * 9 * N, + [0] * N + [2] * 9 * N, + [np.nan] * N + [2] * 8 * N + [0] * N]] + # Each level contains 4 elements (including NaN), so it is represented + # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. 
If we were using a + # 64 bit engine and truncating the first levels, the fourth and fifth + # keys would collide; if truncating the last levels, the fifth and + # sixth; if rotating bits rather than shifting, the third and fifth. + + for idx in range(len(keys)): + index = MultiIndex.from_tuples(keys) + assert index.get_loc(keys[idx]) == idx + + expected = np.arange(idx + 1, dtype='int64') + result = index.get_indexer([keys[i] for i in expected]) + tm.assert_numpy_array_equal(result, expected) + + # With missing key: + idces = range(len(keys)) + expected = np.array([-1] + list(idces), dtype='int64') + missing = tuple([0, 1] * 5 * N) + result = index.get_indexer([missing] + [keys[i] for i in idces]) + tm.assert_numpy_array_equal(result, expected) + def test_getitem_lowerdim_corner(self): pytest.raises(KeyError, self.frame.loc.__getitem__, (('bar', 'three'), 'B'))