diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index ffc6757b674ea..29071c99c6df6 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1404,6 +1404,7 @@ Performance Improvements - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) - Improved performance of hashing ``Period`` (:issue:`12817`) - Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`) +- Improved performance by lazily creating indexing hashtables on larger Indexes (:issue:`14266`) .. _whatsnew_0190.bug_fixes: @@ -1422,7 +1423,6 @@ Bug Fixes - Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`) - Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) - - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) - Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) diff --git a/pandas/index.pyx b/pandas/index.pyx index 2935560a05b6b..a6eb74727a999 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -82,7 +82,7 @@ cdef class IndexEngine: cdef: bint unique, monotonic_inc, monotonic_dec - bint initialized, monotonic_check + bint initialized, monotonic_check, unique_check def __init__(self, vgetter, n): self.vgetter = vgetter @@ -91,6 +91,7 @@ cdef class IndexEngine: self.initialized = 0 self.monotonic_check = 0 + self.unique_check = 0 self.unique = 0 self.monotonic_inc = 0 @@ -177,8 +178,8 @@ cdef class IndexEngine: return left else: return slice(left, right) - else: - return self._maybe_get_bool_indexer(val) + + return self._maybe_get_bool_indexer(val) cdef _maybe_get_bool_indexer(self, object val): cdef: @@ 
-215,6 +216,7 @@ cdef class IndexEngine: if not self.initialized: self.initialize() + self.unique_check = 1 return self.unique == 1 property is_monotonic_increasing: @@ -234,16 +236,24 @@ cdef class IndexEngine: return self.monotonic_dec == 1 cdef inline _do_monotonic_check(self): + cdef object is_unique try: values = self._get_index_values() - self.monotonic_inc, self.monotonic_dec = \ + self.monotonic_inc, self.monotonic_dec, is_unique = \ self._call_monotonic(values) except TypeError: self.monotonic_inc = 0 self.monotonic_dec = 0 + is_unique = 0 self.monotonic_check = 1 + # we can only be sure of uniqueness if is_unique=1 + if is_unique: + self.initialized = 1 + self.unique = 1 + self.unique_check = 1 + cdef _get_index_values(self): return self.vgetter() @@ -257,6 +267,10 @@ cdef class IndexEngine: hash(val) cdef inline _ensure_mapping_populated(self): + # need to reset if we have previously + # set the initialized from monotonic checks + if self.unique_check: + self.initialized = 0 if not self.initialized: self.initialize() @@ -274,6 +288,12 @@ cdef class IndexEngine: def clear_mapping(self): self.mapping = None self.initialized = 0 + self.monotonic_check = 0 + self.unique_check = 0 + + self.unique = 0 + self.monotonic_inc = 0 + self.monotonic_dec = 0 def get_indexer(self, values): self._ensure_mapping_populated() @@ -537,7 +557,6 @@ cdef class DatetimeEngine(Int64Engine): raise TypeError # Welcome to the spaghetti factory - if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: val = _to_i8(val) diff --git a/pandas/src/algos_common_helper.pxi b/pandas/src/algos_common_helper.pxi index b89a80a73e2dd..be587fbc9a019 100644 --- a/pandas/src/algos_common_helper.pxi +++ b/pandas/src/algos_common_helper.pxi @@ -340,27 +340,28 @@ def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): """ Returns ------- - is_monotonic_inc, is_monotonic_dec + is_monotonic_inc, is_monotonic_dec, is_unique """ cdef: Py_ssize_t i, n float64_t prev, 
cur bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 + bint is_unique = 1 n = len(arr) if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False + return False, False, True else: - return True, True + return True, True, True elif n < 2: - return True, True + return True, True, True if timelike and arr[0] == iNaT: - return False, False + return False, False, True with nogil: prev = arr[0] @@ -375,7 +376,7 @@ def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): elif cur > prev: is_monotonic_dec = 0 elif cur == prev: - pass # is_unique = 0 + is_unique = 0 else: # cur or prev is NaN is_monotonic_inc = 0 @@ -386,7 +387,8 @@ def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): is_monotonic_dec = 0 break prev = cur - return is_monotonic_inc, is_monotonic_dec + return is_monotonic_inc, is_monotonic_dec, \ + is_unique and (is_monotonic_inc or is_monotonic_dec) @cython.wraparound(False) @@ -753,27 +755,28 @@ def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): """ Returns ------- - is_monotonic_inc, is_monotonic_dec + is_monotonic_inc, is_monotonic_dec, is_unique """ cdef: Py_ssize_t i, n float32_t prev, cur bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 + bint is_unique = 1 n = len(arr) if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False + return False, False, True else: - return True, True + return True, True, True elif n < 2: - return True, True + return True, True, True if timelike and arr[0] == iNaT: - return False, False + return False, False, True with nogil: prev = arr[0] @@ -788,7 +791,7 @@ def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): elif cur > prev: is_monotonic_dec = 0 elif cur == prev: - pass # is_unique = 0 + is_unique = 0 else: # cur or prev is NaN is_monotonic_inc = 0 @@ -799,7 +802,8 @@ def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): is_monotonic_dec = 0 break prev = cur - 
return is_monotonic_inc, is_monotonic_dec + return is_monotonic_inc, is_monotonic_dec, \ + is_unique and (is_monotonic_inc or is_monotonic_dec) @cython.wraparound(False) @@ -1166,27 +1170,28 @@ def is_monotonic_object(ndarray[object] arr, bint timelike): """ Returns ------- - is_monotonic_inc, is_monotonic_dec + is_monotonic_inc, is_monotonic_dec, is_unique """ cdef: Py_ssize_t i, n object prev, cur bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 + bint is_unique = 1 n = len(arr) if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False + return False, False, True else: - return True, True + return True, True, True elif n < 2: - return True, True + return True, True, True if timelike and arr[0] == iNaT: - return False, False + return False, False, True prev = arr[0] @@ -1201,7 +1206,7 @@ def is_monotonic_object(ndarray[object] arr, bint timelike): elif cur > prev: is_monotonic_dec = 0 elif cur == prev: - pass # is_unique = 0 + is_unique = 0 else: # cur or prev is NaN is_monotonic_inc = 0 @@ -1212,7 +1217,8 @@ def is_monotonic_object(ndarray[object] arr, bint timelike): is_monotonic_dec = 0 break prev = cur - return is_monotonic_inc, is_monotonic_dec + return is_monotonic_inc, is_monotonic_dec, \ + is_unique and (is_monotonic_inc or is_monotonic_dec) @cython.wraparound(False) @@ -1579,27 +1585,28 @@ def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): """ Returns ------- - is_monotonic_inc, is_monotonic_dec + is_monotonic_inc, is_monotonic_dec, is_unique """ cdef: Py_ssize_t i, n int32_t prev, cur bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 + bint is_unique = 1 n = len(arr) if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False + return False, False, True else: - return True, True + return True, True, True elif n < 2: - return True, True + return True, True, True if timelike and arr[0] == iNaT: - return False, False + return False, False, True 
with nogil: prev = arr[0] @@ -1614,7 +1621,7 @@ def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): elif cur > prev: is_monotonic_dec = 0 elif cur == prev: - pass # is_unique = 0 + is_unique = 0 else: # cur or prev is NaN is_monotonic_inc = 0 @@ -1625,7 +1632,8 @@ def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): is_monotonic_dec = 0 break prev = cur - return is_monotonic_inc, is_monotonic_dec + return is_monotonic_inc, is_monotonic_dec, \ + is_unique and (is_monotonic_inc or is_monotonic_dec) @cython.wraparound(False) @@ -1992,27 +2000,28 @@ def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): """ Returns ------- - is_monotonic_inc, is_monotonic_dec + is_monotonic_inc, is_monotonic_dec, is_unique """ cdef: Py_ssize_t i, n int64_t prev, cur bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 + bint is_unique = 1 n = len(arr) if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False + return False, False, True else: - return True, True + return True, True, True elif n < 2: - return True, True + return True, True, True if timelike and arr[0] == iNaT: - return False, False + return False, False, True with nogil: prev = arr[0] @@ -2027,7 +2036,7 @@ def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): elif cur > prev: is_monotonic_dec = 0 elif cur == prev: - pass # is_unique = 0 + is_unique = 0 else: # cur or prev is NaN is_monotonic_inc = 0 @@ -2038,7 +2047,8 @@ def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): is_monotonic_dec = 0 break prev = cur - return is_monotonic_inc, is_monotonic_dec + return is_monotonic_inc, is_monotonic_dec, \ + is_unique and (is_monotonic_inc or is_monotonic_dec) @cython.wraparound(False) @@ -2405,27 +2415,28 @@ def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): """ Returns ------- - is_monotonic_inc, is_monotonic_dec + is_monotonic_inc, is_monotonic_dec, is_unique """ cdef: Py_ssize_t i, n uint8_t prev, cur bint is_monotonic_inc = 1 
bint is_monotonic_dec = 1 + bint is_unique = 1 n = len(arr) if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False + return False, False, True else: - return True, True + return True, True, True elif n < 2: - return True, True + return True, True, True if timelike and arr[0] == iNaT: - return False, False + return False, False, True with nogil: prev = arr[0] @@ -2440,7 +2451,7 @@ def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): elif cur > prev: is_monotonic_dec = 0 elif cur == prev: - pass # is_unique = 0 + is_unique = 0 else: # cur or prev is NaN is_monotonic_inc = 0 @@ -2451,7 +2462,8 @@ def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): is_monotonic_dec = 0 break prev = cur - return is_monotonic_inc, is_monotonic_dec + return is_monotonic_inc, is_monotonic_dec, \ + is_unique and (is_monotonic_inc or is_monotonic_dec) @cython.wraparound(False) diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index 1451ffb054e5d..cec5712c0b7f4 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -362,27 +362,28 @@ def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): """ Returns ------- - is_monotonic_inc, is_monotonic_dec + is_monotonic_inc, is_monotonic_dec, is_unique """ cdef: Py_ssize_t i, n {{c_type}} prev, cur bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 + bint is_unique = 1 n = len(arr) if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False + return False, False, True else: - return True, True + return True, True, True elif n < 2: - return True, True + return True, True, True if timelike and arr[0] == iNaT: - return False, False + return False, False, True {{nogil_str}} {{tab}}prev = arr[0] @@ -397,7 +398,7 @@ def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): {{tab}} elif cur > prev: {{tab}} is_monotonic_dec = 0 {{tab}} elif 
cur == prev: - {{tab}} pass # is_unique = 0 + {{tab}} is_unique = 0 {{tab}} else: {{tab}} # cur or prev is NaN {{tab}} is_monotonic_inc = 0 @@ -408,7 +409,8 @@ def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): {{tab}} is_monotonic_dec = 0 {{tab}} break {{tab}} prev = cur - return is_monotonic_inc, is_monotonic_dec + return is_monotonic_inc, is_monotonic_dec, \ + is_unique and (is_monotonic_inc or is_monotonic_dec) @cython.wraparound(False) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index e26a0548fdc78..b2326cb7b3255 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1387,6 +1387,7 @@ def get_loc(self, key, method=None, tolerance=None): ------- loc : int """ + if tolerance is not None: # try converting tolerance now, so errors don't get swallowed by # the try/except clauses below