diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index c69e61c899600..d461452f15a85 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -493,6 +493,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`)
 - Performance improvement in :meth:`.DataFrameGroupBy.nunique` and :meth:`.SeriesGroupBy.nunique` (:issue:`55972`)
 - Performance improvement in :meth:`.SeriesGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin` (:issue:`54234`)
+- Performance improvement in :meth:`Series.isin` (:issue:`39799`)
 - Performance improvement when indexing into a non-unique index (:issue:`55816`)
 - Performance improvement when indexing with more than 4 keys (:issue:`54550`)
 - Performance improvement when localizing time to UTC (:issue:`55241`)
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index 336af306d410f..b08c89ae654df 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -25,8 +25,28 @@ dtypes = [('Complex128', 'complex128', 'complex128',
 
 }}
 
+cdef extern from "pandas/vendored/klib/khash.h":
+    ctypedef uint32_t khuint_t
+
 {{for name, dtype, ttype, c_type, to_c_type in dtypes}}
 
+{{if dtype != "object" }}
+cdef extern from "pandas/vendored/klib/khash.h":
+    ctypedef struct kh_{{dtype}}_set_t:
+        khuint_t n_buckets, size, n_occupied, upper_bound
+        uint32_t *flags
+        {{c_type}} *keys
+        char *vals
+
+    kh_{{dtype}}_set_t* kh_init_{{dtype}}_set() nogil
+    void kh_destroy_{{dtype}}_set(kh_{{dtype}}_set_t*) nogil
+    void kh_clear_{{dtype}}_set(kh_{{dtype}}_set_t*) nogil
+    # ismember_* probes the set with kh_get_*_set, so it must be declared here.
+    khuint_t kh_get_{{dtype}}_set(kh_{{dtype}}_set_t*, {{c_type}}) nogil
+    void kh_resize_{{dtype}}_set(kh_{{dtype}}_set_t*, khuint_t) nogil
+    khuint_t kh_put_{{dtype}}_set(kh_{{dtype}}_set_t*, {{c_type}}, int*) nogil
+    void kh_del_{{dtype}}_set(kh_{{dtype}}_set_t*, khuint_t) nogil
+{{endif}}
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
@@ -138,12 +158,20 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
         {{endif}}
         Py_ssize_t i, n = len(values), first_na = -1
         khiter_t k
-        kh_{{ttype}}_t *table = kh_init_{{ttype}}()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
         bint seen_na = False, uses_mask = mask is not None
         bint seen_multiple_na = False
 
+    {{if dtype == "object"}}
+    cdef kh_{{ttype}}_t *table = kh_init_{{ttype}}()
     kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
+    {{else}}
+    # keep="first"/"last" only test membership, so a key-only set is enough;
+    # keep=False also needs first positions and builds ``first_idx`` on demand.
+    cdef kh_{{ttype}}_set_t *table = kh_init_{{ttype}}_set()
+    cdef kh_{{ttype}}_t *first_idx = NULL
+    kh_resize_{{ttype}}_set(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
+    {{endif}}
 
     if keep not in ('last', 'first', False):
         raise ValueError('keep must be either "first", "last" or False')
@@ -152,6 +180,20 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
     {{cond}} keep == {{keep}}:
         {{if dtype == 'object'}}
         if True:
+            {{if keep == '"last"'}}
+            for i in range(n - 1, -1, -1):
+            {{else}}
+            for i in range(n):
+            {{endif}}
+                if uses_mask and mask[i]:
+                    if seen_na:
+                        out[i] = True
+                    else:
+                        out[i] = False
+                        seen_na = True
+                else:
+                    value = {{to_c_type}}(values[i])
+                    kh_put_{{ttype}}(table, value, &ret)
+                    out[i] = ret == 0
         {{else}}
         with nogil:
-        {{endif}}
@@ -168,16 +210,14 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
                         seen_na = True
                 else:
                     value = {{to_c_type}}(values[i])
-                    kh_put_{{ttype}}(table, value, &ret)
+                    kh_put_{{ttype}}_set(table, value, &ret)
                     out[i] = ret == 0
+        {{endif}}
     {{endfor}}
 
     else:
         {{if dtype == 'object'}}
         if True:
-        {{else}}
-        with nogil:
-        {{endif}}
             for i in range(n):
                 if uses_mask and mask[i]:
                     if not seen_na:
@@ -201,8 +241,44 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
                         k = kh_put_{{ttype}}(table, value, &ret)
                         table.vals[k] = i
                         out[i] = 0
 
+        {{else}}
+        # keep=False flags the first occurrence retroactively, so the first
+        # index of every value has to be stored.  The set's ``vals`` is only
+        # ``char *`` and cannot hold an index, hence a real map is used here.
+        first_idx = kh_init_{{ttype}}()
+        kh_resize_{{ttype}}(first_idx, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
+        with nogil:
+            for i in range(n):
+                if uses_mask and mask[i]:
+                    if not seen_na:
+                        first_na = i
+                        seen_na = True
+                        out[i] = 0
+                    elif not seen_multiple_na:
+                        out[i] = 1
+                        out[first_na] = 1
+                        seen_multiple_na = True
+                    else:
+                        out[i] = 1
+                else:
+                    value = {{to_c_type}}(values[i])
+                    k = kh_get_{{ttype}}(first_idx, value)
+                    if k != first_idx.n_buckets:
+                        out[first_idx.vals[k]] = 1
+                        out[i] = 1
+                    else:
+                        k = kh_put_{{ttype}}(first_idx, value, &ret)
+                        first_idx.vals[k] = i
+                        out[i] = 0
+        kh_destroy_{{ttype}}(first_idx)
+        {{endif}}
+
+    {{if dtype == "object"}}
     kh_destroy_{{ttype}}(table)
+    {{else}}
+    kh_destroy_{{ttype}}_set(table)
+    {{endif}}
     return out
 
 
@@ -243,11 +319,19 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
         {{c_type}} val
         {{endif}}
 
+        {{if dtype != "object"}}
+        kh_{{ttype}}_set_t *table = kh_init_{{ttype}}_set()
+        {{else}}
         kh_{{ttype}}_t *table = kh_init_{{ttype}}()
+        {{endif}}
 
     # construct the table
     n = len(values)
+    {{if dtype != "object"}}
+    kh_resize_{{ttype}}_set(table, n)
+    {{else}}
     kh_resize_{{ttype}}(table, n)
+    {{endif}}
 
     {{if dtype == 'object'}}
     if True:
@@ -256,7 +340,11 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
     {{endif}}
         for i in range(n):
             val = {{to_c_type}}(values[i])
+            {{if dtype != "object"}}
+            kh_put_{{ttype}}_set(table, val, &ret)
+            {{else}}
             kh_put_{{ttype}}(table, val, &ret)
+            {{endif}}
 
     # test membership
     n = len(arr)
@@ -269,10 +357,18 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
     {{endif}}
         for i in range(n):
             val = {{to_c_type}}(arr[i])
+            {{if dtype != "object"}}
+            k = kh_get_{{ttype}}_set(table, val)
+            {{else}}
             k = kh_get_{{ttype}}(table, val)
+            {{endif}}
             result[i] = (k != table.n_buckets)
 
+    {{if dtype != "object"}}
+    kh_destroy_{{ttype}}_set(table)
+    {{else}}
     kh_destroy_{{ttype}}(table)
+    {{endif}}
     return result.view(np.bool_)
 
 # ----------------------------------------------------------------------
diff --git a/pandas/_libs/include/pandas/vendored/klib/khash.h
b/pandas/_libs/include/pandas/vendored/klib/khash.h
index f072106e09596..467a2b213e4b3 100644
--- a/pandas/_libs/include/pandas/vendored/klib/khash.h
+++ b/pandas/_libs/include/pandas/vendored/klib/khash.h
@@ -653,6 +653,9 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) {
 #define KHASH_MAP_INIT_UINT(name, khval_t) \
   KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
 
+#define KHASH_SET_INIT_UINT(name) /* set flavour of KHASH_MAP_INIT_UINT */ \
+  KHASH_INIT(name, khuint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /* value type char, 4th arg 0 where the MAP macro passes 1 -- presumably the is-map flag; confirm in KHASH_INIT */
+
 /*! @function
   @abstract Instantiate a hash map containing 64-bit integer keys
   @param name Name of the hash table [symbol]
@@ -684,9 +687,15 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) {
 #define KHASH_MAP_INIT_INT16(name, khval_t) \
   KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
 
+#define KHASH_SET_INIT_INT16(name) /* set flavour, khint16_t keys */ \
+  KHASH_INIT(name, khint16_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
 #define KHASH_MAP_INIT_UINT16(name, khval_t) \
   KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
 
+#define KHASH_SET_INIT_UINT16(name) /* set flavour, khuint16_t keys */ \
+  KHASH_INIT(name, khuint16_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
 /*! @function
   @abstract Instantiate a hash map containing 8bit-integer keys
   @param name Name of the hash table [symbol]
@@ -695,9 +704,15 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) {
 #define KHASH_MAP_INIT_INT8(name, khval_t) \
   KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
 
+#define KHASH_SET_INIT_INT8(name) /* set flavour, khint8_t keys */ \
+  KHASH_INIT(name, khint8_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
 #define KHASH_MAP_INIT_UINT8(name, khval_t) \
   KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
 
+#define KHASH_SET_INIT_UINT8(name) /* set flavour, khuint8_t keys */ \
+  KHASH_INIT(name, khuint8_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
 typedef const char *kh_cstr_t;
 /*! @function
   @abstract Instantiate a hash map containing const char* keys
@@ -728,12 +743,20 @@ typedef const char *kh_cstr_t;
 
 KHASH_MAP_INIT_STR(str, size_t)
 KHASH_MAP_INIT_INT(int32, size_t)
+KHASH_SET_INIT_INT(int32_set) /* NOTE(review): KHASH_SET_INIT_INT, _INT64 and _UINT64 are not added by this patch; assumed to already exist in this header -- confirm */
 KHASH_MAP_INIT_UINT(uint32, size_t)
+KHASH_SET_INIT_UINT(uint32_set) /* each *_set name is the one the Cython template declares as kh_<dtype>_set_t / kh_*_<dtype>_set */
 KHASH_MAP_INIT_INT64(int64, size_t)
+KHASH_SET_INIT_INT64(int64_set)
 KHASH_MAP_INIT_UINT64(uint64, size_t)
+KHASH_SET_INIT_UINT64(uint64_set)
 KHASH_MAP_INIT_INT16(int16, size_t)
+KHASH_SET_INIT_INT16(int16_set)
 KHASH_MAP_INIT_UINT16(uint16, size_t)
+KHASH_SET_INIT_UINT16(uint16_set)
 KHASH_MAP_INIT_INT8(int8, size_t)
+KHASH_SET_INIT_INT8(int8_set)
 KHASH_MAP_INIT_UINT8(uint8, size_t)
+KHASH_SET_INIT_UINT8(uint8_set)
 
 #endif /* __AC_KHASH_H */
diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
index 5a933b45d9e21..e8035f2b3f8a7 100644
--- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h
+++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
@@ -130,13 +130,23 @@ static inline khuint32_t kh_float32_hash_func(float val) {
   KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, \
              kh_floats_hash_equal)
 
+#define KHASH_SET_INIT_FLOAT64(name) /* same hash/equality functions as the float64 map */ \
+  KHASH_INIT(name, khfloat64_t, char, 0, kh_float64_hash_func, \
+             kh_floats_hash_equal)
+
 KHASH_MAP_INIT_FLOAT64(float64, size_t)
+KHASH_SET_INIT_FLOAT64(float64_set)
 
 #define KHASH_MAP_INIT_FLOAT32(name, khval_t) \
   KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, \
              kh_floats_hash_equal)
 
+#define KHASH_SET_INIT_FLOAT32(name) /* same hash/equality functions as the float32 map */ \
+  KHASH_INIT(name, khfloat32_t, char, 0, kh_float32_hash_func, \
+             kh_floats_hash_equal)
+
 KHASH_MAP_INIT_FLOAT32(float32, size_t)
+KHASH_SET_INIT_FLOAT32(float32_set)
 
 static inline khint32_t kh_complex128_hash_func(khcomplex128_t val) {
   return kh_float64_hash_func(val.real) ^ kh_float64_hash_func(val.imag);
@@ -152,13 +162,23 @@ static inline khint32_t kh_complex64_hash_func(khcomplex64_t val) {
   KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, \
              kh_complex_hash_equal)
 
+#define KHASH_SET_INIT_COMPLEX64(name) /* same hash/equality functions as the complex64 map */ \
+  KHASH_INIT(name, khcomplex64_t, char, 0, kh_complex64_hash_func, \
+             kh_complex_hash_equal)
+
 KHASH_MAP_INIT_COMPLEX64(complex64, size_t)
+KHASH_SET_INIT_COMPLEX64(complex64_set)
 
 #define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \
   KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, \
              kh_complex_hash_equal)
 
+#define KHASH_SET_INIT_COMPLEX128(name) /* same hash/equality functions as the complex128 map */ \
+  KHASH_INIT(name, khcomplex128_t, char, 0, kh_complex128_hash_func, \
+             kh_complex_hash_equal)
+
 KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
+KHASH_SET_INIT_COMPLEX128(complex128_set)
 
 #define kh_exist_complex64(h, k) (kh_exist(h, k))
 #define kh_exist_complex128(h, k) (kh_exist(h, k))