From 939529bb5ff01dc3ee0245fe1117b8c6bea13383 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 3 May 2023 13:32:52 -0700 Subject: [PATCH 1/8] hacks to get int32/int64 hash sets --- pandas/_libs/hashtable_func_helper.pxi.in | 41 +++++++++++++++++++++++ pandas/_libs/khash.pxd | 1 + pandas/_libs/src/klib/khash.h | 32 ++++++++++++------ 3 files changed, 63 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index b9cf6011481af..476233e795d54 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -25,8 +25,29 @@ dtypes = [('Complex128', 'complex128', 'complex128', }} +cdef extern from "khash.h": + ctypedef uint32_t khuint_t + {{for name, dtype, ttype, c_type, to_c_type in dtypes}} +{{if dtype in ("int32", "int64")}} +cdef extern from "khash.h": + ctypedef struct kh_{{dtype}}_t_set: + khuint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + {{c_type}} *keys + char *vals + + kh_{{dtype}}_t_set* kh_init_{{dtype}}_set() nogil + void kh_destroy_{{dtype}}_set(kh_{{dtype}}_t_set*) nogil + void kh_clear_{{dtype}}_set(kh_{{dtype}}_t_set*) nogil + khuint_t kh_get_{{dtype}}_set(kh_{{dtype}}_t_set*, {{c_type}}) nogil + void kh_resize_{{dtype}}_set(kh_{{dtype}}_t_set*, khuint_t) nogil + khuint_t kh_put_{{dtype}}_set(kh_{{dtype}}_t_set*, {{c_type}}, int*) nogil + void kh_del_{{dtype}}_set(kh_{{dtype}}_t_set*, khuint_t) nogil + + bint kh_exist_{{dtype}}(kh_{{dtype}}_t*, khiter_t) nogil +{{endif}} @cython.wraparound(False) @cython.boundscheck(False) @@ -236,11 +257,19 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{c_type}} val {{endif}} + {{if dtype in ("int32", "int64")}} + kh_{{ttype}}_t_set *table = kh_init_{{ttype}}_set() + {{else}} kh_{{ttype}}_t *table = kh_init_{{ttype}}() + {{endif}} # construct the table n = len(values) + {{if dtype in ("int32", "int64")}} + kh_resize_{{ttype}}_set(table, n) + {{else}} kh_resize_{{ttype}}(table, n) + {{endif}} {{if dtype == 'object'}} if True: @@ -249,7 +278,11 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} for i in range(n): val = {{to_c_type}}(values[i]) + {{if dtype in ("int32", "int64")}} + kh_put_{{ttype}}_set(table, val, &ret) + {{else}} kh_put_{{ttype}}(table, val, &ret) + {{endif}} # test membership n = len(arr) @@ -262,10 +295,18 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} for i in range(n): val = {{to_c_type}}(arr[i]) + {{if dtype in ("int32", "int64")}} + k = kh_get_{{ttype}}_set(table, val) + {{else}} k = kh_get_{{ttype}}(table, val) + {{endif}} result[i] = (k != table.n_buckets) + {{if dtype in ("int32", "int64")}} + kh_destroy_{{ttype}}_set(table) + {{else}} kh_destroy_{{ttype}}(table) + {{endif}} return result.view(np.bool_) # ---------------------------------------------------------------------- diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index a9f819e5e16db..31826fcfc6185 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -17,6 +17,7 @@ from numpy cimport ( cdef extern from "khash_python.h": const int KHASH_TRACE_DOMAIN + void KHASH_SET_INIT_INT64(int64) ctypedef uint32_t khuint_t ctypedef khuint_t khiter_t diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index e17d82d51f0fb..f88e75ef586e4 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -282,17 +282,24 @@ static const double __ac_HASH_UPPER = 0.77; extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ extern void kh_del_##name(kh_##name##_t *h, khuint_t x); +// From https://stackoverflow.com/a/1489985/621736 +#define PASTER(x,y) x ## y +#define EVALUATOR(x, y) PASTER(x, y) +#define KHASH_FUNCTION_POSTFIX_0 _set +#define KHASH_FUNCTION_POSTFIX_1 +#define KHASH_FUNCTION_POSTFIX(kh_is_map) KHASH_FUNCTION_POSTFIX_##kh_is_map + #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ typedef struct { \ khuint_t n_buckets, size, n_occupied, upper_bound; \ khuint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ - } kh_##name##_t; \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \ + } EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)); \ + SCOPE EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) EVALUATOR(*kh_init_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(void) { \ + return (EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map))*)KHASH_CALLOC(1, sizeof(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)))); \ } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + SCOPE void EVALUATOR(kh_destroy_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h) \ { \ if (h) { \ KHASH_FREE(h->keys); KHASH_FREE(h->flags); \ @@ -300,14 +307,14 @@ static const double __ac_HASH_UPPER = 0.77; KHASH_FREE(h); \ } \ } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ + SCOPE void EVALUATOR(kh_clear_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h) \ { \ if (h && h->flags) { \ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ - SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + SCOPE khuint_t EVALUATOR(kh_get_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(const EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h, khkey_t key) \ { \ if (h->n_buckets) { \ khuint_t inc, k, i, last, mask; \ @@ -321,7 +328,7 @@ static const double __ac_HASH_UPPER = 0.77; return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - SCOPE void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets) \ + SCOPE void EVALUATOR(kh_resize_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h, khuint_t new_n_buckets) \ { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ khuint32_t *new_flags = 0; \ khuint_t j = 1; \ @@ -377,12 +384,12 @@ static const double __ac_HASH_UPPER = 0.77; h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ } \ - SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + SCOPE khuint_t EVALUATOR(kh_put_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h, khkey_t key, int *ret) \ { \ khuint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ - else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ + if (h->n_buckets > (h->size<<1)) EVALUATOR(kh_resize_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(h, h->n_buckets - 1); /* clear "deleted" elements */ \ + else EVALUATOR(kh_resize_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(h, h->n_buckets + 1); /* expand the hash table */ \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ @@ -414,7 +421,7 @@ static const double __ac_HASH_UPPER = 0.77; } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) \ + SCOPE void EVALUATOR(kh_del_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h, khuint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ @@ -706,9 +713,12 @@ typedef const char *kh_cstr_t; #define kh_exist_uint8(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) +//KHASH_SET_INIT_STR(str) KHASH_MAP_INIT_INT(int32, size_t) +KHASH_SET_INIT_INT(int32) KHASH_MAP_INIT_UINT(uint32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) +KHASH_SET_INIT_INT64(int64) KHASH_MAP_INIT_UINT64(uint64, size_t) KHASH_MAP_INIT_INT16(int16, size_t) KHASH_MAP_INIT_UINT16(uint16, size_t) From a05dda6be16d8164107faa9df5dfbfa704acd3f8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 4 May 2023 13:09:55 -0700 Subject: [PATCH 2/8] simplify implementation --- pandas/_libs/hashtable_func_helper.pxi.in | 18 ++++++------ pandas/_libs/khash.pxd | 1 - pandas/_libs/src/klib/khash.h | 34 +++++++++-------------- 3 files changed, 22 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 476233e795d54..d401f73ef231b 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -32,19 +32,19 @@ cdef extern from "khash.h": {{if dtype in ("int32", "int64")}} cdef extern from "khash.h": - ctypedef struct kh_{{dtype}}_t_set: + ctypedef struct kh_{{dtype}}_set_t: khuint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags {{c_type}} *keys char *vals - kh_{{dtype}}_t_set* kh_init_{{dtype}}_set() nogil - void kh_destroy_{{dtype}}_set(kh_{{dtype}}_t_set*) nogil - void kh_clear_{{dtype}}_set(kh_{{dtype}}_t_set*) nogil - khuint_t kh_get_{{dtype}}_set(kh_{{dtype}}_t_set*, {{c_type}}) nogil - void kh_resize_{{dtype}}_set(kh_{{dtype}}_t_set*, khuint_t) nogil - khuint_t kh_put_{{dtype}}_set(kh_{{dtype}}_t_set*, {{c_type}}, int*) nogil - void kh_del_{{dtype}}_set(kh_{{dtype}}_t_set*, khuint_t) nogil + kh_{{dtype}}_set_t* kh_init_{{dtype}}_set() nogil + void kh_destroy_{{dtype}}_set(kh_{{dtype}}_set_t*) nogil + void kh_clear_{{dtype}}_set(kh_{{dtype}}_set_t*) nogil + khuint_t kh_get_{{dtype}}_set(kh_{{dtype}}_set_t*, {{c_type}}) nogil + void kh_resize_{{dtype}}_set(kh_{{dtype}}_set_t*, khuint_t) nogil + khuint_t kh_put_{{dtype}}_set(kh_{{dtype}}_set_t*, {{c_type}}, int*) nogil + void kh_del_{{dtype}}_set(kh_{{dtype}}_set_t*, khuint_t) nogil bint kh_exist_{{dtype}}(kh_{{dtype}}_t*, khiter_t) nogil {{endif}} @@ -258,7 +258,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} {{if dtype in ("int32", "int64")}} - kh_{{ttype}}_t_set *table = kh_init_{{ttype}}_set() + kh_{{ttype}}_set_t *table = kh_init_{{ttype}}_set() {{else}} kh_{{ttype}}_t *table = kh_init_{{ttype}}() {{endif}} diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 31826fcfc6185..a9f819e5e16db 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -17,7 +17,6 @@ from numpy cimport ( cdef extern from "khash_python.h": const int KHASH_TRACE_DOMAIN - void KHASH_SET_INIT_INT64(int64) ctypedef uint32_t khuint_t ctypedef khuint_t khiter_t diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index f88e75ef586e4..032de7c7217ab 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -282,24 +282,17 @@ static const double __ac_HASH_UPPER = 0.77; extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ extern void kh_del_##name(kh_##name##_t *h, khuint_t x); -// From https://stackoverflow.com/a/1489985/621736 -#define PASTER(x,y) x ## y -#define EVALUATOR(x, y) PASTER(x, y) -#define KHASH_FUNCTION_POSTFIX_0 _set -#define KHASH_FUNCTION_POSTFIX_1 -#define KHASH_FUNCTION_POSTFIX(kh_is_map) KHASH_FUNCTION_POSTFIX_##kh_is_map - #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ typedef struct { \ khuint_t n_buckets, size, n_occupied, upper_bound; \ khuint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ - } EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)); \ - SCOPE EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) EVALUATOR(*kh_init_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(void) { \ - return (EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map))*)KHASH_CALLOC(1, sizeof(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)))); \ + } kh_##name##_t; \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \ } \ - SCOPE void EVALUATOR(kh_destroy_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h) \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ KHASH_FREE(h->keys); KHASH_FREE(h->flags); \ @@ -307,14 +300,14 @@ static const double __ac_HASH_UPPER = 0.77; KHASH_FREE(h); \ } \ } \ - SCOPE void EVALUATOR(kh_clear_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h) \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ - SCOPE khuint_t EVALUATOR(kh_get_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(const EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h, khkey_t key) \ + SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khuint_t inc, k, i, last, mask; \ @@ -328,7 +321,7 @@ static const double __ac_HASH_UPPER = 0.77; return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - SCOPE void EVALUATOR(kh_resize_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h, khuint_t new_n_buckets) \ + SCOPE void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets) \ { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ khuint32_t *new_flags = 0; \ khuint_t j = 1; \ @@ -384,12 +377,12 @@ static const double __ac_HASH_UPPER = 0.77; h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ } \ - SCOPE khuint_t EVALUATOR(kh_put_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h, khkey_t key, int *ret) \ + SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khuint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) EVALUATOR(kh_resize_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(h, h->n_buckets - 1); /* clear "deleted" elements */ \ - else EVALUATOR(kh_resize_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(h, h->n_buckets + 1); /* expand the hash table */ \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ + else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ @@ -421,7 +414,7 @@ static const double __ac_HASH_UPPER = 0.77; } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ - SCOPE void EVALUATOR(kh_del_##name, KHASH_FUNCTION_POSTFIX(kh_is_map))(EVALUATOR(kh_##name##_t, KHASH_FUNCTION_POSTFIX(kh_is_map)) *h, khuint_t x) \ + SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ @@ -713,12 +706,11 @@ typedef const char *kh_cstr_t; #define kh_exist_uint8(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) -//KHASH_SET_INIT_STR(str) KHASH_MAP_INIT_INT(int32, size_t) -KHASH_SET_INIT_INT(int32) +KHASH_SET_INIT_INT(int32_set) KHASH_MAP_INIT_UINT(uint32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) -KHASH_SET_INIT_INT64(int64) +KHASH_SET_INIT_INT64(int64_set) KHASH_MAP_INIT_UINT64(uint64, size_t) KHASH_MAP_INIT_INT16(int16, size_t) KHASH_MAP_INIT_UINT16(uint16, size_t) From ed3f0467557d827ac8d595ab37d974330a2054fc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 4 Dec 2023 15:06:35 -0800 Subject: [PATCH 3/8] working excluding pyobject --- pandas/_libs/hashtable_func_helper.pxi.in | 12 +++++------ .../include/pandas/vendored/klib/khash.h | 6 ++++++ .../pandas/vendored/klib/khash_python.h | 20 +++++++++++++++++++ 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 849d02ae553c6..616c9cd084043 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -30,7 +30,7 @@ cdef extern from "pandas/vendored/klib/khash.h": {{for name, dtype, ttype, c_type, to_c_type in dtypes}} -{{if dtype in ("int32", "int64")}} +{{if dtype != "object" }} cdef extern from "pandas/vendored/klib/khash.h": ctypedef struct kh_{{dtype}}_set_t: khuint_t n_buckets, size, n_occupied, upper_bound @@ -264,7 +264,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{c_type}} val {{endif}} - {{if dtype in ("int32", "int64")}} + {{if dtype != "object"}} kh_{{ttype}}_set_t *table = kh_init_{{ttype}}_set() {{else}} kh_{{ttype}}_t *table = kh_init_{{ttype}}() @@ -272,7 +272,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): # construct the table n = len(values) - {{if dtype in ("int32", "int64")}} + {{if dtype != "object"}} kh_resize_{{ttype}}_set(table, n) {{else}} kh_resize_{{ttype}}(table, n) @@ -285,7 +285,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} for i in range(n): val = {{to_c_type}}(values[i]) - {{if dtype in ("int32", "int64")}} + {{if dtype != "object"}} kh_put_{{ttype}}_set(table, val, &ret) {{else}} kh_put_{{ttype}}(table, val, &ret) @@ -302,14 +302,14 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} for i in range(n): val = {{to_c_type}}(arr[i]) - {{if dtype in ("int32", "int64")}} + {{if dtype != "object"}} k = kh_get_{{ttype}}_set(table, val) {{else}} k = kh_get_{{ttype}}(table, val) {{endif}} result[i] = (k != table.n_buckets) - {{if dtype in ("int32", "int64")}} + {{if dtype != "object"}} kh_destroy_{{ttype}}_set(table) {{else}} kh_destroy_{{ttype}}(table) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash.h b/pandas/_libs/include/pandas/vendored/klib/khash.h index e4eb662cc15f6..5a7436270d950 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash.h @@ -730,12 +730,18 @@ KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) KHASH_SET_INIT_INT(int32_set) KHASH_MAP_INIT_UINT(uint32, size_t) +KHASH_SET_INIT_INT(uint32_set) KHASH_MAP_INIT_INT64(int64, size_t) KHASH_SET_INIT_INT64(int64_set) KHASH_MAP_INIT_UINT64(uint64, size_t) +KHASH_SET_INIT_INT(uint64_set) KHASH_MAP_INIT_INT16(int16, size_t) +KHASH_SET_INIT_INT(int16_set) KHASH_MAP_INIT_UINT16(uint16, size_t) +KHASH_SET_INIT_INT(uint16_set) KHASH_MAP_INIT_INT8(int8, size_t) +KHASH_SET_INIT_INT(int8_set) KHASH_MAP_INIT_UINT8(uint8, size_t) +KHASH_SET_INIT_INT(uint8_set) #endif /* __AC_KHASH_H */ diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 5a933b45d9e21..e8035f2b3f8a7 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -130,13 +130,23 @@ static inline khuint32_t kh_float32_hash_func(float val) { KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, \ kh_floats_hash_equal) +#define KHASH_SET_INIT_FLOAT64(name) \ + KHASH_INIT(name, khfloat64_t, char, 0, kh_float64_hash_func, \ + kh_floats_hash_equal) + KHASH_MAP_INIT_FLOAT64(float64, size_t) +KHASH_SET_INIT_FLOAT64(float64_set) #define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, \ kh_floats_hash_equal) +#define KHASH_SET_INIT_FLOAT32(name) \ + KHASH_INIT(name, khfloat32_t, char, 0, kh_float32_hash_func, \ + kh_floats_hash_equal) + KHASH_MAP_INIT_FLOAT32(float32, size_t) +KHASH_SET_INIT_FLOAT32(float32_set) static inline khint32_t kh_complex128_hash_func(khcomplex128_t val) { return kh_float64_hash_func(val.real) ^ kh_float64_hash_func(val.imag); @@ -152,13 +162,23 @@ static inline khint32_t kh_complex64_hash_func(khcomplex64_t val) { KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, \ kh_complex_hash_equal) +#define KHASH_SET_INIT_COMPLEX64(name) \ + KHASH_INIT(name, khcomplex64_t, char, 0, kh_complex64_hash_func, \ + kh_complex_hash_equal) + KHASH_MAP_INIT_COMPLEX64(complex64, size_t) +KHASH_SET_INIT_COMPLEX64(complex64_set) #define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \ KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, \ kh_complex_hash_equal) +#define KHASH_SET_INIT_COMPLEX128(name) \ + KHASH_INIT(name, khcomplex128_t, char, 0, kh_complex128_hash_func, \ + kh_complex_hash_equal) + KHASH_MAP_INIT_COMPLEX128(complex128, size_t) +KHASH_SET_INIT_COMPLEX128(complex128_set) #define kh_exist_complex64(h, k) (kh_exist(h, k)) #define kh_exist_complex128(h, k) (kh_exist(h, k)) From 5248e1aa7b2681155eb0ade42664655e51603426 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 4 Dec 2023 16:16:55 -0800 Subject: [PATCH 4/8] perf cleanups --- .../include/pandas/vendored/klib/khash.h | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash.h b/pandas/_libs/include/pandas/vendored/klib/khash.h index 5a7436270d950..467a2b213e4b3 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash.h @@ -653,6 +653,9 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) { #define KHASH_MAP_INIT_UINT(name, khval_t) \ KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_SET_INIT_UINT(name) \ + KHASH_INIT(name, khuint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] @@ -684,9 +687,15 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) { #define KHASH_MAP_INIT_INT16(name, khval_t) \ KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_SET_INIT_INT16(name) \ + KHASH_INIT(name, khint16_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + #define KHASH_MAP_INIT_UINT16(name, khval_t) \ KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_SET_INIT_UINT16(name) \ + KHASH_INIT(name, khuint16_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + /*! @function @abstract Instantiate a hash map containing 8bit-integer keys @param name Name of the hash table [symbol] @@ -695,9 +704,15 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) { #define KHASH_MAP_INIT_INT8(name, khval_t) \ KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_SET_INIT_INT8(name) \ + KHASH_INIT(name, khint8_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + #define KHASH_MAP_INIT_UINT8(name, khval_t) \ KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_SET_INIT_UINT8(name) \ + KHASH_INIT(name, khuint8_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + typedef const char *kh_cstr_t; /*! @function @abstract Instantiate a hash map containing const char* keys @@ -730,18 +745,18 @@ KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) KHASH_SET_INIT_INT(int32_set) KHASH_MAP_INIT_UINT(uint32, size_t) -KHASH_SET_INIT_INT(uint32_set) +KHASH_SET_INIT_UINT(uint32_set) KHASH_MAP_INIT_INT64(int64, size_t) KHASH_SET_INIT_INT64(int64_set) KHASH_MAP_INIT_UINT64(uint64, size_t) -KHASH_SET_INIT_INT(uint64_set) +KHASH_SET_INIT_UINT64(uint64_set) KHASH_MAP_INIT_INT16(int16, size_t) -KHASH_SET_INIT_INT(int16_set) +KHASH_SET_INIT_INT16(int16_set) KHASH_MAP_INIT_UINT16(uint16, size_t) -KHASH_SET_INIT_INT(uint16_set) +KHASH_SET_INIT_UINT16(uint16_set) KHASH_MAP_INIT_INT8(int8, size_t) -KHASH_SET_INIT_INT(int8_set) +KHASH_SET_INIT_INT8(int8_set) KHASH_MAP_INIT_UINT8(uint8, size_t) -KHASH_SET_INIT_INT(uint8_set) +KHASH_SET_INIT_UINT8(uint8_set) #endif /* __AC_KHASH_H */ From 7486a771d743e7c2d38a5eee0c7c3e331af7e3eb Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 4 Dec 2023 17:40:54 -0800 Subject: [PATCH 5/8] whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5ee2bb1778cb1..5d156d4cbcaaf 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -464,6 +464,7 @@ Performance improvements - Performance improvement in :meth:`Series.str` methods (:issue:`55736`) - Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`) - Performance improvement in :meth:`DataFrameGroupBy.nunique` and :meth:`SeriesGroupBy.nunique` (:issue:`55972`) +- Performance improvement in :meth:`Series.isin` (:issue:`39799`) - Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - Performance improvement when indexing into a non-unique index (:issue:`55816`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) From 3ff32e256de002a137864d80dab90193a54eb4e5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 4 Dec 2023 22:12:57 -0800 Subject: [PATCH 6/8] try set for duplicated --- pandas/_libs/hashtable_func_helper.pxi.in | 51 ++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 616c9cd084043..e3627a86ec4c7 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -41,7 +41,6 @@ cdef extern from "pandas/vendored/klib/khash.h": kh_{{dtype}}_set_t* kh_init_{{dtype}}_set() nogil void kh_destroy_{{dtype}}_set(kh_{{dtype}}_set_t*) nogil void kh_clear_{{dtype}}_set(kh_{{dtype}}_set_t*) nogil - khuint_t kh_get_{{dtype}}_set(kh_{{dtype}}_set_t*, {{c_type}}) nogil void kh_resize_{{dtype}}_set(kh_{{dtype}}_set_t*, khuint_t) nogil khuint_t kh_put_{{dtype}}_set(kh_{{dtype}}_set_t*, {{c_type}}, int*) nogil void kh_del_{{dtype}}_set(kh_{{dtype}}_set_t*, khuint_t) nogil @@ -164,7 +163,11 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons bint seen_na = False, uses_mask = mask is not None bint seen_multiple_na = False + {{if dtype == "object"}} kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) + {{else}} + kh_resize_{{ttype}}_set(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) + {{endif}} if keep not in ('last', 'first', False): raise ValueError('keep must be either "first", "last" or False') @@ -173,6 +176,21 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons {{cond}} keep == {{keep}}: {{if dtype == 'object'}} if True: + {{if keep == '"last"'}} + for i in range(n - 1, -1, -1): + {{else}} + for i in range(n): + {{endif}} + if uses_mask and mask[i]: + if seen_na: + out[i] = True + else: + out[i] = False + seen_na = True + else: + value = {{to_c_type}}(values[i]) + kh_put_{{ttype}}(table, value, &ret) + out[i] = ret == 0 {{else}} with nogil: {{endif}} @@ -189,16 +207,13 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons seen_na = True else: value = {{to_c_type}}(values[i]) - kh_put_{{ttype}}(table, value, &ret) + kh_put_{{ttype}}_set(table, value, &ret) out[i] = ret == 0 {{endfor}} else: {{if dtype == 'object'}} if True: - {{else}} - with nogil: - {{endif}} for i in range(n): if uses_mask and mask[i]: if not seen_na: @@ -222,6 +237,32 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons k = kh_put_{{ttype}}(table, value, &ret) table.vals[k] = i out[i] = 0 + {{else}} + with nogil: + for i in range(n): + if uses_mask and mask[i]: + if not seen_na: + first_na = i + seen_na = True + out[i] = 0 + elif not seen_multiple_na: + out[i] = 1 + out[first_na] = 1 + seen_multiple_na = True + else: + out[i] = 1 + + else: + value = {{to_c_type}}(values[i]) + k = kh_exist_{{ttype}}(table, value) + if k: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_{{ttype}}_set(table, value, &ret) + table.vals[k] = i + out[i] = 0 + {{endif}} kh_destroy_{{ttype}}(table) return out From 8c418e74d6e4ab5150510ee69df284d478279756 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 5 Dec 2023 17:37:29 -0800 Subject: [PATCH 7/8] compiler warning fixup --- pandas/_libs/hashtable_func_helper.pxi.in | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index e3627a86ec4c7..907dd6adb4a32 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -158,14 +158,15 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons {{endif}} Py_ssize_t i, n = len(values), first_na = -1 khiter_t k - kh_{{ttype}}_t *table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') bint seen_na = False, uses_mask = mask is not None bint seen_multiple_na = False {{if dtype == "object"}} + cdef kh_{{ttype}}_t *table = kh_init_{{ttype}}() kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) {{else}} + cdef kh_{{ttype}}_set_t *table = kh_init_{{ttype}}() kh_resize_{{ttype}}_set(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) {{endif}} @@ -264,7 +265,11 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons out[i] = 0 {{endif}} + {{if dtype == "object"}} kh_destroy_{{ttype}}(table) + {{else}} + kh_destroy_{{ttype}}_set(table) + {{endif}} return out From c36a0ba163671e948eb28e75a14ee9b9b3d2511a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 5 Dec 2023 17:47:40 -0800 Subject: [PATCH 8/8] more fix --- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 907dd6adb4a32..b08c89ae654df 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -166,7 +166,7 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons cdef kh_{{ttype}}_t *table = kh_init_{{ttype}}() kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) {{else}} - cdef kh_{{ttype}}_set_t *table = kh_init_{{ttype}}() + cdef kh_{{ttype}}_set_t *table = kh_init_{{ttype}}_set() kh_resize_{{ttype}}_set(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) {{endif}}