diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 75c273b35ee7d..7b630c264753f 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,13 +1,27 @@ from numpy cimport intp_t, ndarray from pandas._libs.khash cimport ( + float32_t, float64_t, + int8_t, + int16_t, + int32_t, int64_t, + kh_float32_t, kh_float64_t, + kh_int8_t, + kh_int16_t, + kh_int32_t, kh_int64_t, kh_pymap_t, kh_str_t, + kh_uint8_t, + kh_uint16_t, + kh_uint32_t, kh_uint64_t, + uint8_t, + uint16_t, + uint32_t, uint64_t, ) @@ -28,12 +42,54 @@ cdef class Int64HashTable(HashTable): cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) +cdef class UInt32HashTable(HashTable): + cdef kh_uint32_t *table + + cpdef get_item(self, uint32_t val) + cpdef set_item(self, uint32_t key, Py_ssize_t val) + +cdef class Int32HashTable(HashTable): + cdef kh_int32_t *table + + cpdef get_item(self, int32_t val) + cpdef set_item(self, int32_t key, Py_ssize_t val) + +cdef class UInt16HashTable(HashTable): + cdef kh_uint16_t *table + + cpdef get_item(self, uint16_t val) + cpdef set_item(self, uint16_t key, Py_ssize_t val) + +cdef class Int16HashTable(HashTable): + cdef kh_int16_t *table + + cpdef get_item(self, int16_t val) + cpdef set_item(self, int16_t key, Py_ssize_t val) + +cdef class UInt8HashTable(HashTable): + cdef kh_uint8_t *table + + cpdef get_item(self, uint8_t val) + cpdef set_item(self, uint8_t key, Py_ssize_t val) + +cdef class Int8HashTable(HashTable): + cdef kh_int8_t *table + + cpdef get_item(self, int8_t val) + cpdef set_item(self, int8_t key, Py_ssize_t val) + cdef class Float64HashTable(HashTable): cdef kh_float64_t *table cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) +cdef class Float32HashTable(HashTable): + cdef kh_float32_t *table + + cpdef get_item(self, float32_t val) + cpdef set_item(self, float32_t key, Py_ssize_t val) + cdef class PyObjectHashTable(HashTable): cdef kh_pymap_t *table diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 5a0cddb0af197..cc080a87cfb5b 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -13,45 +13,7 @@ cnp.import_array() from pandas._libs cimport util -from pandas._libs.khash cimport ( - kh_destroy_float64, - kh_destroy_int64, - kh_destroy_pymap, - kh_destroy_str, - kh_destroy_uint64, - kh_exist_float64, - kh_exist_int64, - kh_exist_pymap, - kh_exist_str, - kh_exist_uint64, - kh_float64_t, - kh_get_float64, - kh_get_int64, - kh_get_pymap, - kh_get_str, - kh_get_strbox, - kh_get_uint64, - kh_init_float64, - kh_init_int64, - kh_init_pymap, - kh_init_str, - kh_init_strbox, - kh_init_uint64, - kh_int64_t, - kh_put_float64, - kh_put_int64, - kh_put_pymap, - kh_put_str, - kh_put_strbox, - kh_put_uint64, - kh_resize_float64, - kh_resize_int64, - kh_resize_pymap, - kh_resize_str, - kh_resize_uint64, - kh_str_t, - khiter_t, -) +from pandas._libs.khash cimport kh_str_t, khiter_t from pandas._libs.missing cimport checknull diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index da91fa69b0dec..f7001c165870e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,6 +5,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ +{{py: + +# name +cimported_types = ['float32', + 'float64', + 'int8', + 'int16', + 'int32', + 'int64', + 'pymap', + 'str', + 'strbox', + 'uint8', + 'uint16', + 'uint32', + 'uint64'] +}} + +{{for name in cimported_types}} +from pandas._libs.khash cimport ( + kh_destroy_{{name}}, + kh_exist_{{name}}, + kh_get_{{name}}, + kh_init_{{name}}, + kh_put_{{name}}, + kh_resize_{{name}}, +) +{{endfor}} + # ---------------------------------------------------------------------- # VectorData # ---------------------------------------------------------------------- @@ -20,9 +49,16 @@ from pandas._libs.missing cimport C_NA # for uniques in hashtables) dtypes = [('Float64', 'float64', 'float64_t'), + ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), + ('Int32', 'int32', 'int32_t'), + ('Int16', 'int16', 'int16_t'), + ('Int8', 'int8', 'int8_t'), ('String', 'string', 'char *'), - ('UInt64', 'uint64', 'uint64_t')] + ('UInt64', 'uint64', 'uint64_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('UInt8', 'uint8', 'uint8_t')] }} {{for name, dtype, c_type in dtypes}} @@ -49,8 +85,15 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data, ctypedef fused vector_data: Int64VectorData + Int32VectorData + Int16VectorData + Int8VectorData UInt64VectorData + UInt32VectorData + UInt16VectorData + UInt8VectorData Float64VectorData + Float32VectorData StringVectorData cdef inline bint needs_resize(vector_data *data) nogil: @@ -65,7 +108,14 @@ cdef inline bint needs_resize(vector_data *data) nogil: # name, dtype, c_type dtypes = [('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), - ('Int64', 'int64', 'int64_t')] + ('Int64', 'int64', 'int64_t'), + ('Float32', 'float32', 'float32_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('Int32', 'int32', 'int32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('Int16', 'int16', 'int16_t'), + ('UInt8', 'uint8', 'uint8_t'), + ('Int8', 'int8', 'int8_t')] }} @@ -253,15 +303,22 @@ cdef class HashTable: {{py: -# name, dtype, float_group, default_na_value -dtypes = [('Float64', 'float64', True, 'np.nan'), - ('UInt64', 'uint64', False, 0), - ('Int64', 'int64', False, 'NPY_NAT')] +# name, dtype, float_group +dtypes = [('Float64', 'float64', True), + ('UInt64', 'uint64', False), + ('Int64', 'int64', False), + ('Float32', 'float32', True), + ('UInt32', 'uint32', False), + ('Int32', 'int32', False), + ('UInt16', 'uint16', False), + ('Int16', 'int16', False), + ('UInt8', 'uint8', False), + ('Int8', 'int8', False)] }} -{{for name, dtype, float_group, default_na_value in dtypes}} +{{for name, dtype, float_group in dtypes}} cdef class {{name}}HashTable(HashTable): @@ -430,7 +487,7 @@ cdef class {{name}}HashTable(HashTable): # which is only used if it's *specified*. na_value2 = <{{dtype}}_t>na_value else: - na_value2 = {{default_na_value}} + na_value2 = 0 with nogil: for i in range(n): diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 4a466ada765ca..7c5afa4ff6b27 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -8,9 +8,16 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # dtype, ttype, c_type dtypes = [('float64', 'float64', 'float64_t'), + ('float32', 'float32', 'float32_t'), ('uint64', 'uint64', 'uint64_t'), + ('uint32', 'uint32', 'uint32_t'), + ('uint16', 'uint16', 'uint16_t'), + ('uint8', 'uint8', 'uint8_t'), ('object', 'pymap', 'object'), - ('int64', 'int64', 'int64_t')] + ('int64', 'int64', 'int64_t'), + ('int32', 'int32', 'int32_t'), + ('int16', 'int16', 'int16_t'), + ('int8', 'int8', 'int8_t')] }} @@ -54,7 +61,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, for i in range(n): val = values[i] - {{if dtype == 'float64'}} + {{if dtype == 'float64' or dtype == 'float32'}} if val == val or not dropna: {{else}} if True: @@ -275,8 +282,15 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): # dtype, ctype, table_type, npy_dtype dtypes = [('float64', 'float64_t', 'float64', 'float64'), + ('float32', 'float32_t', 'float32', 'float32'), ('int64', 'int64_t', 'int64', 'int64'), + ('int32', 'int32_t', 'int32', 'int32'), + ('int16', 'int16_t', 'int16', 'int16'), + ('int8', 'int8_t', 'int8', 'int8'), ('uint64', 'uint64_t', 'uint64', 'uint64'), + ('uint32', 'uint32_t', 'uint32', 'uint32'), + ('uint16', 'uint16_t', 'uint16', 'uint16'), + ('uint8', 'uint8_t', 'uint8', 'uint8'), ('object', 'object', 'pymap', 'object_')] }} diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 1bb3a158b4b1a..8b082747bf22b 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,5 +1,16 @@ from cpython.object cimport PyObject -from numpy cimport float64_t, int32_t, int64_t, uint32_t, uint64_t +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) cdef extern from "khash_python.h": @@ -67,72 +78,6 @@ cdef extern from "khash_python.h": void kh_destroy_str_starts(kh_str_starts_t*) nogil void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil - ctypedef struct kh_int64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - int64_t *keys - size_t *vals - - kh_int64_t* kh_init_int64() nogil - void kh_destroy_int64(kh_int64_t*) nogil - void kh_clear_int64(kh_int64_t*) nogil - khint_t kh_get_int64(kh_int64_t*, int64_t) nogil - void kh_resize_int64(kh_int64_t*, khint_t) nogil - khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil - void kh_del_int64(kh_int64_t*, khint_t) nogil - - bint kh_exist_int64(kh_int64_t*, khiter_t) nogil - - ctypedef uint64_t khuint64_t - - ctypedef struct kh_uint64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - khuint64_t *keys - size_t *vals - - kh_uint64_t* kh_init_uint64() nogil - void kh_destroy_uint64(kh_uint64_t*) nogil - void kh_clear_uint64(kh_uint64_t*) nogil - khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil - void kh_resize_uint64(kh_uint64_t*, khint_t) nogil - khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil - void kh_del_uint64(kh_uint64_t*, khint_t) nogil - - bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil - - ctypedef struct kh_float64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - float64_t *keys - size_t *vals - - kh_float64_t* kh_init_float64() nogil - void kh_destroy_float64(kh_float64_t*) nogil - void kh_clear_float64(kh_float64_t*) nogil - khint_t kh_get_float64(kh_float64_t*, float64_t) nogil - void kh_resize_float64(kh_float64_t*, khint_t) nogil - khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil - void kh_del_float64(kh_float64_t*, khint_t) nogil - - bint kh_exist_float64(kh_float64_t*, khiter_t) nogil - - ctypedef struct kh_int32_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - int32_t *keys - size_t *vals - - kh_int32_t* kh_init_int32() nogil - void kh_destroy_int32(kh_int32_t*) nogil - void kh_clear_int32(kh_int32_t*) nogil - khint_t kh_get_int32(kh_int32_t*, int32_t) nogil - void kh_resize_int32(kh_int32_t*, khint_t) nogil - khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil - void kh_del_int32(kh_int32_t*, khint_t) nogil - - bint kh_exist_int32(kh_int32_t*, khiter_t) nogil - # sweep factorize ctypedef struct kh_strbox_t: @@ -150,3 +95,5 @@ cdef extern from "khash_python.h": void kh_del_strbox(kh_strbox_t*, khint_t) nogil bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil + +include "khash_for_primitive_helper.pxi" diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in new file mode 100644 index 0000000000000..db8d3e0b19417 --- /dev/null +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -0,0 +1,42 @@ +""" +Template for wrapping khash-tables for each primitive `dtype` + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +{{py: + +# name, c_type +primitive_types = [('int64', 'int64_t'), + ('uint64', 'uint64_t'), + ('float64', 'float64_t'), + ('int32', 'int32_t'), + ('uint32', 'uint32_t'), + ('float32', 'float32_t'), + ('int16', 'int16_t'), + ('uint16', 'uint16_t'), + ('int8', 'int8_t'), + ('uint8', 'uint8_t'), + ] +}} + +{{for name, c_type in primitive_types}} + +cdef extern from "khash_python.h": + ctypedef struct kh_{{name}}_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + {{c_type}} *keys + size_t *vals + + kh_{{name}}_t* kh_init_{{name}}() nogil + void kh_destroy_{{name}}(kh_{{name}}_t*) nogil + void kh_clear_{{name}}(kh_{{name}}_t*) nogil + khint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil + void kh_resize_{{name}}(kh_{{name}}_t*, khint_t) nogil + khint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil + void kh_del_{{name}}(kh_{{name}}_t*, khint_t) nogil + + bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil + +{{endfor}} diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 61a4e80ea8cbc..ecd15d1893c23 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -122,14 +122,23 @@ typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX -typedef unsigned long khuint64_t; -typedef signed long khint64_t; +typedef unsigned long khint64_t; #else -typedef unsigned long long khuint64_t; -typedef signed long long khint64_t; +typedef unsigned long long khint64_t; +#endif + +#if UINT_MAX == 0xffffu +typedef unsigned int khint16_t; +#elif USHRT_MAX == 0xffffu +typedef unsigned short khint16_t; +#endif + +#if UCHAR_MAX == 0xffu +typedef unsigned char khint8_t; #endif typedef double khfloat64_t; +typedef double khfloat32_t; typedef khint32_t khint_t; typedef khint_t khiter_t; @@ -588,15 +597,25 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ + +// we implicitly convert signed int to unsigned int, thus potential overflows +// for operations (<<,*,+) don't trigger undefined behavior, also >>-operator +// is implementation defined for signed ints if sign-bit is set. +// because we never really "get" the keys, there will be no convertion from +// unsigend int to (signed) int (which would be implementation defined behavior) +// this holds also for 64-, 16- and 8-bit integers #define KHASH_MAP_INIT_INT(name, khval_t) \ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_UINT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -607,11 +626,34 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) +/*! @function + @abstract Instantiate a hash map containing 16bit-integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +#define KHASH_MAP_INIT_UINT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 8bit-integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +#define KHASH_MAP_INIT_UINT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + + typedef const char *kh_cstr_t; /*! @function @@ -634,12 +676,23 @@ typedef const char *kh_cstr_t; #define kh_exist_float64(h, k) (kh_exist(h, k)) #define kh_exist_uint64(h, k) (kh_exist(h, k)) #define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_float32(h, k) (kh_exist(h, k)) #define kh_exist_int32(h, k) (kh_exist(h, k)) +#define kh_exist_uint32(h, k) (kh_exist(h, k)) +#define kh_exist_int16(h, k) (kh_exist(h, k)) +#define kh_exist_uint16(h, k) (kh_exist(h, k)) +#define kh_exist_int8(h, k) (kh_exist(h, k)) +#define kh_exist_uint8(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_UINT(uint32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) KHASH_MAP_INIT_UINT64(uint64, size_t) +KHASH_MAP_INIT_INT16(int16, size_t) +KHASH_MAP_INIT_UINT16(uint16, size_t) +KHASH_MAP_INIT_INT16(int8, size_t) +KHASH_MAP_INIT_UINT16(uint8, size_t) #endif /* __AC_KHASH_H */ diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index aebc229abddd2..c37f0e950baa7 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -23,6 +23,12 @@ khint64_t PANDAS_INLINE asint64(double key) { return val; } +khint32_t PANDAS_INLINE asint32(float key) { + khint32_t val; + memcpy(&val, &key, sizeof(float)); + return val; +} + #define ZERO_HASH 0 #define NAN_HASH 0 @@ -39,13 +45,31 @@ khint32_t PANDAS_INLINE kh_float64_hash_func(double val){ return murmur2_64to32(as_int); } -#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) +khint32_t PANDAS_INLINE kh_float32_hash_func(float val){ + // 0.0 and -0.0 should have the same hash: + if (val == 0.0f){ + return ZERO_HASH; + } + // all nans should have the same hash: + if ( val!=val ){ + return NAN_HASH; + } + khint32_t as_int = asint32(val); + return murmur2_32to32(as_int); +} + +#define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ - KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal) + KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal) KHASH_MAP_INIT_FLOAT64(float64, size_t) +#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ + KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal) + +KHASH_MAP_INIT_FLOAT32(float32, size_t) + int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py new file mode 100644 index 0000000000000..5ef110e9672f0 --- /dev/null +++ b/pandas/tests/libs/test_hashtable.py @@ -0,0 +1,265 @@ +import numpy as np +import pytest + +from pandas._libs import hashtable as ht + +import pandas._testing as tm + + +@pytest.mark.parametrize( + "table_type, dtype", + [ + (ht.Int64HashTable, np.int64), + (ht.UInt64HashTable, np.uint64), + (ht.Float64HashTable, np.float64), + (ht.Int32HashTable, np.int32), + (ht.UInt32HashTable, np.uint32), + (ht.Float32HashTable, np.float32), + (ht.Int16HashTable, np.int16), + (ht.UInt16HashTable, np.uint16), + (ht.Int8HashTable, np.int8), + (ht.UInt8HashTable, np.uint8), + ], +) +class TestHashTable: + def test_get_set_contains_len(self, table_type, dtype): + index = 5 + table = table_type(55) + assert len(table) == 0 + assert index not in table + + table.set_item(index, 42) + assert len(table) == 1 + assert index in table + assert table.get_item(index) == 42 + + table.set_item(index + 1, 41) + assert index in table + assert index + 1 in table + assert len(table) == 2 + assert table.get_item(index) == 42 + assert table.get_item(index + 1) == 41 + + table.set_item(index, 21) + assert index in table + assert index + 1 in table + assert len(table) == 2 + assert table.get_item(index) == 21 + assert table.get_item(index + 1) == 41 + assert index + 2 not in table + + with pytest.raises(KeyError) as excinfo: + table.get_item(index + 2) + assert str(index + 2) in str(excinfo.value) + + def test_map(self, table_type, dtype): + N = 77 + table = table_type() + keys = np.arange(N).astype(dtype) + vals = np.arange(N).astype(np.int64) + N + table.map(keys, vals) + for i in range(N): + assert table.get_item(keys[i]) == i + N + + def test_map_locations(self, table_type, dtype): + N = 8 + table = table_type() + keys = (np.arange(N) + N).astype(dtype) + table.map_locations(keys) + for i in range(N): + assert table.get_item(keys[i]) == i + + def test_lookup(self, table_type, dtype): + N = 3 + table = table_type() + keys = (np.arange(N) + N).astype(dtype) + table.map_locations(keys) + result = table.lookup(keys) + expected = np.arange(N) + tm.assert_numpy_array_equal(result.astype(np.int64), expected.astype(np.int64)) + + def test_lookup_wrong(self, table_type, dtype): + if dtype in (np.int8, np.uint8): + N = 100 + else: + N = 512 + table = table_type() + keys = (np.arange(N) + N).astype(dtype) + table.map_locations(keys) + wrong_keys = np.arange(N).astype(dtype) + result = table.lookup(wrong_keys) + assert np.all(result == -1) + + def test_unique(self, table_type, dtype): + if dtype in (np.int8, np.uint8): + N = 88 + else: + N = 1000 + table = table_type() + expected = (np.arange(N) + N).astype(dtype) + keys = np.repeat(expected, 5) + unique = table.unique(keys) + tm.assert_numpy_array_equal(unique, expected) + + +@pytest.mark.parametrize( + "table_type, dtype", + [ + (ht.Float64HashTable, np.float64), + (ht.Float32HashTable, np.float32), + ], +) +class TestHashTableWithNans: + def test_get_set_contains_len(self, table_type, dtype): + index = float("nan") + table = table_type() + assert index not in table + + table.set_item(index, 42) + assert len(table) == 1 + assert index in table + assert table.get_item(index) == 42 + + table.set_item(index, 41) + assert len(table) == 1 + assert index in table + assert table.get_item(index) == 41 + + def test_map(self, table_type, dtype): + N = 332 + table = table_type() + keys = np.full(N, np.nan, dtype=dtype) + vals = (np.arange(N) + N).astype(np.int64) + table.map(keys, vals) + assert len(table) == 1 + assert table.get_item(np.nan) == 2 * N - 1 + + def test_map_locations(self, table_type, dtype): + N = 10 + table = table_type() + keys = np.full(N, np.nan, dtype=dtype) + table.map_locations(keys) + assert len(table) == 1 + assert table.get_item(np.nan) == N - 1 + + def test_unique(self, table_type, dtype): + N = 1020 + table = table_type() + keys = np.full(N, np.nan, dtype=dtype) + unique = table.unique(keys) + assert np.all(np.isnan(unique)) and len(unique) == 1 + + +def get_ht_function(fun_name, type_suffix): + return getattr(ht, fun_name + "_" + type_suffix) + + +@pytest.mark.parametrize( + "dtype, type_suffix", + [ + (np.int64, "int64"), + (np.uint64, "uint64"), + (np.float64, "float64"), + (np.int32, "int32"), + (np.uint32, "uint32"), + (np.float32, "float32"), + (np.int16, "int16"), + (np.uint16, "uint16"), + (np.int8, "int8"), + (np.uint8, "uint8"), + ], +) +class TestHelpFunctions: + def test_value_count(self, dtype, type_suffix): + N = 43 + value_count = get_ht_function("value_count", type_suffix) + expected = (np.arange(N) + N).astype(dtype) + values = np.repeat(expected, 5) + keys, counts = value_count(values, False) + tm.assert_numpy_array_equal(np.sort(keys), expected) + assert np.all(counts == 5) + + def test_duplicated_first(self, dtype, type_suffix): + N = 100 + duplicated = get_ht_function("duplicated", type_suffix) + values = np.repeat(np.arange(N).astype(dtype), 5) + result = duplicated(values) + expected = np.ones_like(values, dtype=np.bool_) + expected[::5] = False + tm.assert_numpy_array_equal(result, expected) + + def test_ismember_yes(self, dtype, type_suffix): + N = 127 + ismember = get_ht_function("ismember", type_suffix) + arr = np.arange(N).astype(dtype) + values = np.arange(N).astype(dtype) + result = ismember(arr, values) + expected = np.ones_like(values, dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + def test_ismember_no(self, dtype, type_suffix): + N = 17 + ismember = get_ht_function("ismember", type_suffix) + arr = np.arange(N).astype(dtype) + values = (np.arange(N) + N).astype(dtype) + result = ismember(arr, values) + expected = np.zeros_like(values, dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + def test_mode(self, dtype, type_suffix): + if dtype in (np.int8, np.uint8): + N = 53 + else: + N = 11111 + mode = get_ht_function("mode", type_suffix) + values = np.repeat(np.arange(N).astype(dtype), 5) + values[0] = 42 + result = mode(values, False) + assert result == 42 + + +@pytest.mark.parametrize( + "dtype, type_suffix", + [ + (np.float64, "float64"), + (np.float32, "float32"), + ], +) +class TestHelpFunctionsWithNans: + def test_value_count(self, dtype, type_suffix): + value_count = get_ht_function("value_count", type_suffix) + values = np.array([np.nan, np.nan, np.nan], dtype=dtype) + keys, counts = value_count(values, True) + assert len(keys) == 0 + keys, counts = value_count(values, False) + assert len(keys) == 1 and np.all(np.isnan(keys)) + assert counts[0] == 3 + + def test_duplicated_first(self, dtype, type_suffix): + duplicated = get_ht_function("duplicated", type_suffix) + values = np.array([np.nan, np.nan, np.nan], dtype=dtype) + result = duplicated(values) + expected = np.array([False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_ismember_yes(self, dtype, type_suffix): + ismember = get_ht_function("ismember", type_suffix) + arr = np.array([np.nan, np.nan, np.nan], dtype=dtype) + values = np.array([np.nan, np.nan], dtype=dtype) + result = ismember(arr, values) + expected = np.array([True, True, True], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + def test_ismember_no(self, dtype, type_suffix): + ismember = get_ht_function("ismember", type_suffix) + arr = np.array([np.nan, np.nan, np.nan], dtype=dtype) + values = np.array([1], dtype=dtype) + result = ismember(arr, values) + expected = np.array([False, False, False], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + def test_mode(self, dtype, type_suffix): + mode = get_ht_function("mode", type_suffix) + values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype) + assert mode(values, True) == 42 + assert np.isnan(mode(values, False)) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 34b7d0e73e914..3b6f5d145b500 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1517,6 +1517,7 @@ def test_get_unique(self): (ht.StringHashTable, ht.ObjectVector, "object", True), (ht.Float64HashTable, ht.Float64Vector, "float64", False), (ht.Int64HashTable, ht.Int64Vector, "int64", False), + (ht.Int32HashTable, ht.Int32Vector, "int32", False), (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False), ], ) @@ -1640,6 +1641,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): ht.StringHashTable, ht.Float64HashTable, ht.Int64HashTable, + ht.Int32HashTable, ht.UInt64HashTable, ], ) diff --git a/setup.py b/setup.py index 78a789c808efb..9f33c045df6ed 100755 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ def is_platform_mac(): "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", + "_libs/khash_for_primitive_helper.pxi.in", ], "index": ["_libs/index_class_helper.pxi.in"], "sparse": ["_libs/sparse_op_helper.pxi.in"], @@ -525,7 +526,10 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "_libs.hashtable": { "pyxfile": "_libs/hashtable", "include": klib_include, - "depends": (["pandas/_libs/src/klib/khash_python.h"] + _pxi_dep["hashtable"]), + "depends": ( + ["pandas/_libs/src/klib/khash_python.h", "pandas/_libs/src/klib/khash.h"] + + _pxi_dep["hashtable"] + ), }, "_libs.index": { "pyxfile": "_libs/index",