diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 52b1a3aae788c..7920f05b5e7a1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -413,7 +413,7 @@ def _value_counts_arraylike(values, dropna=True): freq = values.freq values = values.view(np.int64) - keys, counts = htable.value_count_scalar64(values, dropna) + keys, counts = htable.value_count_int64(values, dropna) if dropna: msk = keys != iNaT @@ -434,10 +434,10 @@ def _value_counts_arraylike(values, dropna=True): elif is_integer_dtype(dtype): values = _ensure_int64(values) - keys, counts = htable.value_count_scalar64(values, dropna) + keys, counts = htable.value_count_int64(values, dropna) elif is_float_dtype(dtype): values = _ensure_float64(values) - keys, counts = htable.value_count_scalar64(values, dropna) + keys, counts = htable.value_count_float64(values, dropna) else: values = _ensure_object(values) mask = isnull(values) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 6179857978b7b..f14f9b5dd24af 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4399,7 +4399,7 @@ def _groupby_indices(values): # bit better than factorizing again reverse = dict(enumerate(values.categories)) codes = values.codes.astype('int64') - _, counts = _hash.value_count_scalar64(codes, False) + _, counts = _hash.value_count_int64(codes, False) else: reverse, codes, counts = _algos.group_labels( _values_from_object(_ensure_object(values))) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 18e54621e8bf5..d1b6b326d7de6 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -35,773 +35,9 @@ cdef extern from "Python.h": cdef size_t _INIT_VEC_CAP = 32 -cdef class ObjectVector: - - cdef: - PyObject **data - size_t n, m - ndarray ao - - def __cinit__(self): - self.n = 0 - self.m = _INIT_VEC_CAP - self.ao = np.empty(_INIT_VEC_CAP, dtype=object) - self.data = self.ao.data - - def __len__(self): - return self.n - - cdef inline append(self, object 
o): - if self.n == self.m: - self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) - self.data = self.ao.data - - Py_INCREF(o) - self.data[self.n] = o - self.n += 1 - - def to_array(self): - self.ao.resize(self.n) - self.m = self.n - return self.ao - -ctypedef struct Int64VectorData: - int64_t *data - size_t n, m - -ctypedef struct Float64VectorData: - float64_t *data - size_t n, m - -ctypedef fused vector_data: - Int64VectorData - Float64VectorData - -ctypedef fused sixty_four_bit_scalar: - int64_t - float64_t - -cdef bint needs_resize(vector_data *data) nogil: - return data.n == data.m - -cdef void append_data(vector_data *data, sixty_four_bit_scalar x) nogil: - - # compile time specilization of the fused types - # as the cross-product is generated, but we cannot assign float->int - # the types that don't pass are pruned - if (vector_data is Int64VectorData and sixty_four_bit_scalar is int64_t) or ( - vector_data is Float64VectorData and sixty_four_bit_scalar is float64_t): - - data.data[data.n] = x - data.n += 1 - -cdef class Int64Vector: - - cdef: - Int64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc(sizeof(Int64VectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.int64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def __dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, int64_t x): - - if needs_resize(self.data): - self.resize() - - append_data(self.data, x) - -cdef class Float64Vector: - - cdef: - Float64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc(sizeof(Float64VectorData)) - if not self.data: - 
raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.float64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def __dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, float64_t x): - - if needs_resize(self.data): - self.resize() - - append_data(self.data, x) - -cdef class HashTable: - pass - -cdef class StringHashTable(HashTable): - cdef kh_str_t *table - - def __cinit__(self, int size_hint=1): - self.table = kh_init_str() - if size_hint is not None: - kh_resize_str(self.table, size_hint) - - def __dealloc__(self): - kh_destroy_str(self.table) - - cpdef get_item(self, object val): - cdef khiter_t k - k = kh_get_str(self.table, util.get_c_string(val)) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - for i in range(iterations): - k = kh_get_str(self.table, util.get_c_string(key)) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - buf = util.get_c_string(key) - - k = kh_put_str(self.table, buf, &ret) - self.table.keys[k] = key - if kh_exist_str(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def get_indexer(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - char *buf - int64_t *resbuf = labels.data - khiter_t k - kh_str_t *table = self.table - - for i in range(n): - buf = util.get_c_string(values[i]) - k = kh_get_str(table, buf) - if k != table.n_buckets: - resbuf[i] = 
table.vals[k] - else: - resbuf[i] = -1 - return labels - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - char *buf - khiter_t k - ObjectVector uniques = ObjectVector() - - for i in range(n): - val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k == self.table.n_buckets: - kh_put_str(self.table, buf, &ret) - uniques.append(val) - - return uniques.to_array() - - def factorize(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - dict reverse = {} - Py_ssize_t idx, count = 0 - int ret = 0 - object val - char *buf - khiter_t k - - for i in range(n): - val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_str(self.table, buf, &ret) - # print 'putting %s, %s' % (val, count) - - self.table.vals[k] = count - reverse[count] = val - labels[i] = count - count += 1 - - return reverse, labels - -cdef class Int64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_int64() - if size_hint is not None: - kh_resize_int64(self.table, size_hint) - - def __len__(self): - return self.table.size - - def __dealloc__(self): - kh_destroy_int64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_int64(self.table, key) - return k != self.table.n_buckets - - cpdef get_item(self, int64_t val): - cdef khiter_t k - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, int64_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, int64_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - 
k = kh_put_int64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_int64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - @cython.boundscheck(False) - def map(self, int64_t[:] keys, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t key - khiter_t k - - with nogil: - for i in range(n): - key = keys[i] - k = kh_put_int64(self.table, key, &ret) - self.table.vals[k] = values[i] - - @cython.boundscheck(False) - def map_locations(self, ndarray[int64_t, ndim=1] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - - with nogil: - for i in range(n): - val = values[i] - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def factorize(self, ndarray[object] values): - reverse = {} - labels = self.get_labels(values, reverse, 0, 0) - return reverse, labels - - @cython.boundscheck(False) - def get_labels(self, int64_t[:] values, Int64Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - int64_t val - khiter_t k - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - - if check_null and val == iNaT: - labels[i] = na_sentinel - continue - - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = 
count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def get_labels_groupby(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = 0 - int ret = 0 - int64_t val - khiter_t k - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - # specific for groupby - if val < 0: - labels[i] = -1 - continue - - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - labels[i] = count - count += 1 - - arr_uniques = uniques.to_array() - - return np.asarray(labels), arr_uniques - - @cython.boundscheck(False) - def unique(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k == self.table.n_buckets: - kh_put_int64(self.table, val, &ret) - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - - return uniques.to_array() - - -cdef class Float64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_float64() - if size_hint is not None: - kh_resize_float64(self.table, size_hint) - - def __len__(self): - return self.table.size - - cpdef get_item(self, float64_t val): - cdef khiter_t k - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - cpdef set_item(self, float64_t key, Py_ssize_t val): - cdef: - khiter_t k - int 
ret = 0 - - k = kh_put_float64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_float64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def __dealloc__(self): - kh_destroy_float64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_float64(self.table, key) - return k != self.table.n_buckets - - def factorize(self, float64_t[:] values): - uniques = Float64Vector() - labels = self.get_labels(values, uniques, 0, -1, 1) - return uniques.to_array(), labels - - @cython.boundscheck(False) - def get_labels(self, float64_t[:] values, - Float64Vector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - float64_t val - khiter_t k - Float64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if check_null and val != val: - labels[i] = na_sentinel - continue - - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def map_locations(self, ndarray[float64_t, ndim=1] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - khiter_t k - - with nogil: - for i in range(n): - k = kh_put_float64(self.table, values[i], &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - 
locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - @cython.boundscheck(False) - def unique(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - bint seen_na = 0 - Float64Vector uniques = Float64Vector() - Float64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_float64(self.table, val) - if k == self.table.n_buckets: - kh_put_float64(self.table, val, &ret) - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - - elif not seen_na: - seen_na = 1 - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, NAN) - - return uniques.to_array() - -na_sentinel = object - -cdef class PyObjectHashTable(HashTable): - - def __init__(self, size_hint=1): - self.table = kh_init_pymap() - kh_resize_pymap(self.table, size_hint) - - def __dealloc__(self): - if self.table is not NULL: - self.destroy() - - def __len__(self): - return self.table.size - - def __contains__(self, object key): - cdef khiter_t k - hash(key) - if key != key or key is None: - key = na_sentinel - k = kh_get_pymap(self.table, key) - return k != self.table.n_buckets - - def destroy(self): - kh_destroy_pymap(self.table) - self.table = NULL - - cpdef get_item(self, object val): - cdef khiter_t k - if val != val or val is None: - val = na_sentinel - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - if key != key or key is None: - key = na_sentinel - for i in range(iterations): - k = kh_get_pymap(self.table, key) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - hash(key) - if key != key or key is None: - key = na_sentinel - k = 
kh_put_pymap(self.table, key, &ret) - # self.table.keys[k] = key - if kh_exist_pymap(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def map_locations(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = i - - def lookup(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - bint seen_na = 0 - - for i in range(n): - val = values[i] - hash(val) - if not _checknan(val): - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - elif not seen_na: - seen_na = 1 - uniques.append(nan) - - return uniques.to_array() - - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - object val - khiter_t k - - labels = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - - if check_null and val != val or val is None: - labels[i] = na_sentinel - continue - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_pymap(self.table, val, &ret) 
- self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 - - return np.asarray(labels) +include "hashtable_class_helper.pxi" +include "hashtable_func_helper.pxi" cdef class Factorizer: cdef public PyObjectHashTable table @@ -876,94 +112,9 @@ cdef class Int64Factorizer: self.count = len(self.uniques) return labels -ctypedef fused kh_scalar64: - kh_int64_t - kh_float64_t - -@cython.boundscheck(False) -cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values, - kh_scalar64 *table, bint dropna): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - sixty_four_bit_scalar val - int ret = 0 - - if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t: - with nogil: - kh_resize_float64(table, n) - - for i in range(n): - val = values[i] - if val == val or not dropna: - k = kh_get_float64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_float64(table, val, &ret) - table.vals[k] = 1 - elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t: - with nogil: - kh_resize_int64(table, n) - - for i in range(n): - val = values[i] - k = kh_get_int64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_int64(table, val, &ret) - table.vals[k] = 1 - else: - raise ValueError("Table type must match scalar type.") - - +@cython.wraparound(False) @cython.boundscheck(False) -cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): - cdef: - Py_ssize_t i - kh_float64_t *ftable - kh_int64_t *itable - sixty_four_bit_scalar[:] result_keys - int64_t[:] result_counts - int k - - i = 0 - - if sixty_four_bit_scalar is float64_t: - ftable = kh_init_float64() - build_count_table_scalar64(values, ftable, dropna) - - result_keys = np.empty(ftable.n_occupied, dtype=np.float64) - result_counts = np.zeros(ftable.n_occupied, dtype=np.int64) - - with nogil: - for k in range(ftable.n_buckets): - if kh_exist_float64(ftable, k): - result_keys[i] = ftable.keys[k] - 
result_counts[i] = ftable.vals[k] - i += 1 - kh_destroy_float64(ftable) - - elif sixty_four_bit_scalar is int64_t: - itable = kh_init_int64() - build_count_table_scalar64(values, itable, dropna) - - result_keys = np.empty(itable.n_occupied, dtype=np.int64) - result_counts = np.zeros(itable.n_occupied, dtype=np.int64) - - with nogil: - for k in range(itable.n_buckets): - if kh_exist_int64(itable, k): - result_keys[i] = itable.keys[k] - result_counts[i] = itable.vals[k] - i += 1 - kh_destroy_int64(itable) - - return np.asarray(result_keys), np.asarray(result_counts) - - cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, kh_pymap_t *table): @@ -987,6 +138,8 @@ cdef build_count_table_object(ndarray[object] values, table.vals[k] = 1 +@cython.wraparound(False) +@cython.boundscheck(False) cpdef value_count_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): cdef: @@ -1010,6 +163,8 @@ cpdef value_count_object(ndarray[object] values, return result_keys, result_counts +@cython.wraparound(False) +@cython.boundscheck(False) def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): cdef: int count, max_count = 2 @@ -1040,6 +195,7 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): return modes[:j+1] +@cython.wraparound(False) @cython.boundscheck(False) def mode_int64(int64_t[:] values): cdef: @@ -1051,7 +207,7 @@ def mode_int64(int64_t[:] values): table = kh_init_int64() - build_count_table_scalar64(values, table, 0) + build_count_table_int64(values, table, 0) modes = np.empty(table.n_buckets, dtype=np.int64) @@ -1074,6 +230,8 @@ def mode_int64(int64_t[:] values): return modes[:j+1] +@cython.wraparound(False) +@cython.boundscheck(False) def duplicated_object(ndarray[object] values, object keep='first'): cdef: Py_ssize_t i, n @@ -1114,92 +272,6 @@ def duplicated_object(ndarray[object] values, object keep='first'): return result.view(np.bool_) -@cython.wraparound(False) 
-@cython.boundscheck(False) -def duplicated_float64(ndarray[float64_t, ndim=1] values, - object keep='first'): - cdef: - int ret = 0, k - float64_t value - Py_ssize_t i, n = len(values) - kh_float64_t * table = kh_init_float64() - ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - - kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT)) - - if keep not in ('last', 'first', False): - raise ValueError('keep must be either "first", "last" or False') - - if keep == 'last': - with nogil: - for i from n > i >=0: - kh_put_float64(table, values[i], &ret) - out[i] = ret == 0 - elif keep == 'first': - with nogil: - for i from 0 <= i < n: - kh_put_float64(table, values[i], &ret) - out[i] = ret == 0 - else: - with nogil: - for i from 0 <= i < n: - value = values[i] - k = kh_get_float64(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 - else: - k = kh_put_float64(table, value, &ret) - table.keys[k] = value - table.vals[k] = i - out[i] = 0 - kh_destroy_float64(table) - return out - - -@cython.wraparound(False) -@cython.boundscheck(False) -def duplicated_int64(ndarray[int64_t, ndim=1] values, - object keep='first'): - cdef: - int ret = 0, k - int64_t value - Py_ssize_t i, n = len(values) - kh_int64_t * table = kh_init_int64() - ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - - kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) - - if keep not in ('last', 'first', False): - raise ValueError('keep must be either "first", "last" or False') - - if keep == 'last': - with nogil: - for i from n > i >=0: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 - elif keep == 'first': - with nogil: - for i from 0 <= i < n: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 - else: - with nogil: - for i from 0 <= i < n: - value = values[i] - k = kh_get_int64(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 - else: - k = kh_put_int64(table, value, &ret) - table.keys[k] = value - 
table.vals[k] = i - out[i] = 0 - kh_destroy_int64(table) - return out - - @cython.wraparound(False) @cython.boundscheck(False) def unique_label_indices(ndarray[int64_t, ndim=1] labels): @@ -1225,7 +297,7 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels): if needs_resize(ud): with gil: idx.resize() - append_data(ud, i) + append_data_int64(ud, i) kh_destroy_int64(table) diff --git a/pandas/src/hashtable_class_helper.pxi b/pandas/src/hashtable_class_helper.pxi new file mode 100644 index 0000000000000..da0c76aeca86f --- /dev/null +++ b/pandas/src/hashtable_class_helper.pxi @@ -0,0 +1,860 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + + +ctypedef struct Float64VectorData: + float64_t *data + size_t n, m + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_float64(Float64VectorData *data, + float64_t x) nogil: + + data.data[data.n] = x + data.n += 1 + + +ctypedef struct Int64VectorData: + int64_t *data + size_t n, m + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_int64(Int64VectorData *data, + int64_t x) nogil: + + data.data[data.n] = x + data.n += 1 + +ctypedef fused vector_data: + Int64VectorData + Float64VectorData + +cdef bint needs_resize(vector_data *data) nogil: + return data.n == data.m + +#---------------------------------------------------------------------- +# Vector +#---------------------------------------------------------------------- + +cdef class Float64Vector: + + cdef: + Float64VectorData *data + ndarray ao + + def __cinit__(self): + self.data = PyMem_Malloc( + sizeof(Float64VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, 
dtype=np.float64) + self.data.data = self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + self.ao.resize(self.data.n) + self.data.m = self.data.n + return self.ao + + cdef inline void append(self, float64_t x): + + if needs_resize(self.data): + self.resize() + + append_data_float64(self.data, x) + +cdef class Int64Vector: + + cdef: + Int64VectorData *data + ndarray ao + + def __cinit__(self): + self.data = PyMem_Malloc( + sizeof(Int64VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.int64) + self.data.data = self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + self.ao.resize(self.data.n) + self.data.m = self.data.n + return self.ao + + cdef inline void append(self, int64_t x): + + if needs_resize(self.data): + self.resize() + + append_data_int64(self.data, x) + + +cdef class ObjectVector: + + cdef: + PyObject **data + size_t n, m + ndarray ao + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=object) + self.data = self.ao.data + + def __len__(self): + return self.n + + cdef inline append(self, object o): + if self.n == self.m: + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m) + self.data = self.ao.data + + Py_INCREF(o) + self.data[self.n] = o + self.n += 1 + + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + + +#---------------------------------------------------------------------- +# HashTable 
+#---------------------------------------------------------------------- + + +cdef class HashTable: + pass + +cdef class Float64HashTable(HashTable): + + def __cinit__(self, size_hint=1): + self.table = kh_init_float64() + if size_hint is not None: + kh_resize_float64(self.table, size_hint) + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_float64(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_float64(self.table, key) + return k != self.table.n_buckets + + cpdef get_item(self, float64_t val): + cdef khiter_t k + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, float64_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, float64_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_float64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_float64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def map(self, float64_t[:] keys, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t key + khiter_t k + + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_float64(self.table, key, &ret) + self.table.vals[k] = values[i] + + @cython.boundscheck(False) + def map_locations(self, ndarray[float64_t, ndim=1] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, float64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + int64_t[:] locs = np.empty(n, 
dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def factorize(self, float64_t[:] values): + uniques = Float64Vector() + labels = self.get_labels(values, uniques, 0, -1)  # NaN is labelled -1 so it cannot collide with the first unique (label 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def get_labels(self, float64_t[:] values, Float64Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + float64_t val + khiter_t k + Float64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if check_null and val != val: + labels[i] = na_sentinel + continue + + k = kh_get_float64(self.table, val) + + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def get_labels_groupby(self, float64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = 0 + int ret = 0 + float64_t val + khiter_t k + Float64Vector uniques = Float64Vector() + Float64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() +
append_data_float64(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + + @cython.boundscheck(False) + def unique(self, float64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + bint seen_na = 0 + Float64Vector uniques = Float64Vector() + Float64VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_float64(self.table, val) + if k == self.table.n_buckets: + kh_put_float64(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, val) + elif not seen_na: + seen_na = 1 + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, NAN) + + return uniques.to_array() + +cdef class Int64HashTable(HashTable): + + def __cinit__(self, size_hint=1): + self.table = kh_init_int64() + if size_hint is not None: + kh_resize_int64(self.table, size_hint) + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_int64(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_int64(self.table, key) + return k != self.table.n_buckets + + cpdef get_item(self, int64_t val): + cdef khiter_t k + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, int64_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, int64_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_int64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_int64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def map(self, int64_t[:] keys, int64_t[:] values): + cdef: + Py_ssize_t i, n 
= len(values) + int ret = 0 + int64_t key + khiter_t k + + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_int64(self.table, key, &ret) + self.table.vals[k] = values[i] + + @cython.boundscheck(False) + def map_locations(self, ndarray[int64_t, ndim=1] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def factorize(self, int64_t values): + uniques = Int64Vector() + labels = self.get_labels(values, uniques, 0, 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def get_labels(self, int64_t[:] values, Int64Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + int64_t val + khiter_t k + Int64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if check_null and val == iNaT: + labels[i] = na_sentinel + continue + + k = kh_get_int64(self.table, val) + + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_int64(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def get_labels_groupby(self, int64_t[:] values): + cdef: + Py_ssize_t 
i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = 0 + int ret = 0 + int64_t val + khiter_t k + Int64Vector uniques = Int64Vector() + Int64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_int64(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + + @cython.boundscheck(False) + def unique(self, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + bint seen_na = 0 + Int64Vector uniques = Int64Vector() + Int64VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + k = kh_get_int64(self.table, val) + if k == self.table.n_buckets: + kh_put_int64(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_int64(ud, val) + + return uniques.to_array() + + +cdef class StringHashTable(HashTable): + cdef kh_str_t *table + + def __cinit__(self, int size_hint=1): + self.table = kh_init_str() + if size_hint is not None: + kh_resize_str(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_str(self.table) + + cpdef get_item(self, object val): + cdef khiter_t k + k = kh_get_str(self.table, util.get_c_string(val)) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + for i in range(iterations): + k = kh_get_str(self.table, util.get_c_string(key)) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object 
key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + buf = util.get_c_string(key) + + k = kh_put_str(self.table, buf, &ret) + self.table.keys[k] = key + if kh_exist_str(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def get_indexer(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + char *buf + int64_t *resbuf = labels.data + khiter_t k + kh_str_t *table = self.table + + for i in range(n): + buf = util.get_c_string(values[i]) + k = kh_get_str(table, buf) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + return labels + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + char *buf + khiter_t k + ObjectVector uniques = ObjectVector() + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k == self.table.n_buckets: + kh_put_str(self.table, buf, &ret) + uniques.append(val) + + return uniques.to_array() + + def factorize(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + object val + char *buf + khiter_t k + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + count += 1 + + return reverse, labels + + +na_sentinel = object + +cdef class PyObjectHashTable(HashTable): + + def __init__(self, size_hint=1): + self.table = kh_init_pymap() + kh_resize_pymap(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + self.destroy() + + def __len__(self): + 
return self.table.size + + def __contains__(self, object key): + cdef khiter_t k + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_get_pymap(self.table, key) + return k != self.table.n_buckets + + def destroy(self): + kh_destroy_pymap(self.table) + self.table = NULL + + cpdef get_item(self, object val): + cdef khiter_t k + if val != val or val is None: + val = na_sentinel + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + if key != key or key is None: + key = na_sentinel + for i in range(iterations): + k = kh_get_pymap(self.table, key) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_put_pymap(self.table, key, &ret) + # self.table.keys[k] = key + if kh_exist_pymap(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector 
uniques = ObjectVector() + bint seen_na = 0 + + for i in range(n): + val = values[i] + hash(val) + if not _checknan(val): + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + elif not seen_na: + seen_na = 1 + uniques.append(nan) + + return uniques.to_array() + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + object val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + + if check_null and val != val or val is None: + labels[i] = na_sentinel + continue + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return np.asarray(labels) \ No newline at end of file diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in new file mode 100644 index 0000000000000..14e5363eee20c --- /dev/null +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -0,0 +1,642 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + +{{py: + +# name, dtype +dtypes = [('Float64', 'float64'), ('Int64', 'int64')] + +}} + +{{for name, dtype in dtypes}} + + +ctypedef struct {{name}}VectorData: + {{dtype}}_t *data + size_t n, m + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_{{dtype}}({{name}}VectorData *data, + {{dtype}}_t x) nogil: + + data.data[data.n] 
= x + data.n += 1 + +{{endfor}} + +ctypedef fused vector_data: + Int64VectorData + Float64VectorData + +cdef bint needs_resize(vector_data *data) nogil: + return data.n == data.m + +#---------------------------------------------------------------------- +# Vector +#---------------------------------------------------------------------- + +{{py: + +# name, dtype +dtypes = [('Float64', 'float64'), ('Int64', 'int64')] + +}} + +{{for name, dtype in dtypes}} + +cdef class {{name}}Vector: + + cdef: + {{name}}VectorData *data + ndarray ao + + def __cinit__(self): + self.data = <{{name}}VectorData *>PyMem_Malloc( + sizeof({{name}}VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) + self.data.data = <{{dtype}}_t*> self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = <{{dtype}}_t*> self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + self.ao.resize(self.data.n) + self.data.m = self.data.n + return self.ao + + cdef inline void append(self, {{dtype}}_t x): + + if needs_resize(self.data): + self.resize() + + append_data_{{dtype}}(self.data, x) + +{{endfor}} + + +cdef class ObjectVector: + + cdef: + PyObject **data + size_t n, m + ndarray ao + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=object) + self.data = self.ao.data + + def __len__(self): + return self.n + + cdef inline append(self, object o): + if self.n == self.m: + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m) + self.data = self.ao.data + + Py_INCREF(o) + self.data[self.n] = o + self.n += 1 + + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + + +#---------------------------------------------------------------------- +# HashTable 
+#---------------------------------------------------------------------- + + +cdef class HashTable: + pass + +{{py: + +# name, dtype, null_condition, float_group +dtypes = [('Float64', 'float64', 'val != val', True), + ('Int64', 'int64', 'val == iNaT', False)] + +}} + + +{{for name, dtype, null_condition, float_group in dtypes}} + +cdef class {{name}}HashTable(HashTable): + + def __cinit__(self, size_hint=1): + self.table = kh_init_{{dtype}}() + if size_hint is not None: + kh_resize_{{dtype}}(self.table, size_hint) + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_{{dtype}}(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_{{dtype}}(self.table, key) + return k != self.table.n_buckets + + cpdef get_item(self, {{dtype}}_t val): + cdef khiter_t k + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, {{dtype}}_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_{{dtype}}(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_{{dtype}}(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def map(self, {{dtype}}_t[:] keys, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t key + khiter_t k + + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_{{dtype}}(self.table, key, &ret) + self.table.vals[k] = values[i] + + @cython.boundscheck(False) + def map_locations(self, ndarray[{{dtype}}_t, ndim=1] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + + with nogil: + for i in range(n): + val = values[i] + k = 
kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, {{dtype}}_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def factorize(self, {{dtype}}_t values): + uniques = {{name}}Vector() + labels = self.get_labels(values, uniques, 0, 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if check_null and {{null_condition}}: + labels[i] = na_sentinel + continue + + k = kh_get_{{dtype}}(self.table, val) + + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def get_labels_groupby(self, {{dtype}}_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = 0 + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + 
continue + + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + + @cython.boundscheck(False) + def unique(self, {{dtype}}_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + bint seen_na = 0 + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + {{if float_group}} + if val == val: + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + elif not seen_na: + seen_na = 1 + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, NAN) + {{else}} + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + {{endif}} + + return uniques.to_array() + +{{endfor}} + + +cdef class StringHashTable(HashTable): + cdef kh_str_t *table + + def __cinit__(self, int size_hint=1): + self.table = kh_init_str() + if size_hint is not None: + kh_resize_str(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_str(self.table) + + cpdef get_item(self, object val): + cdef khiter_t k + k = kh_get_str(self.table, util.get_c_string(val)) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + for i in range(iterations): + k = kh_get_str(self.table, util.get_c_string(key)) 
+ if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + buf = util.get_c_string(key) + + k = kh_put_str(self.table, buf, &ret) + self.table.keys[k] = key + if kh_exist_str(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def get_indexer(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + char *buf + int64_t *resbuf = labels.data + khiter_t k + kh_str_t *table = self.table + + for i in range(n): + buf = util.get_c_string(values[i]) + k = kh_get_str(table, buf) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + return labels + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + char *buf + khiter_t k + ObjectVector uniques = ObjectVector() + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k == self.table.n_buckets: + kh_put_str(self.table, buf, &ret) + uniques.append(val) + + return uniques.to_array() + + def factorize(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + object val + char *buf + khiter_t k + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + count += 1 + + return reverse, labels + + +na_sentinel = object + +cdef class PyObjectHashTable(HashTable): + + def __init__(self, size_hint=1): + self.table = kh_init_pymap() + kh_resize_pymap(self.table, size_hint) + + def 
__dealloc__(self): + if self.table is not NULL: + self.destroy() + + def __len__(self): + return self.table.size + + def __contains__(self, object key): + cdef khiter_t k + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_get_pymap(self.table, key) + return k != self.table.n_buckets + + def destroy(self): + kh_destroy_pymap(self.table) + self.table = NULL + + cpdef get_item(self, object val): + cdef khiter_t k + if val != val or val is None: + val = na_sentinel + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + if key != key or key is None: + key = na_sentinel + for i in range(iterations): + k = kh_get_pymap(self.table, key) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_put_pymap(self.table, key, &ret) + # self.table.keys[k] = key + if kh_exist_pymap(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def unique(self, ndarray[object] values): + 
cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector uniques = ObjectVector() + bint seen_na = 0 + + for i in range(n): + val = values[i] + hash(val) + if not _checknan(val): + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + elif not seen_na: + seen_na = 1 + uniques.append(nan) + + return uniques.to_array() + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + object val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + + if check_null and val != val or val is None: + labels[i] = na_sentinel + continue + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return np.asarray(labels) \ No newline at end of file diff --git a/pandas/src/hashtable_func_helper.pxi b/pandas/src/hashtable_func_helper.pxi new file mode 100644 index 0000000000000..d05b81acc5dd5 --- /dev/null +++ b/pandas/src/hashtable_func_helper.pxi @@ -0,0 +1,197 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef build_count_table_float64(float64_t[:] values, + kh_float64_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + float64_t val + int ret = 0 + + with nogil: + kh_resize_float64(table, n) 
+ + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_float64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_float64(table, val, &ret) + table.vals[k] = 1 + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef value_count_float64(float64_t[:] values, bint dropna): + cdef: + Py_ssize_t i=0 + kh_float64_t *table + float64_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_float64() + build_count_table_float64(values, table, dropna) + + result_keys = np.empty(table.n_occupied, dtype=np.float64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + + with nogil: + for k in range(table.n_buckets): + if kh_exist_float64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_float64(table) + + return np.asarray(result_keys), np.asarray(result_counts) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_float64(float64_t[:] values, + object keep='first'): + cdef: + int ret = 0, k + float64_t value + Py_ssize_t i, n = len(values) + kh_float64_t * table = kh_init_float64() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_float64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_float64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_float64(table) + return out + + +@cython.wraparound(False) 
+@cython.boundscheck(False) +cdef build_count_table_int64(int64_t[:] values, + kh_int64_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + int64_t val + int ret = 0 + + with nogil: + kh_resize_int64(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_int64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_int64(table, val, &ret) + table.vals[k] = 1 + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef value_count_int64(int64_t[:] values, bint dropna): + cdef: + Py_ssize_t i=0 + kh_int64_t *table + int64_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_int64() + build_count_table_int64(values, table, dropna) + + result_keys = np.empty(table.n_occupied, dtype=np.int64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + + with nogil: + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_int64(table) + + return np.asarray(result_keys), np.asarray(result_counts) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_int64(int64_t[:] values, + object keep='first'): + cdef: + int ret = 0, k + int64_t value + Py_ssize_t i, n = len(values) + kh_int64_t * table = kh_init_int64() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_int64(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_int64(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_int64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 
+ else: + k = kh_put_int64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_int64(table) + return out diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in new file mode 100644 index 0000000000000..1840b914f3328 --- /dev/null +++ b/pandas/src/hashtable_func_helper.pxi.in @@ -0,0 +1,114 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + +{{py: + +# name +dtypes = ['float64', 'int64'] + +}} + +{{for dtype in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, + kh_{{dtype}}_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + {{dtype}}_t val + int ret = 0 + + with nogil: + kh_resize_{{dtype}}(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_{{dtype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{dtype}}(table, val, &ret) + table.vals[k] = 1 + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna): + cdef: + Py_ssize_t i=0 + kh_{{dtype}}_t *table + {{dtype}}_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_{{dtype}}() + build_count_table_{{dtype}}(values, table, dropna) + + result_keys = np.empty(table.n_occupied, dtype=np.{{dtype}}) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + + with nogil: + for k in range(table.n_buckets): + if kh_exist_{{dtype}}(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_{{dtype}}(table) + + return np.asarray(result_keys), np.asarray(result_counts) + + 
+@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_{{dtype}}({{dtype}}_t[:] values, + object keep='first'): + cdef: + int ret = 0, k + {{dtype}}_t value + Py_ssize_t i, n = len(values) + kh_{{dtype}}_t * table = kh_init_{{dtype}}() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_{{dtype}}(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_{{dtype}}(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_{{dtype}}(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_{{dtype}}(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_{{dtype}}(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_{{dtype}}(table) + return out + +{{endfor}} diff --git a/setup.py b/setup.py index 86777f5579a09..e81cae633427d 100755 --- a/setup.py +++ b/setup.py @@ -107,7 +107,8 @@ def is_platform_mac(): _pxipath = pjoin('pandas', 'src') _pxifiles = ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', - 'algos_join_helper.pxi.in', 'algos_take_helper.pxi.in'] + 'algos_join_helper.pxi.in', 'algos_take_helper.pxi.in', + 'hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in'] class build_ext(_build_ext):