Skip to content

ENH: add data hashing routines #14729

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 28, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from pandas.util import testing as tm


class algorithm(object):
Expand Down Expand Up @@ -55,3 +56,35 @@ def time_add_overflow_neg_arr(self):

def time_add_overflow_mixed_arr(self):
    """Benchmark ``self.checked_add`` on ``self.arr`` and ``self.arrmixed``."""
    lhs, rhs = self.arr, self.arrmixed
    self.checked_add(lhs, rhs)


class hashing(object):
    """asv benchmarks for the pandas data-hashing routines.

    ``setup`` builds one 100000-row frame with string, float, integer,
    datetime, timedelta and categorical columns; each ``time_*`` method
    hashes either the whole frame or a single column of it.
    """
    goal_time = 0.2

    def setup(self):
        """Build the benchmark frame and bind the hashing entry point."""
        # DataFrame / Series do not expose a ``hash`` method; the public
        # entry point is the module-level ``hash_pandas_object``.  It is
        # imported here (not in the timed methods) so that the import cost
        # never shows up in the measurements.
        from pandas.tools.hashing import hash_pandas_object
        self.hash_pandas_object = hash_pandas_object

        N = 100000

        self.df = pd.DataFrame(
            {'A': pd.Series(tm.makeStringIndex(100).take(
                np.random.randint(0, 100, size=N))),
             'B': pd.Series(tm.makeStringIndex(10000).take(
                 np.random.randint(0, 10000, size=N))),
             'D': np.random.randn(N),
             'E': np.arange(N),
             'F': pd.date_range('20110101', freq='s', periods=N),
             'G': pd.timedelta_range('1 day', freq='s', periods=N),
             })
        self.df['C'] = self.df['B'].astype('category')
        # a block of missing values so the NaN paths are exercised too
        self.df.iloc[10:20] = np.nan

    def time_frame(self):
        """Hash the whole mixed-dtype frame."""
        self.hash_pandas_object(self.df)

    def time_series_int(self):
        """Hash the column built from ``np.arange`` ('E')."""
        self.hash_pandas_object(self.df.E)

    def time_series_string(self):
        """Hash the string column ('B')."""
        self.hash_pandas_object(self.df.B)

    def time_series_categorical(self):
        """Hash the categorical column ('C')."""
        self.hash_pandas_object(self.df.C)
180 changes: 180 additions & 0 deletions pandas/src/hash.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# cython: profile=False
# Translated from the reference implementation
# at https://github.com/veorq/SipHash

import cython
cimport numpy as cnp
import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t

from cpython cimport (PyString_Check,
PyBytes_Check,
PyUnicode_Check)
from libc.stdlib cimport malloc, free

DEF cROUNDS = 2
DEF dROUNDS = 4


@cython.boundscheck(False)
def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
    """
    Hash every element of an object array with SipHash.

    Each element is converted to bytes (strings are encoded with
    ``encoding``; non-strings go through ``str(val)`` first) and the
    bytes are hashed with ``low_level_siphash`` using the encoded ``key``.

    Parameters
    ----------
    arr : 1-d object ndarray of objects
    key : hash key, must be 16 byte len encoded
    encoding : encoding for key & arr, default to 'utf8'

    Returns
    -------
    1-d uint64 ndarray of hashes

    Raises
    ------
    ValueError
        if the encoded key is not exactly 16 bytes long
    MemoryError
        if the temporary pointer / length buffers cannot be allocated

    """
    cdef:
        Py_ssize_t i, n
        ndarray[uint64_t] result
        bytes data, k
        uint8_t *kb
        uint64_t *lens
        char **vecs
        char *cdata
        object val
        list datas

    k = <bytes>key.encode(encoding)
    kb = <uint8_t *>k
    if len(k) != 16:
        raise ValueError(
            'key should be a 16-byte string encoded, got {!r} (len {})'.format(
                k, len(k)))

    n = len(arr)

    # One data pointer and one length per element.  Lengths are stored as
    # 64-bit integers: an 8-bit slot would wrap for values longer than 255
    # bytes, so only a prefix of such values would be hashed.
    vecs = <char **> malloc(n * sizeof(char *))
    lens = <uint64_t *> malloc(n * sizeof(uint64_t))

    # try/finally so the buffers are released even when encoding raises
    try:
        # malloc(0) may legitimately return NULL, hence the n > 0 guard
        if n > 0 and (vecs == NULL or lens == NULL):
            raise MemoryError()

        datas = []
        for i in range(n):
            val = arr[i]
            if PyString_Check(val):
                data = <bytes>val.encode(encoding)
            elif PyBytes_Check(val):
                data = <bytes>val
            elif PyUnicode_Check(val):
                data = <bytes>val.encode(encoding)
            else:
                # non-strings
                data = <bytes>str(val).encode(encoding)

            lens[i] = <uint64_t>len(data)
            cdata = data

            # keep the reference alive through the end of the
            # function, so the raw pointer stored in vecs stays valid
            datas.append(data)
            vecs[i] = cdata

        result = np.empty(n, dtype=np.uint64)
        with nogil:
            for i in range(n):
                result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
    finally:
        # free(NULL) is a no-op, so this is safe on every path
        free(vecs)
        free(lens)

    return result

cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
    # Rotate the 64-bit word ``x`` left by ``b`` bits: the bits shifted
    # out at the top are OR-ed back in at the bottom.
    cdef uint64_t upper = x << b
    cdef uint64_t wrapped = x >> (64 - b)
    return upper | wrapped

cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
    # Store the 32-bit value ``v`` into p[0..3], least significant
    # byte first (little-endian).
    cdef int idx
    for idx in range(4):
        p[idx] = <uint8_t>(v >> (8 * idx))

cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
    # Store the 64-bit value ``v`` into p[0..7], least significant
    # byte first (little-endian).
    cdef int idx
    for idx in range(8):
        p[idx] = <uint8_t>(v >> (8 * idx))

cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
    # Assemble p[0..7] into one 64-bit word, treating p[0] as the least
    # significant byte (little-endian load).
    cdef uint64_t word = 0
    cdef int idx
    for idx in range(8):
        word |= (<uint64_t>p[idx]) << (8 * idx)
    return word

cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
                           uint64_t* v2, uint64_t* v3) nogil:
    # Apply one SipRound (add / rotate / xor mixing step) to the four
    # 64-bit state words.  The words are loaded into locals, mixed, and
    # written back through the pointers at the end.
    cdef uint64_t a = v0[0]
    cdef uint64_t b = v1[0]
    cdef uint64_t c = v2[0]
    cdef uint64_t d = v3[0]

    a += b
    b = _rotl(b, 13) ^ a
    a = _rotl(a, 32)

    c += d
    d = _rotl(d, 16) ^ c

    a += d
    d = _rotl(d, 21) ^ a

    c += b
    b = _rotl(b, 17) ^ c
    c = _rotl(c, 32)

    v0[0] = a
    v1[0] = b
    v2[0] = c
    v3[0] = d

cpdef uint64_t siphash(bytes data, bytes key) except? 0:
    """
    Return the 64-bit SipHash of ``data`` under the 16-byte ``key``.

    Raises
    ------
    ValueError
        if ``key`` is not exactly 16 bytes long
    """
    cdef Py_ssize_t keylen = len(key)
    if keylen == 16:
        return low_level_siphash(data, len(data), key)
    raise ValueError(
        'key should be a 16-byte bytestring, got {!r} (len {})'.format(
            key, keylen))


@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
                                uint8_t* key) nogil:
    # SipHash with cROUNDS compression rounds and dROUNDS finalization
    # rounds over ``datalen`` bytes at ``data``.  ``key`` is read as two
    # little-endian 64-bit words (16 bytes).  Returns the 64-bit digest.
    cdef uint64_t k0, k1, v0, v1, v2, v3, word, tail
    cdef int rnd, pos
    # number of trailing bytes that do not fill a whole 8-byte word
    cdef int left = datalen & 7
    # one past the last complete 8-byte word
    cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))

    k0 = u8to64_le(key)
    k1 = u8to64_le(key + 8)

    # initial state: the fixed constants xor-ed with the key words
    v0 = 0x736f6d6570736575ULL ^ k0
    v1 = 0x646f72616e646f6dULL ^ k1
    v2 = 0x6c7967656e657261ULL ^ k0
    v3 = 0x7465646279746573ULL ^ k1

    # compression: absorb the input one 8-byte word at a time
    while data != end:
        word = u8to64_le(data)
        data += sizeof(uint64_t)

        v3 ^= word
        for rnd in range(cROUNDS):
            _sipround(&v0, &v1, &v2, &v3)
        v0 ^= word

    # final word: the message length in the top byte, the leftover
    # input bytes (little-endian) in the low bytes
    tail = (<uint64_t>datalen) << 56
    for pos in range(left):
        tail |= (<uint64_t>data[pos]) << (pos * 8)

    v3 ^= tail
    for rnd in range(cROUNDS):
        _sipround(&v0, &v1, &v2, &v3)
    v0 ^= tail

    # finalization
    v2 ^= 0xff
    for rnd in range(dROUNDS):
        _sipround(&v0, &v1, &v2, &v3)

    return v0 ^ v1 ^ v2 ^ v3
137 changes: 137 additions & 0 deletions pandas/tools/hashing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""
data hashing routines for pandas / numpy objects
"""

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas.lib import infer_dtype
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import is_categorical_dtype

# 16 byte long hashing key
_default_hash_key = '0123456789123456'


def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
    """
    Return a data hash of the Index/Series/DataFrame

    .. versionadded:: 0.19.2

    Parameters
    ----------
    obj : Index, Series or DataFrame
        the object to hash
    index : boolean, default True
        include the index in the hash (if Series/DataFrame)
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    Series of uint64, same length as the object

    Raises
    ------
    TypeError
        if obj is not an Index, Series or DataFrame

    """
    if hash_key is None:
        hash_key = _default_hash_key

    def adder(h, hashed_to_add):
        # combine in place: h = 3 * h + hashed_to_add.  The multiplier is
        # an explicit uint64 so its width does not depend on the platform.
        h = np.multiply(h, np.uint64(3), h)
        return np.add(h, hashed_to_add, h)

    def index_hashes():
        # hashes of obj.index alone, as a plain uint64 ndarray
        return hash_pandas_object(obj.index,
                                  index=False,
                                  encoding=encoding,
                                  hash_key=hash_key).values

    if isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        h = Series(h, index=obj, dtype='uint64')
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        if index:
            h = adder(h, index_hashes())
        h = Series(h, index=obj.index, dtype='uint64')
    elif isinstance(obj, ABCDataFrame):
        cols = obj.iteritems()
        try:
            first_series = next(cols)[1]
        except StopIteration:
            # a frame without columns has no data to hash: start from
            # zeros instead of leaking a bare StopIteration to the caller
            h = np.zeros(len(obj.index), dtype='uint64')
        else:
            h = hash_array(first_series.values, encoding,
                           hash_key).astype('uint64')
        # fold the remaining columns in, left to right
        for _, col in cols:
            h = adder(h, hash_array(col.values, encoding, hash_key))
        if index:
            h = adder(h, index_hashes())

        h = Series(h, index=obj.index, dtype='uint64')
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h


def hash_array(vals, encoding='utf8', hash_key=None):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray (or Categorical, which is hashed through its codes)
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """
    if hash_key is None:
        hash_key = _default_hash_key

    # work with categoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early; forward encoding / hash_key so the recursive
    # calls run with the same settings as the outer call
    if np.issubdtype(vals.dtype, np.complex128):
        return (hash_array(vals.real, encoding, hash_key) +
                23 * hash_array(vals.imag, encoding, hash_key))

    # MAIN LOGIC:
    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
        np.issubdtype(vals.dtype, np.timedelta64) or
        np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

        # reinterpret the raw bytes as unsigned ints of the same width,
        # then widen to 64 bits (astype returns a fresh array)
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:

        # it's MUCH faster to categorize object dtypes, then hash and rename
        codes, categories = factorize(vals, sort=False)
        categories = Index(categories)
        c = Series(Categorical(codes, categories,
                               ordered=False, fastpath=True))
        vals = _hash.hash_object_array(categories.values,
                                       hash_key,
                                       encoding)

        # rename & extract
        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
Loading