diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 9807639143ddb..53b7d55368f6a 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+from pandas.util import testing as tm
 
 
 class algorithm(object):
@@ -55,3 +56,35 @@ def time_add_overflow_neg_arr(self):
 
     def time_add_overflow_mixed_arr(self):
         self.checked_add(self.arr, self.arrmixed)
+
+
+class hashing(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+
+        self.df = pd.DataFrame(
+            {'A': pd.Series(tm.makeStringIndex(100).take(
+                np.random.randint(0, 100, size=N))),
+             'B': pd.Series(tm.makeStringIndex(10000).take(
+                 np.random.randint(0, 10000, size=N))),
+             'D': np.random.randn(N),
+             'E': np.arange(N),
+             'F': pd.date_range('20110101', freq='s', periods=N),
+             'G': pd.timedelta_range('1 day', freq='s', periods=N),
+             })
+        self.df['C'] = self.df['B'].astype('category')
+        self.df.iloc[10:20] = np.nan
+
+    def time_frame(self):
+        self.df.hash()
+
+    def time_series_int(self):
+        self.df.E.hash()
+
+    def time_series_string(self):
+        self.df.B.hash()
+
+    def time_series_categorical(self):
+        self.df.C.hash()
diff --git a/pandas/src/hash.pyx b/pandas/src/hash.pyx
new file mode 100644
index 0000000000000..b8c309f1f7a13
--- /dev/null
+++ b/pandas/src/hash.pyx
@@ -0,0 +1,181 @@
+# cython: profile=False
+# Translated from the reference implementation
+# at https://github.com/veorq/SipHash
+
+import cython
+cimport numpy as cnp
+import numpy as np
+from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
+
+from cpython cimport (PyString_Check,
+                      PyBytes_Check,
+                      PyUnicode_Check)
+from libc.stdlib cimport malloc, free
+
+DEF cROUNDS = 2
+DEF dROUNDS = 4
+
+
+@cython.boundscheck(False)
+def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
+    """
+    Parameters
+    ----------
+    arr : 1-d ndarray of objects
+    key : hash key, must encode to 16 bytes
+    encoding : encoding for key & arr, defaults to 'utf8'
+
+    Returns
+    -------
+    1-d uint64 ndarray of hashes
+
+    """
+    cdef:
+        Py_ssize_t i, l, n
+        ndarray[uint64_t] result
+        bytes data, k
+        uint8_t *kb
+        uint64_t *lens
+        char **vecs, *cdata
+        object val
+
+    k = key.encode(encoding)
+    kb = <uint8_t *>k
+    if len(k) != 16:
+        raise ValueError(
+            'key should be a 16-byte string when encoded, '
+            'got {!r} (len {})'.format(k, len(k)))
+
+    n = len(arr)
+
+    # create an array of bytes
+    vecs = <char **>malloc(n * sizeof(char *))
+    lens = <uint64_t *>malloc(n * sizeof(uint64_t))
+
+    cdef list datas = []
+    for i in range(n):
+        val = arr[i]
+        if PyString_Check(val):
+            data = val.encode(encoding)
+        elif PyBytes_Check(val):
+            data = val
+        elif PyUnicode_Check(val):
+            data = val.encode(encoding)
+        else:
+            # non-strings
+            data = str(val).encode(encoding)
+
+        l = len(data)
+        lens[i] = l
+        cdata = data
+
+        # keep the reference alive through the end of the
+        # function
+        datas.append(data)
+        vecs[i] = cdata
+
+    result = np.empty(n, dtype=np.uint64)
+    with nogil:
+        for i in range(n):
+            result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
+
+    free(vecs)
+    free(lens)
+    return result
+
+cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
+    return (x << b) | (x >> (64 - b))
+
+cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
+    p[0] = <uint8_t>(v)
+    p[1] = <uint8_t>(v >> 8)
+    p[2] = <uint8_t>(v >> 16)
+    p[3] = <uint8_t>(v >> 24)
+
+cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
+    u32to8_le(p, v)
+    u32to8_le(p + 4, <uint32_t>(v >> 32))
+
+cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
+    return (<uint64_t>p[0] |
+            <uint64_t>p[1] << 8 |
+            <uint64_t>p[2] << 16 |
+            <uint64_t>p[3] << 24 |
+            <uint64_t>p[4] << 32 |
+            <uint64_t>p[5] << 40 |
+            <uint64_t>p[6] << 48 |
+            <uint64_t>p[7] << 56)
+
+cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
+                           uint64_t* v2, uint64_t* v3) nogil:
+    v0[0] += v1[0]
+    v1[0] = _rotl(v1[0], 13)
+    v1[0] ^= v0[0]
+    v0[0] = _rotl(v0[0], 32)
+    v2[0] += v3[0]
+    v3[0] = _rotl(v3[0], 16)
+    v3[0] ^= v2[0]
+    v0[0] += v3[0]
+    v3[0] = _rotl(v3[0], 21)
+    v3[0] ^= v0[0]
+    v2[0] += v1[0]
+    v1[0] = _rotl(v1[0], 17)
+    v1[0] ^= v2[0]
+    v2[0] = _rotl(v2[0], 32)
+
+cpdef uint64_t siphash(bytes data, bytes key) except? 0:
+    if len(key) != 16:
+        raise ValueError(
+            'key should be a 16-byte bytestring, got {!r} (len {})'.format(
+                key, len(key)))
+    return low_level_siphash(data, len(data), key)
+
+
+@cython.cdivision(True)
+cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
+                                uint8_t* key) nogil:
+    cdef uint64_t v0 = 0x736f6d6570736575ULL
+    cdef uint64_t v1 = 0x646f72616e646f6dULL
+    cdef uint64_t v2 = 0x6c7967656e657261ULL
+    cdef uint64_t v3 = 0x7465646279746573ULL
+    cdef uint64_t b
+    cdef uint64_t k0 = u8to64_le(key)
+    cdef uint64_t k1 = u8to64_le(key + 8)
+    cdef uint64_t m
+    cdef int i
+    cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
+    cdef int left = datalen & 7
+    cdef int left_byte
+
+    b = (<uint64_t>datalen) << 56
+    v3 ^= k1
+    v2 ^= k0
+    v1 ^= k1
+    v0 ^= k0
+
+    while (data != end):
+        m = u8to64_le(data)
+        v3 ^= m
+        for i in range(cROUNDS):
+            _sipround(&v0, &v1, &v2, &v3)
+        v0 ^= m
+
+        data += sizeof(uint64_t)
+
+    for i in range(left-1, -1, -1):
+        b |= (<uint64_t>data[i]) << (i * 8)
+
+    v3 ^= b
+
+    for i in range(cROUNDS):
+        _sipround(&v0, &v1, &v2, &v3)
+
+    v0 ^= b
+    v2 ^= 0xff
+
+    for i in range(dROUNDS):
+        _sipround(&v0, &v1, &v2, &v3)
+
+    b = v0 ^ v1 ^ v2 ^ v3
+
+    return b
diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
new file mode 100644
index 0000000000000..aa18b8bc70c37
--- /dev/null
+++ b/pandas/tools/hashing.py
@@ -0,0 +1,138 @@
+"""
+data hashing for pandas / numpy objects
+"""
+
+import numpy as np
+from pandas import _hash, Series, factorize, Categorical, Index
+from pandas.lib import infer_dtype
+from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
+from pandas.types.common import is_categorical_dtype
+
+# 16-byte hashing key
+_default_hash_key = '0123456789123456'
+
+
+def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
+    """
+    Return a data hash of the Index/Series/DataFrame
+
+    .. versionadded:: 0.19.2
+
+    Parameters
+    ----------
+    obj : Index, Series, or DataFrame
+    index : boolean, default True
+        include the index in the hash (if Series/DataFrame)
+    encoding : string, default 'utf8'
+        encoding for data & key when strings
+    hash_key : string key to encode, defaults to _default_hash_key
+
+    Returns
+    -------
+    Series of uint64, same length as the object
+
+    """
+    if hash_key is None:
+        hash_key = _default_hash_key
+
+    def adder(h, hashed_to_add):
+        h = np.multiply(h, np.uint(3), h)
+        return np.add(h, hashed_to_add, h)
+
+    if isinstance(obj, ABCIndexClass):
+        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
+        h = Series(h, index=obj, dtype='uint64')
+    elif isinstance(obj, ABCSeries):
+        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
+        if index:
+            h = adder(h, hash_pandas_object(obj.index,
+                                            index=False,
+                                            encoding=encoding,
+                                            hash_key=hash_key).values)
+        h = Series(h, index=obj.index, dtype='uint64')
+    elif isinstance(obj, ABCDataFrame):
+        cols = obj.iteritems()
+        first_series = next(cols)[1]
+        h = hash_array(first_series.values, encoding,
+                       hash_key).astype('uint64')
+        for _, col in cols:
+            h = adder(h, hash_array(col.values, encoding, hash_key))
+        if index:
+            h = adder(h, hash_pandas_object(obj.index,
+                                            index=False,
+                                            encoding=encoding,
+                                            hash_key=hash_key).values)
+
+        h = Series(h, index=obj.index, dtype='uint64')
+    else:
+        raise TypeError("Unexpected type for hashing %s" % type(obj))
+    return h
+
+
+def hash_array(vals, encoding='utf8', hash_key=None):
+    """
+    Given a 1d array, return an array of deterministic integers.
+
+    .. versionadded:: 0.19.2
+
+    Parameters
+    ----------
+    vals : ndarray
+    encoding : string, default 'utf8'
+        encoding for data & key when strings
+    hash_key : string key to encode, defaults to _default_hash_key
+
+    Returns
+    -------
+    1d uint64 numpy array of hash values, same length as vals
+
+    """
+
+    if hash_key is None:
+        hash_key = _default_hash_key
+
+    # work with categoricals as ints. (This check is above the complex
+    # check so that we don't ask numpy if categorical is a subdtype of
+    # complex, as it will choke.)
+    if is_categorical_dtype(vals.dtype):
+        vals = vals.codes
+
+    # we'll be working with everything as 64-bit values, so handle this
+    # 128-bit value early
+    if np.issubdtype(vals.dtype, np.complex128):
+        return hash_array(vals.real) + 23 * hash_array(vals.imag)
+
+    # MAIN LOGIC:
+    inferred = infer_dtype(vals)
+
+    # First, turn whatever array this is into unsigned 64-bit ints, if we can
+    # manage it.
+    if inferred == 'boolean':
+        vals = vals.astype('u8')
+
+    if (np.issubdtype(vals.dtype, np.datetime64) or
+            np.issubdtype(vals.dtype, np.timedelta64) or
+            np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
+
+        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
+    else:
+
+        # it's MUCH faster to categorize object dtypes, then hash and rename
+        codes, categories = factorize(vals, sort=False)
+        categories = Index(categories)
+        c = Series(Categorical(codes, categories,
+                               ordered=False, fastpath=True))
+        vals = _hash.hash_object_array(categories.values,
+                                       hash_key,
+                                       encoding)
+
+        # rename & extract
+        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values
+
+    # Then, redistribute these 64-bit ints within the space of 64-bit ints
+    vals ^= vals >> 30
+    vals *= np.uint64(0xbf58476d1ce4e5b9)
+    vals ^= vals >> 27
+    vals *= np.uint64(0x94d049bb133111eb)
+    vals ^= vals >> 31
+    return vals
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
new file mode 100644
index 0000000000000..3e4c77244d2f7
--- /dev/null
+++ b/pandas/tools/tests/test_hashing.py
@@ -0,0 +1,143 @@
+import numpy as np
+import pandas as pd
+
+from pandas import DataFrame, Series, Index
+from pandas.tools.hashing import hash_array, hash_pandas_object
+import pandas.util.testing as tm
+
+
+class TestHashing(tm.TestCase):
+
+    _multiprocess_can_split_ = True
+
+    def setUp(self):
+        self.df = DataFrame(
+            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
+             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
+             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
+             'obj': Series(['d', 'e', 'f'] * 3),
+             'bool': np.array([True, False, True] * 3),
+             'dt': Series(pd.date_range('20130101', periods=9)),
+             'dt_tz': Series(pd.date_range('20130101', periods=9,
+                                           tz='US/Eastern')),
+             'td': Series(pd.timedelta_range('2000', periods=9))})
+
+    def test_consistency(self):
+        # check that our hash doesn't change because of a mistake
+        # in the actual code; this is the ground truth
+        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
+        expected = Series(np.array([3600424527151052760, 1374399572096150070,
+                                    477881037637427054], dtype='uint64'),
+                          index=['foo', 'bar', 'baz'])
+        tm.assert_series_equal(result, expected)
+
+    def test_hash_array(self):
+        for name, s in self.df.iteritems():
+            a = s.values
+            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
+
+    def check_equal(self, obj, **kwargs):
+        a = hash_pandas_object(obj, **kwargs)
+        b = hash_pandas_object(obj, **kwargs)
+        tm.assert_series_equal(a, b)
+
+        kwargs.pop('index', None)
+        a = hash_pandas_object(obj, **kwargs)
+        b = hash_pandas_object(obj, **kwargs)
+        tm.assert_series_equal(a, b)
+
+    def check_not_equal_with_index(self, obj):
+
+        # check that we are not hashing the same if
+        # we include the index
+        if not isinstance(obj, Index):
+            a = hash_pandas_object(obj, index=True)
+            b = hash_pandas_object(obj, index=False)
+            self.assertFalse((a == b).all())
+
+    def test_hash_pandas_object(self):
+
+        for obj in [Series([1, 2, 3]),
+                    Series([1.0, 1.5, 3.2]),
+                    Series([1.0, 1.5, np.nan]),
+                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
+                    Series(['a', 'b', 'c']),
+                    Series(['a', np.nan, 'c']),
+                    Series([True, False, True]),
+                    Index([1, 2, 3]),
+                    Index([True, False, True]),
+                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
+                    tm.makeMissingDataframe(),
+                    tm.makeMixedDataFrame(),
+                    tm.makeTimeDataFrame(),
+                    tm.makeTimeSeries(),
+                    tm.makeTimedeltaIndex(),
+                    Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
+                        [('a', 1), ('a', 2), ('b', 1)]))]:
+            self.check_equal(obj)
+            self.check_not_equal_with_index(obj)
+
+    def test_hash_pandas_object2(self):
+        for name, s in self.df.iteritems():
+            self.check_equal(s)
+            self.check_not_equal_with_index(s)
+
+    def test_hash_pandas_empty_object(self):
+        for obj in [Series([], dtype='float64'),
+                    Series([], dtype='object'),
+                    Index([])]:
+            self.check_equal(obj)
+
+            # these are by definition the same with
+            # or without the index as the data is empty
+
+    def test_errors(self):
+
+        for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
+            def f():
+                hash_pandas_object(obj)
+
+            self.assertRaises(TypeError, f)
+
+    def test_hash_keys(self):
+        # using different hash keys, should have different hashes
+        # for the same data
+
+        # this only matters for object dtypes
+        obj = Series(list('abc'))
+        a = hash_pandas_object(obj, hash_key='9876543210123456')
+        b = hash_pandas_object(obj, hash_key='9876543210123465')
+        self.assertTrue((a != b).all())
+
+    def test_invalid_key(self):
+        # this only matters for object dtypes
+        def f():
+            hash_pandas_object(Series(list('abc')), hash_key='foo')
+        self.assertRaises(ValueError, f)
+
+    def test_mixed(self):
+        # mixed objects
+        obj = Series(['1', 2, 3])
+        self.check_equal(obj)
+        self.check_not_equal_with_index(obj)
+
+        # mixed are actually equal when stringified
+        a = hash_pandas_object(obj)
+        b = hash_pandas_object(Series(list('123')))
+        self.assert_series_equal(a, b)
+
+    def test_already_encoded(self):
+        # if already encoded then ok
+
+        obj = Series(list('abc')).str.encode('utf8')
+        self.check_equal(obj)
+
+    def test_alternate_encoding(self):
+
+        obj = Series(list('abc'))
+        self.check_equal(obj, encoding='ascii')
+
+    def test_long_strings(self):
+
+        obj = Index(tm.rands_array(nchars=10000, size=100))
+        self.check_equal(obj)
diff --git a/setup.py b/setup.py
index 2dd3fec150781..8d2e2669852ea 100755
--- a/setup.py
+++ b/setup.py
@@ -331,6 +331,7 @@ class CheckSDist(sdist_class):
                  'pandas/src/period.pyx',
                  'pandas/src/sparse.pyx',
                  'pandas/src/testing.pyx',
+                 'pandas/src/hash.pyx',
                  'pandas/io/sas/saslib.pyx']
 
     def initialize_options(self):
@@ -501,10 +502,12 @@ def pxd(name):
               'sources': ['pandas/src/parser/tokenizer.c',
                           'pandas/src/parser/io.c']},
    _sparse={'pyxfile': 'src/sparse',
-            'depends': ([srcpath('sparse', suffix='.pyx')]
-                        + _pxi_dep['_sparse'])},
+            'depends': ([srcpath('sparse', suffix='.pyx')] +
+                        _pxi_dep['_sparse'])},
    _testing={'pyxfile': 'src/testing',
              'depends': [srcpath('testing', suffix='.pyx')]},
+   _hash={'pyxfile': 'src/hash',
+          'depends': [srcpath('hash', suffix='.pyx')]},
 )
 
 ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'}
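
Usage sketch (not part of the patch): the public entry points this diff adds are
pandas.tools.hashing.hash_pandas_object and hash_array. A minimal example of the
intended behaviour, assuming the patch has been built and installed:

    import numpy as np
    import pandas as pd
    from pandas.tools.hashing import hash_array, hash_pandas_object

    s = pd.Series(['a', 'b', 'c'])

    # hashing is deterministic: the same data always yields the same uint64s
    assert hash_pandas_object(s).equals(hash_pandas_object(s))

    # by default the index participates in the hash; it can be excluded
    with_index = hash_pandas_object(s, index=True)
    without_index = hash_pandas_object(s, index=False)
    assert not (with_index == without_index).all()

    # hash_array works on a bare ndarray and returns an ndarray of uint64
    print(hash_array(np.array([1, 2, 3], dtype='int64')))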
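
For review, here is a pure-Python transliteration of low_level_siphash above
(SipHash-2-4, following the reference at https://github.com/veorq/SipHash). It is
a sketch for sanity-checking the Cython version, not part of the patch; the names
siphash24, sipround, and MASK are illustrative, and Python 3 bytes semantics
(indexing yields ints) are assumed:

    import struct

    MASK = (1 << 64) - 1  # emulate uint64 wraparound

    def _rotl(x, b):
        return ((x << b) | (x >> (64 - b))) & MASK

    def sipround(v0, v1, v2, v3):
        # one SipRound, mirroring _sipround in hash.pyx
        v0 = (v0 + v1) & MASK
        v1 = _rotl(v1, 13)
        v1 ^= v0
        v0 = _rotl(v0, 32)
        v2 = (v2 + v3) & MASK
        v3 = _rotl(v3, 16)
        v3 ^= v2
        v0 = (v0 + v3) & MASK
        v3 = _rotl(v3, 21)
        v3 ^= v0
        v2 = (v2 + v1) & MASK
        v1 = _rotl(v1, 17)
        v1 ^= v2
        v2 = _rotl(v2, 32)
        return v0, v1, v2, v3

    def siphash24(data, key):
        assert isinstance(data, bytes) and isinstance(key, bytes)
        assert len(key) == 16
        k0, k1 = struct.unpack('<QQ', key)
        v0 = 0x736f6d6570736575 ^ k0
        v1 = 0x646f72616e646f6d ^ k1
        v2 = 0x6c7967656e657261 ^ k0
        v3 = 0x7465646279746573 ^ k1
        b = (len(data) & 0xff) << 56
        end = len(data) - (len(data) % 8)
        # compress each full 8-byte little-endian word with cROUNDS = 2
        for off in range(0, end, 8):
            m = struct.unpack_from('<Q', data, off)[0]
            v3 ^= m
            for _ in range(2):
                v0, v1, v2, v3 = sipround(v0, v1, v2, v3)
            v0 ^= m
        # fold the trailing 0-7 bytes into b below the length byte
        for i in range(len(data) - end):
            b |= data[end + i] << (i * 8)
        v3 ^= b
        for _ in range(2):
            v0, v1, v2, v3 = sipround(v0, v1, v2, v3)
        v0 ^= b
        v2 ^= 0xff
        # dROUNDS = 4 finalization rounds
        for _ in range(4):
            v0, v1, v2, v3 = sipround(v0, v1, v2, v3)
        return v0 ^ v1 ^ v2 ^ v3

    # should agree with the compiled version exposed by this patch:
    #   from pandas import _hash
    #   assert _hash.siphash(b'pandas', b'0123456789123456') == \
    #       siphash24(b'pandas', b'0123456789123456')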
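
The last five statements of hash_array form a standalone 64-bit finalizer: an
invertible xor-shift/multiply mix whose constants and shift amounts are the ones
used in the splitmix64 mix step, so that nearby inputs (e.g. consecutive
integers) land far apart in uint64 space. A self-contained sketch (mix64 is an
illustrative name, not part of the patch):

    import numpy as np

    def mix64(vals):
        # same xor-shift/multiply steps as the tail of hash_array;
        # uint64 multiplication wraps mod 2**64, as intended
        vals = vals.astype('uint64', copy=True)
        vals ^= vals >> np.uint64(30)
        vals *= np.uint64(0xbf58476d1ce4e5b9)
        vals ^= vals >> np.uint64(27)
        vals *= np.uint64(0x94d049bb133111eb)
        vals ^= vals >> np.uint64(31)
        return vals

    # consecutive small ints spread across the whole 64-bit range
    print(mix64(np.arange(1, 5)))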
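
Finally, hash_pandas_object combines per-column (and index) hashes with the adder
closure: the running hash is multiplied by 3 and the next hash added, in-place
and wrapping mod 2**64, which makes the combined hash sensitive to column order.
A two-element sketch of that step, mirroring the patch's arithmetic:

    import numpy as np

    h = np.array([7, 8], dtype='uint64')        # running hash
    col = np.array([100, 200], dtype='uint64')  # next column's hash

    # mirror of adder: h = h * 3 + col, computed in place
    h = np.multiply(h, np.uint(3), h)
    h = np.add(h, col, h)
    print(h)  # [121 224]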