
Commit 0f8cf5d

ENH: add data hashing routines
xref dask/dask#1807
1 parent 22d982a

8 files changed (+550, -4 lines)

asv_bench/benchmarks/algorithms.py

Lines changed: 33 additions & 0 deletions
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+from pandas.util import testing as tm


 class algorithm(object):
@@ -55,3 +56,35 @@ def time_add_overflow_neg_arr(self):

     def time_add_overflow_mixed_arr(self):
         self.checked_add(self.arr, self.arrmixed)
+
+
+class hashing(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+
+        self.df = pd.DataFrame(
+            {'A': pd.Series(tm.makeStringIndex(100).take(
+                np.random.randint(0, 100, size=N))),
+             'B': pd.Series(tm.makeStringIndex(10000).take(
+                 np.random.randint(0, 10000, size=N))),
+             'D': np.random.randn(N),
+             'E': np.arange(N),
+             'F': pd.date_range('20110101', freq='s', periods=N),
+             'G': pd.timedelta_range('1 day', freq='s', periods=N),
+             })
+        self.df['C'] = self.df['B'].astype('category')
+        self.df.iloc[10:20] = np.nan
+
+    def time_frame(self):
+        self.df.hash()
+
+    def time_series_int(self):
+        self.df.E.hash()
+
+    def time_series_string(self):
+        self.df.B.hash()
+
+    def time_series_categorical(self):
+        self.df.C.hash()

doc/source/whatsnew/v0.19.2.txt

Lines changed: 8 additions & 0 deletions
@@ -16,6 +16,14 @@ Highlights include:
     :backlinks: none


+.. _whatsnew_0192.enhancements:
+
+Enhancements
+~~~~~~~~~~~~
+
+- ``Series/DataFrame/Index`` gain a ``.hash()`` method to provide a data hash (:issue:`14729`)
+
+
 .. _whatsnew_0192.performance:

 Performance Improvements
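
A minimal usage sketch of the API this entry describes, assuming the ``.hash()`` signature added by this commit (``index=True``, ``encoding='utf8'``, ``hash_key=None``). The docstring examples below show fixed output values, so with the default key the result is deterministic:

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

    # one uint64 hash per row, including the index by default
    row_hashes = df.hash()
    assert len(row_hashes) == len(df)

    # excluding the index makes the hash depend only on the values
    values_only = df.hash(index=False)

    # deterministic with the default key, so it can serve as a cheap
    # fingerprint when comparing data across processes (the dask use
    # case referenced in the commit message)
    assert (df.hash() == row_hashes).all()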

pandas/core/base.py

Lines changed: 75 additions & 1 deletion
@@ -795,7 +795,81 @@ def __unicode__(self):
         return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype)


-class IndexOpsMixin(object):
+class HashableMixin(object):
+    """ provide methods for hashable pandas objects """
+
+    def hash(self, index=True, encoding='utf8', hash_key=None):
+        """
+        Return a data hash of the Series/DataFrame.
+        This is a 1-d array of unique hashes of all of the elements in that
+        row, including the Index if desired.
+
+        Parameters
+        ----------
+        index : boolean, default True
+            include the index in the hash (if Series/DataFrame)
+        encoding : string, default 'utf8'
+            encoding for data & key when strings
+        hash_key : string, must be 16 bytes in length if passed
+
+        Returns
+        -------
+        1d uint64 numpy array of hash values, same length as the
+        object
+
+        Examples
+        --------
+        >>> pd.Index([1, 2, 3]).hash()
+        array([ 6238072747940578789, 15839785061582574730,
+                2185194620014831856], dtype=uint64)
+
+        >>> pd.Series([1, 2, 3]).hash()
+        array([  267474170112184751, 16863939785269199747,
+                3948624847917518682], dtype=uint64)
+
+        >>> pd.Series([1, 2, 3]).hash(index=False)
+        array([ 6238072747940578789, 15839785061582574730,
+                2185194620014831856], dtype=uint64)
+
+        >>> pd.DataFrame({'A': [1, 2, 3]}).hash()
+        array([  267474170112184751, 16863939785269199747,
+                3948624847917518682], dtype=uint64)
+
+        >>> pd.DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}).hash()
+        array([11603696091789712533,  5345384428795788641,
+                  46691607209239364], dtype=uint64)
+
+
+        Notes
+        -----
+        These functions do not hash attributes attached to the object,
+        e.g. name for Index/Series. Nor do they hash the columns of
+        a DataFrame.
+
+        Mixed dtypes within a Series (or a column of a DataFrame) will
+        be stringified, for example:
+
+        >>> Series(['1', 2, 3]).hash()
+        array([ 8973981985592347666, 16940873351292606887,
+               10100427194775696709], dtype=uint64)
+
+        >>> Series(['1', '2', '3']).hash()
+        array([ 8973981985592347666, 16940873351292606887,
+               10100427194775696709], dtype=uint64)
+
+        These have the same data hash, while a pure dtype is different:
+
+        >>> Series([1, 2, 3]).hash()
+        array([  267474170112184751, 16863939785269199747,
+                3948624847917518682], dtype=uint64)
+
+        """
+        from pandas.tools.hashing import hash_pandas_object
+        return hash_pandas_object(self, index=index, encoding=encoding,
+                                  hash_key=hash_key)
+
+
+class IndexOpsMixin(HashableMixin):
     """ common ops mixin to support a unified interface / docs for Series /
     Index
     """

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@
 # DataFrame class


-class DataFrame(NDFrame):
+class DataFrame(NDFrame, base.HashableMixin):
     """ Two-dimensional size-mutable, potentially heterogeneous tabular data
     structure with labeled axes (rows and columns). Arithmetic operations
     align on both row and column labels. Can be thought of as a dict-like

pandas/src/hash.pyx

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
+# cython: profile=False
+# Translated from the reference implementation
+# at https://github.com/veorq/SipHash
+
+import cython
+cimport numpy as cnp
+import numpy as np
+from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
+
+from cpython cimport (PyString_Check,
+                      PyBytes_Check,
+                      PyUnicode_Check)
+from libc.stdlib cimport malloc, free
+
+cdef extern from "stdlib.h":
+    void memcpy(void *dst, void *src, size_t n)
+
+DEF cROUNDS = 2
+DEF dROUNDS = 4
+
+
+def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
+    """
+    Parameters
+    ----------
+    arr : 1-d object ndarray of objects
+    key : hash key, must be 16 byte len encoded
+    encoding : encoding for key & arr, default to 'utf8'
+
+    Returns
+    -------
+    1-d uint64 ndarray of hashes
+
+    """
+    cdef:
+        Py_ssize_t i, l, n
+        ndarray[uint64_t] result
+        bytes data, k
+        uint8_t *kb, *lens
+        char **vecs, *cdata
+        object val
+
+    k = <bytes>key.encode(encoding)
+    kb = <uint8_t *>k
+    if len(k) != 16:
+        raise ValueError(
+            'key should be a 16-byte string encoded, got {!r} (len {})'.format(
+                k, len(k)))
+
+    n = len(arr)
+
+    # create an array of bytes
+    vecs = <char **> malloc(n * sizeof(char *))
+    lens = <uint8_t*> malloc(n * sizeof(uint8_t))
+
+    for i in range(n):
+        val = arr[i]
+        if PyString_Check(val):
+            data = <bytes>val.encode(encoding)
+        elif PyBytes_Check(val):
+            data = <bytes>val
+        elif PyUnicode_Check(val):
+            data = <bytes>val.encode(encoding)
+        else:
+            # non-strings
+            data = <bytes>str(val).encode(encoding)
+
+        l = len(data)
+        lens[i] = l
+        vecs[i] = <char *> malloc(l * sizeof(char))
+        cdata = data
+        memcpy(vecs[i], cdata, l)
+
+    result = np.empty(n, dtype=np.uint64)
+    with nogil:
+        for i in range(n):
+            result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
+            free(vecs[i])
+
+    free(vecs)
+    free(lens)
+    return result
+
+cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
+    return (x << b) | (x >> (64 - b))
+
+cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
+    p[0] = <uint8_t>(v)
+    p[1] = <uint8_t>(v >> 8)
+    p[2] = <uint8_t>(v >> 16)
+    p[3] = <uint8_t>(v >> 24)
+
+cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
+    u32to8_le(p, <uint32_t>v)
+    u32to8_le(p + 4, <uint32_t>(v >> 32))
+
+cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
+    return (<uint64_t>p[0] |
+            <uint64_t>p[1] << 8 |
+            <uint64_t>p[2] << 16 |
+            <uint64_t>p[3] << 24 |
+            <uint64_t>p[4] << 32 |
+            <uint64_t>p[5] << 40 |
+            <uint64_t>p[6] << 48 |
+            <uint64_t>p[7] << 56)
+
+cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
+                           uint64_t* v2, uint64_t* v3) nogil:
+    v0[0] += v1[0]
+    v1[0] = _rotl(v1[0], 13)
+    v1[0] ^= v0[0]
+    v0[0] = _rotl(v0[0], 32)
+    v2[0] += v3[0]
+    v3[0] = _rotl(v3[0], 16)
+    v3[0] ^= v2[0]
+    v0[0] += v3[0]
+    v3[0] = _rotl(v3[0], 21)
+    v3[0] ^= v0[0]
+    v2[0] += v1[0]
+    v1[0] = _rotl(v1[0], 17)
+    v1[0] ^= v2[0]
+    v2[0] = _rotl(v2[0], 32)
+
+cpdef uint64_t siphash(bytes data, bytes key) except? 0:
+    if len(key) != 16:
+        raise ValueError(
+            'key should be a 16-byte bytestring, got {!r} (len {})'.format(
+                key, len(key)))
+    return low_level_siphash(data, len(data), key)
+
+
+@cython.cdivision(True)
+cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
+                                uint8_t* key) nogil:
+    cdef uint64_t v0 = 0x736f6d6570736575ULL
+    cdef uint64_t v1 = 0x646f72616e646f6dULL
+    cdef uint64_t v2 = 0x6c7967656e657261ULL
+    cdef uint64_t v3 = 0x7465646279746573ULL
+    cdef uint64_t b
+    cdef uint64_t k0 = u8to64_le(key)
+    cdef uint64_t k1 = u8to64_le(key + 8)
+    cdef uint64_t m
+    cdef int i
+    cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
+    cdef int left = datalen & 7
+    cdef int left_byte
+
+    b = (<uint64_t>datalen) << 56
+    v3 ^= k1
+    v2 ^= k0
+    v1 ^= k1
+    v0 ^= k0
+
+    while (data != end):
+        m = u8to64_le(data)
+        v3 ^= m
+        for i in range(cROUNDS):
+            _sipround(&v0, &v1, &v2, &v3)
+        v0 ^= m
+
+        data += sizeof(uint64_t)
+
+    for i in range(left-1, -1, -1):
+        b |= (<uint64_t>data[i]) << (i * 8)
+
+    v3 ^= b
+
+    for i in range(cROUNDS):
+        _sipround(&v0, &v1, &v2, &v3)
+
+    v0 ^= b
+    v2 ^= 0xff
+
+    for i in range(dROUNDS):
+        _sipround(&v0, &v1, &v2, &v3)
+
+    b = v0 ^ v1 ^ v2 ^ v3
+
+    return b
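
To make the algorithm easier to follow without the Cython machinery, here is a pure-Python sketch of the same SipHash-2-4 construction. It is not part of the commit; it mirrors the reference algorithm that hash.pyx translates, and the ``siphash24`` name is chosen just for illustration:

    # Pure-Python SipHash-2-4 sketch; illustrative only, not part of the commit.
    MASK = 0xFFFFFFFFFFFFFFFF  # all arithmetic is modulo 2**64

    def _rotl(x, b):
        return ((x << b) | (x >> (64 - b))) & MASK

    def siphash24(data, key):
        """Hash bytes `data` with a 16-byte `key`, returning a 64-bit int."""
        if len(key) != 16:
            raise ValueError('key must be exactly 16 bytes')
        k0 = int.from_bytes(key[:8], 'little')
        k1 = int.from_bytes(key[8:], 'little')
        # four state words: fixed constants xored with the key halves
        v0 = 0x736f6d6570736575 ^ k0
        v1 = 0x646f72616e646f6d ^ k1
        v2 = 0x6c7967656e657261 ^ k0
        v3 = 0x7465646279746573 ^ k1

        def sipround():
            nonlocal v0, v1, v2, v3
            v0 = (v0 + v1) & MASK; v1 = _rotl(v1, 13); v1 ^= v0; v0 = _rotl(v0, 32)
            v2 = (v2 + v3) & MASK; v3 = _rotl(v3, 16); v3 ^= v2
            v0 = (v0 + v3) & MASK; v3 = _rotl(v3, 21); v3 ^= v0
            v2 = (v2 + v1) & MASK; v1 = _rotl(v1, 17); v1 ^= v2; v2 = _rotl(v2, 32)

        # the final block carries the message length (mod 256) in its top byte
        b = (len(data) & 0xFF) << 56
        n_full = len(data) // 8

        # compression: each full 8-byte little-endian word gets 2 SipRounds
        for i in range(n_full):
            m = int.from_bytes(data[8 * i:8 * i + 8], 'little')
            v3 ^= m
            sipround(); sipround()
            v0 ^= m

        # fold any trailing bytes into the length block, then compress it
        for i, byte in enumerate(data[8 * n_full:]):
            b |= byte << (8 * i)
        v3 ^= b
        sipround(); sipround()
        v0 ^= b

        # finalization: 4 SipRounds, then xor the state words together
        v2 ^= 0xFF
        for _ in range(4):
            sipround()
        return (v0 ^ v1 ^ v2 ^ v3) & MASK

    # example: with a fixed 16-byte key the hash is deterministic
    key = b'0123456789123456'
    assert siphash24(b'pandas', key) == siphash24(b'pandas', key)

The Cython version above performs the same compression and finalization, but reads each element from a pre-encoded C buffer and runs the per-element loop without the GIL.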
