Skip to content

Commit 14d0f1e

Browse files
committed
ENH: add data hashing routines
xref dask/dask#1807
1 parent 22d982a commit 14d0f1e

File tree

7 files changed

+387
-4
lines changed

7 files changed

+387
-4
lines changed

doc/source/whatsnew/v0.19.2.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ Highlights include:
1616
:backlinks: none
1717

1818

19+
.. _whatsnew_0192.enhancements:
20+
21+
Enhancements
22+
~~~~~~~~~~~~
23+
24+
- ``Series/DataFrame/Index`` gain a ``.hash()`` method to provide a data hash (:issue:`14729`)
25+
26+
1927
.. _whatsnew_0192.performance:
2028

2129
Performance Improvements

pandas/core/base.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -795,7 +795,53 @@ def __unicode__(self):
795795
return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype)
796796

797797

798-
class IndexOpsMixin(object):
798+
class HashableMixin(object):
    """ mixin adding a data-hashing method to pandas objects """

    def hash(self, index=True):
        """
        Compute a data hash for each row / element of this object.

        The result is a 1-d array holding one hash value per row; the
        hash of the corresponding index label is folded into each value
        when ``index`` is true.

        Parameters
        ----------
        index : boolean, default True
            include the index in the hash (if Series/DataFrame)

        Returns
        -------
        1d uint64 numpy array of hash values, same length as the
        object

        Examples
        --------
        >>> pd.Index([1, 2, 3]).hash()
        array([6238072747940578789, 15839785061582574730,
               2185194620014831856], dtype=uint64)

        >>> pd.Series([1, 2, 3]).hash()
        array([267474170112184751, 16863939785269199747,
               3948624847917518682], dtype=uint64)

        >>> pd.Series([1, 2, 3]).hash(index=False)
        array([6238072747940578789, 15839785061582574730,
               2185194620014831856], dtype=uint64)

        >>> pd.DataFrame({'A': [1, 2, 3]}).hash()
        array([267474170112184751, 16863939785269199747,
               3948624847917518682], dtype=uint64)

        >>> pd.DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}).hash()
        array([10443389771536978168, 14762841401800935363,
               9751253963311919054], dtype=uint64)

        """
        # imported at call time rather than at module import
        # NOTE(review): presumably to avoid a circular import with
        # pandas.tools.hashing -- confirm
        from pandas.tools import hashing
        return hashing.hash_pandas_object(self, index=index)
842+
843+
844+
class IndexOpsMixin(HashableMixin):
799845
""" common ops mixin to support a unified interface / docs for Series /
800846
Index
801847
"""

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@
201201
# DataFrame class
202202

203203

204-
class DataFrame(NDFrame):
204+
class DataFrame(NDFrame, base.HashableMixin):
205205
""" Two-dimensional size-mutable, potentially heterogeneous tabular data
206206
structure with labeled axes (rows and columns). Arithmetic operations
207207
align on both row and column labels. Can be thought of as a dict-like

pandas/src/hash.pyx

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# cython: profile=False
2+
# Translated from the reference implementation at https://github.com/veorq/SipHash
3+
4+
import cython
5+
cimport numpy as cnp
6+
import numpy as np
7+
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
8+
9+
from cpython cimport (PyString_Check,
10+
PyBytes_Check,
11+
PyUnicode_Check)
12+
from libc.stdlib cimport malloc, free
13+
14+
DEF cROUNDS = 2
15+
DEF dROUNDS = 4
16+
17+
18+
def hash_object_array(ndarray[object] arr, object key):
    """
    Hash every element of an object array with SipHash.

    Parameters
    ----------
    arr : 1-d object ndarray of objects
        bytes are hashed as-is, unicode is hashed via its utf8 encoding and
        any other object via the utf8 encoding of ``str(val)``
    key : str or bytes
        hashing key; must be exactly 16 bytes long once utf8-encoded

    Returns
    -------
    1-d uint64 ndarray of hashes

    Raises
    ------
    ValueError
        if the encoded key is not 16 bytes long
    MemoryError
        if the temporary pointer / length buffers cannot be allocated
    """
    cdef:
        Py_ssize_t i, n
        ndarray[uint64_t] result
        bytes data, k
        uint8_t *kb
        uint8_t **vecs
        size_t *lens
        list datas
        object val

    if PyBytes_Check(key):
        k = <bytes>key
    else:
        k = <bytes>key.encode('utf8')

    # low_level_siphash unconditionally reads 16 bytes from the key
    if len(k) != 16:
        raise ValueError(
            'key should be a 16-byte bytestring, got {!r} (len {})'.format(
                k, len(k)))
    kb = <uint8_t *>k

    n = len(arr)
    result = np.empty(n, dtype=np.uint64)
    if n == 0:
        # nothing to hash; also avoids a zero-sized malloc
        return result

    # per-element buffer pointers and lengths. The lengths are stored as
    # size_t (the type low_level_siphash takes) so that values longer than
    # 255 bytes are not truncated
    vecs = <uint8_t **> malloc(n * sizeof(uint8_t *))
    lens = <size_t *> malloc(n * sizeof(size_t))
    if vecs == NULL or lens == NULL:
        free(vecs)
        free(lens)
        raise MemoryError()

    # vecs only holds borrowed pointers into the bytes objects, so keep
    # every bytes object referenced until hashing has finished
    datas = []

    try:
        for i in range(n):
            val = arr[i]
            if PyBytes_Check(val):
                data = <bytes>val
            elif PyUnicode_Check(val):
                data = <bytes>val.encode('utf8')
            else:
                # non-strings
                data = <bytes>str(val).encode('utf8')

            datas.append(data)
            vecs[i] = <uint8_t *>data
            lens[i] = len(data)

        with nogil:
            for i in range(n):
                result[i] = low_level_siphash(vecs[i], lens[i], kb)
    finally:
        # release both scratch buffers, also when an element fails to encode
        free(vecs)
        free(lens)

    return result
65+
66+
cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
    # rotate the 64-bit word x left by b bits; the bits shifted out at the
    # top re-enter at the bottom
    cdef uint64_t upper = x << b
    cdef uint64_t wrapped = x >> (64 - b)
    return upper | wrapped
68+
69+
cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
    # store v into p[0..3], least significant byte first
    cdef int k
    for k in range(4):
        p[k] = <uint8_t>(v >> (8 * k))
74+
75+
cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
    # store v into p[0..7], least significant byte first, as two
    # little-endian 32-bit halves
    cdef uint32_t low_half = <uint32_t>v
    cdef uint32_t high_half = <uint32_t>(v >> 32)
    u32to8_le(p, low_half)
    u32to8_le(p + 4, high_half)
78+
79+
cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
    # assemble p[0..7] into one 64-bit word, p[0] being the least
    # significant byte
    cdef uint64_t word = 0
    cdef int k
    for k in range(8):
        word |= (<uint64_t>p[k]) << (8 * k)
    return word
88+
89+
cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
                           uint64_t* v2, uint64_t* v3) nogil:
    # one add / rotate / xor mixing round over the four state words, which
    # are updated in place through the pointers. The state is copied into
    # locals first; the callers in this file always pass four distinct
    # addresses.
    cdef:
        uint64_t a = v0[0]
        uint64_t b = v1[0]
        uint64_t c = v2[0]
        uint64_t d = v3[0]

    a += b
    b = _rotl(b, 13) ^ a
    a = _rotl(a, 32)

    c += d
    d = _rotl(d, 16) ^ c

    a += d
    d = _rotl(d, 21) ^ a

    c += b
    b = _rotl(b, 17) ^ c
    c = _rotl(c, 32)

    v0[0] = a
    v1[0] = b
    v2[0] = c
    v3[0] = d
105+
106+
cpdef uint64_t siphash(bytes data, bytes key) except? 0:
    # hash the byte string ``data`` under ``key``; the key length is checked
    # here because low_level_siphash reads 16 key bytes unconditionally
    cdef Py_ssize_t keylen = len(key)
    if keylen != 16:
        raise ValueError(
            'key should be a 16-byte bytestring, got {!r} (len {})'.format(
                key, keylen))
    return low_level_siphash(data, len(data), key)
112+
113+
@cython.cdivision(True)
114+
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
115+
uint8_t* key) nogil:
116+
cdef uint64_t v0 = 0x736f6d6570736575ULL
117+
cdef uint64_t v1 = 0x646f72616e646f6dULL
118+
cdef uint64_t v2 = 0x6c7967656e657261ULL
119+
cdef uint64_t v3 = 0x7465646279746573ULL
120+
cdef uint64_t b
121+
cdef uint64_t k0 = u8to64_le(key)
122+
cdef uint64_t k1 = u8to64_le(key + 8)
123+
cdef uint64_t m
124+
cdef int i
125+
cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
126+
cdef int left = datalen & 7
127+
cdef int left_byte
128+
129+
b = (<uint64_t>datalen) << 56
130+
v3 ^= k1
131+
v2 ^= k0
132+
v1 ^= k1
133+
v0 ^= k0
134+
135+
while (data != end):
136+
m = u8to64_le(data)
137+
v3 ^= m
138+
for i in range(cROUNDS):
139+
_sipround(&v0, &v1, &v2, &v3)
140+
v0 ^= m
141+
142+
data += sizeof(uint64_t)
143+
144+
for i in range(left-1, -1, -1):
145+
b |= (<uint64_t>data[i]) << (i * 8)
146+
147+
v3 ^= b
148+
149+
for i in range(cROUNDS):
150+
_sipround(&v0, &v1, &v2, &v3)
151+
152+
v0 ^= b
153+
v2 ^= 0xff
154+
155+
for i in range(dROUNDS):
156+
_sipround(&v0, &v1, &v2, &v3)
157+
158+
b = v0 ^ v1 ^ v2 ^ v3
159+
160+
return b

pandas/tools/hashing.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
"""
2+
data hash pandas / numpy objects
3+
"""
4+
5+
import numpy as np
6+
from pandas import Series
7+
from pandas import _hash
8+
from pandas.lib import infer_dtype
9+
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
10+
from pandas.types.common import is_categorical_dtype
11+
12+
# 16 byte long hashing key
13+
_hash_key = '0123456789123456'
14+
15+
def hash_pandas_object(obj, index=True):
    """
    Return a data hash of the Index/Series/DataFrame

    Each row is reduced to a single value: the hashes of the row's
    elements (and, optionally, of its index label) are folded together
    one after another as ``h = 3 * h + next_hash`` in uint64 arithmetic.

    Parameters
    ----------
    obj : Index, Series or DataFrame
    index : boolean, default True
        include the index in the hash (if Series/DataFrame)

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the
    object

    Raises
    ------
    TypeError
        if obj is not an Index, Series or DataFrame
    """

    def adder(h, hashed_to_add):
        # fold another array of hashes into h, in place
        np.multiply(h, np.uint64(3), out=h)
        return np.add(h, hashed_to_add, out=h)

    if isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values).astype('uint64')
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values).astype('uint64')
        if index:
            h = adder(h, hash_pandas_object(obj.index))
    elif isinstance(obj, ABCDataFrame):
        cols = obj.iteritems()
        try:
            first_series = next(cols)[1]
        except StopIteration:
            # a frame without columns has no data to hash: start from zeros
            # so the result still has one entry per row (and equals the
            # index hash when index=True) instead of leaking StopIteration
            h = np.zeros(len(obj), dtype='uint64')
        else:
            h = hash_array(first_series.values).astype('uint64')
            for _, col in cols:
                h = adder(h, hash_array(col.values))
        if index:
            h = adder(h, hash_pandas_object(obj.index))
    else:
        raise TypeError("Unexpected type %s" % type(obj))
    return h
51+
52+
53+
def hash_array(vals):
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or Categorical
        values to hash; the input is not modified

    Returns
    -------
    1d uint64 numpy array of hash values, same length as vals
    """
    # work with categoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # MAIN LOGIC:
    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
        np.issubdtype(vals.dtype, np.timedelta64) or
        np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

        # reinterpret the raw bits as unsigned ints of the same width, then
        # widen; astype also copies, so the in-place mixing below never
        # touches the caller's data
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:

        # we want to stringify
        # then apply a consistent hashing scheme
        # hash_object_array only accepts object arrays, so box anything
        # else that lands here (fixed-width str/bytes, > 8 byte numerics)
        vals = _hash.hash_object_array(vals.astype(object, copy=False),
                                       _hash_key)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    # (these shifts and multipliers are those of the splitmix64 finalizer)
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals

0 commit comments

Comments
 (0)