Commit 2a1db31

add in siphash
1 parent 434c8c1 commit 2a1db31

4 files changed, 144 insertions(+), 11 deletions(-)

pandas/core/strings.py

Lines changed: 3 additions & 3 deletions

@@ -154,7 +154,7 @@ def _na_map(f, arr, na_result=np.nan, dtype=object):
    return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)


-def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
+def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object, convert=True):
    if not len(arr):
        return np.ndarray(0, dtype=dtype)

@@ -165,8 +165,8 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
    if na_mask:
        mask = isnull(arr)
        try:
-            convert = not all(mask)
-            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
+            _convert = convert and not all(mask)
+            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), _convert)
        except (TypeError, AttributeError):

            def g(x):
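The new convert keyword lets a caller of _map switch off the dtype inference that lib.map_infer_mask would otherwise apply, so a mapper whose output should keep its raw form is left untouched. A minimal sketch of the intent, assuming the private _map helper is called directly (this commit does not show a caller, so the example is illustrative only):

import numpy as np
from pandas.core.strings import _map  # private helper changed above

arr = np.array(['a', 'bb', None], dtype=object)

# Default behaviour: map_infer_mask may re-infer the mapped object result
# to a tighter dtype (convert stays True unless every value is NA).
inferred = _map(len, arr, na_mask=True)

# convert=False forces _convert to False, so the mapped values are
# returned without any dtype inference.
raw = _map(len, arr, na_mask=True, convert=False)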

pandas/src/hash.pyx

Lines changed: 131 additions & 0 deletions

@@ -0,0 +1,131 @@
+# cython: profile=False
+# Translated from the reference implementation at https://github.com/veorq/SipHash
+
+import cython
+cimport numpy as cnp
+import numpy as np
+from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
+
+DEF cROUNDS = 2
+DEF dROUNDS = 4
+
+
+def hash_object_array(ndarray[object] arr, object key):
+    """
+    Parameters
+    ----------
+    arr : 1-d object ndarray of objects
+
+    Returns
+    -------
+    1-d uint64 ndarray of hashes
+
+    """
+    cdef:
+        Py_ssize_t i, n
+        ndarray[uint64_t] result
+        bytes data, k
+
+    k = <bytes>key.encode('utf8')
+    n = len(arr)
+    result = np.empty(n, dtype=np.uint64)
+    for i in range(n):
+        data = <bytes>(arr[i].encode('utf8'))
+        result[i] = low_level_siphash(data, len(data), k)
+    return result
+
+cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
+    return (x << b) | (x >> (64 - b))
+
+cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
+    p[0] = <uint8_t>(v)
+    p[1] = <uint8_t>(v >> 8)
+    p[2] = <uint8_t>(v >> 16)
+    p[3] = <uint8_t>(v >> 24)
+
+cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
+    u32to8_le(p, <uint32_t>v)
+    u32to8_le(p + 4, <uint32_t>(v >> 32))
+
+cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
+    return (<uint64_t>p[0] |
+            <uint64_t>p[1] << 8 |
+            <uint64_t>p[2] << 16 |
+            <uint64_t>p[3] << 24 |
+            <uint64_t>p[4] << 32 |
+            <uint64_t>p[5] << 40 |
+            <uint64_t>p[6] << 48 |
+            <uint64_t>p[7] << 56)
+
+cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
+                           uint64_t* v2, uint64_t* v3) nogil:
+    v0[0] += v1[0]
+    v1[0] = _rotl(v1[0], 13)
+    v1[0] ^= v0[0]
+    v0[0] = _rotl(v0[0], 32)
+    v2[0] += v3[0]
+    v3[0] = _rotl(v3[0], 16)
+    v3[0] ^= v2[0]
+    v0[0] += v3[0]
+    v3[0] = _rotl(v3[0], 21)
+    v3[0] ^= v0[0]
+    v2[0] += v1[0]
+    v1[0] = _rotl(v1[0], 17)
+    v1[0] ^= v2[0]
+    v2[0] = _rotl(v2[0], 32)
+
+cpdef uint64_t siphash(bytes data, bytes key) except? 0:
+    if len(key) != 16:
+        raise ValueError(
+            'key should be a 16-byte bytestring, got {!r} (len {})'.format(
+                key, len(key)))
+    return low_level_siphash(data, len(data), key)
+
+@cython.cdivision(True)
+cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
+                                uint8_t* key) nogil:
+    cdef uint64_t v0 = 0x736f6d6570736575ULL
+    cdef uint64_t v1 = 0x646f72616e646f6dULL
+    cdef uint64_t v2 = 0x6c7967656e657261ULL
+    cdef uint64_t v3 = 0x7465646279746573ULL
+    cdef uint64_t b
+    cdef uint64_t k0 = u8to64_le(key)
+    cdef uint64_t k1 = u8to64_le(key + 8)
+    cdef uint64_t m
+    cdef int i
+    cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
+    cdef int left = datalen & 7
+    cdef int left_byte
+
+    b = (<uint64_t>datalen) << 56
+    v3 ^= k1
+    v2 ^= k0
+    v1 ^= k1
+    v0 ^= k0
+
+    while (data != end):
+        m = u8to64_le(data)
+        v3 ^= m
+        for i in range(cROUNDS):
+            _sipround(&v0, &v1, &v2, &v3)
+        v0 ^= m
+
+        data += sizeof(uint64_t)
+
+    for i in range(left-1, -1, -1):
+        b |= (<uint64_t>data[i]) << (i * 8)
+
+    v3 ^= b
+
+    for i in range(cROUNDS):
+        _sipround(&v0, &v1, &v2, &v3)
+
+    v0 ^= b
+    v2 ^= 0xff
+
+    for i in range(dROUNDS):
+        _sipround(&v0, &v1, &v2, &v3)
+
+    b = v0 ^ v1 ^ v2 ^ v3
+
+    return b
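Once the extension is compiled (see the setup.py change below it is registered as pandas._hash), a quick smoke test might look like the following; the 16-byte key and the sample inputs are arbitrary choices for illustration, and no particular output values are asserted:

import numpy as np
from pandas import _hash  # module name used in pandas/tools/hashing.py below

# siphash requires a 16-byte key, per the ValueError guard above.
key = b'0123456789abcdef'
h = _hash.siphash(b'some bytes to hash', key)   # -> uint64 value

# hash_object_array UTF-8-encodes each element and SipHashes it with the
# given string key; low_level_siphash reads 16 bytes of key material, so a
# 16-character key is used here as well.
arr = np.array(['a', 'b', 'c'], dtype=object)
hashes = _hash.hash_object_array(arr, '0123456789abcdef')   # uint64 ndarray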

pandas/tools/hashing.py

Lines changed: 5 additions & 6 deletions

@@ -2,9 +2,10 @@
 data hash pandas / numpy objects
 """

-from hashlib import md5
 import numpy as np
 from pandas import Series
+from pandas import _hash
+from pandas.lib import infer_dtype
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
 from pandas.types.common import is_categorical_dtype

@@ -61,10 +62,11 @@ def hash_array(vals):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # MAIN LOGIC:
+    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
-    if vals.dtype == np.bool:
+    if inferred == 'boolean':
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
@@ -76,10 +78,7 @@ def hash_array(vals):

        # we want to stringify
        # then apply a consistent hashing scheme
-        def f(v):
-            return int(md5(v).hexdigest(), 16) % (10 ** 8)
-        vals = Series(vals).astype(str).str.encode('utf8').values
-        vals = np.array([f(v) for v in vals], dtype='uint64')
+        vals = _hash.hash_object_array(vals, 'foo')

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
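With this change, object arrays are hashed in a single pass through the Cython SipHash routine instead of a per-element Python-level md5 call. A rough usage sketch, assuming the module path pandas.tools.hashing from the file header (exact output values are not shown):

import numpy as np
from pandas.tools.hashing import hash_array

# String/object values are now hashed via _hash.hash_object_array with the
# fixed key used above, then bit-mixed by the code that follows this hunk.
vals = np.array(['a', 'b', 'c'], dtype=object)
out = hash_array(vals)   # 1-d uint64 ndarray of hashes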

setup.py

Lines changed: 5 additions & 2 deletions

@@ -331,6 +331,7 @@ class CheckSDist(sdist_class):
                 'pandas/src/period.pyx',
                 'pandas/src/sparse.pyx',
                 'pandas/src/testing.pyx',
+                 'pandas/src/hash.pyx',
                 'pandas/io/sas/saslib.pyx']

    def initialize_options(self):
@@ -501,10 +502,12 @@ def pxd(name):
               'sources': ['pandas/src/parser/tokenizer.c',
                           'pandas/src/parser/io.c']},
    _sparse={'pyxfile': 'src/sparse',
-             'depends': ([srcpath('sparse', suffix='.pyx')]
-                         + _pxi_dep['_sparse'])},
+             'depends': ([srcpath('sparse', suffix='.pyx')] +
+                         _pxi_dep['_sparse'])},
    _testing={'pyxfile': 'src/testing',
              'depends': [srcpath('testing', suffix='.pyx')]},
+    _hash={'pyxfile': 'src/hash',
+           'depends': [srcpath('hash', suffix='.pyx')]},
)

ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'}
