Skip to content

Commit 72eea8e

Browse files
committed
using murmur hash for float64 khash-tables
1 parent 27aae22 commit 72eea8e

File tree

2 files changed

+120
-15
lines changed

2 files changed

+120
-15
lines changed
+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import pandas as pd
2+
import numpy as np
3+
4+
5+
class IsinWithArange:
    """Benchmark ``Series.isin`` against a contiguous ``np.arange`` of values.

    Parameters (asv-style ``params`` / ``param_names``):

    dtype
        Element type that both the series and the lookup values are cast to.
    M
        Number of lookup values (``np.arange(M)``) and width of the interval
        the random series values are drawn from.
    offset_factor
        The series values are drawn from ``[M * offset_factor,
        M * offset_factor + M)``, so the factor shifts the series relative
        to the lookup values.
    """

    params = [
        # ``np.object`` was only an alias of the builtin ``object`` and has
        # been removed from NumPy, so the builtin is used directly.
        [np.float64, np.int64, object],
        [
            1_000,
            2_000,
            8_000,
        ],  # problem when quadratic behavior is triggered: [100_000, 10_000_000],
        [-2, 0, 2],
    ]
    param_names = ["dtype", "M", "offset_factor"]

    def setup(self, dtype, M, offset_factor):
        """Build the 10**6-element series and the ``M`` lookup values."""
        offset = int(M * offset_factor)
        # Fixed seed so every run benchmarks the same data.
        np.random.seed(42)
        self.s = pd.Series(np.random.randint(offset, M + offset, 10 ** 6)).astype(dtype)
        self.values = np.arange(M).astype(dtype)

    def time_isin(self, dtype, M, offset_factor):
        """Timed operation: membership test of the series in the lookup values."""
        self.s.isin(self.values)
25+
26+
27+
class GH28303Example:
    """Benchmark ``groupby(...).last()`` on float64 keys built from timestamps.

    The class name refers to issue GH 28303. The group keys are the integer
    representation of a ``DatetimeIndex`` divided by ``1e9`` and rounded, i.e.
    float64 values.
    """

    def setup(self):
        """Create a one-column frame of 10**6 timestamps and its float keys."""
        # ``periods`` is a count and must be an integer; a float such as
        # ``1e6`` is deprecated for ``date_range``.
        self.df = pd.date_range(
            start="1/1/2018", end="1/2/2018", periods=10 ** 6
        ).to_frame()
        self.group_index = np.round(self.df.index.astype(int) / 1e9)

    def time_groupby(self):
        """Timed operation: group by the float keys and take the last row."""
        self.df.groupby(self.group_index).last()
36+
37+
38+
class UniqueAndFactorizeArange:
    """Benchmark ``pd.factorize`` / ``pd.unique`` on shifted float64 ranges.

    For each ``exponent`` the data consists of the 10**4 consecutive values
    ``10**exponent + 0 .. 10**exponent + 9999`` as float64, with every value
    repeated 100 times in a row (10**6 elements in total).
    """

    params = range(4, 16)
    param_names = ["exponent"]

    def setup(self, exponent):
        """Build the repeated, shifted float64 array used by the timings."""
        base = np.arange(10 ** 4, dtype="float64")
        shifted = base + 10 ** exponent
        self.a2 = np.repeat(shifted, 100)

    def time_factorize(self, exponent):
        """Timed operation: ``pd.factorize`` of the prepared array."""
        pd.factorize(self.a2)

    def time_unique(self, exponent):
        """Timed operation: ``pd.unique`` of the prepared array."""
        pd.unique(self.a2)

pandas/_libs/src/klib/khash_python.h

+70-15
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,80 @@
1313
// is 64 bits the truncation causes collision issues. Given all that, we use our own
1414
// simple hash, viewing the double bytes as an int64 and using khash's default
1515
// hash for 64 bit integers.
16-
// GH 13436
16+
// GH 13436 showed that _Py_HashDouble doesn't work well with khash
17+
// GH 28303 showed that the simple xor-ing version isn't good enough,
18+
// thus murmur2-hash is used
19+
20+
21+
// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
22+
// it is possible to have a special x64-version, which would need less operations, but
23+
// always using the 32bit version also has some benefits:
24+
// - one code for 32bit and 64bit builds
25+
// - the same case for 32bit and 64bit builds
26+
// - no performance difference could be measured compared to a possible x64-version
27+
28+
// Hashes two 32-bit words (k1 first, then k2) into one 32-bit value using
// the MurmurHash2 mixing steps.
khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){
    // Mixing constants: multiplier and right-shift amount.
    const khint32_t M_32 = 0x5bd1e995;
    const int R_32 = 24;
    const khint32_t SEED = 0xc70f6907UL;

    khint32_t words[2] = {k1, k2};
    // Start from the seed XOR-ed with 4.
    // NOTE(review): reference MurmurHash2 uses `seed ^ len`; the constant 4
    // is kept as-is because changing it would change every hash value.
    khint32_t h = SEED ^ 4;
    int i;

    // Mix in each 4-byte word in order: scramble the word, then fold it
    // into the running hash.
    // NOTE(review): the right shifts assume khint32_t is an unsigned type.
    for (i = 0; i < 2; ++i) {
        khint32_t k = words[i];

        k *= M_32;
        k ^= k >> R_32;
        k *= M_32;

        h *= M_32;
        h ^= k;
    }

    // Final avalanche so the last mixed-in bytes affect all output bits.
    h ^= h >> 13;
    h *= M_32;
    h ^= h >> 15;
    return h;
}
61+
62+
// Hashes a 64-bit value to 32 bits: the value is split into its low and
// high 32-bit halves, which are passed (low first) to murmur2_32_32to32.
khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){
    const khint32_t low_half = (khint32_t)k;
    const khint32_t high_half = (khint32_t)(k>>32);

    return murmur2_32_32to32(low_half, high_half);
}
68+
1769
khint64_t PANDAS_INLINE asint64(double key) {
18-
khint64_t val;
19-
memcpy(&val, &key, sizeof(double));
20-
return val;
70+
khint64_t val;
71+
memcpy(&val, &key, sizeof(double));
72+
return val;
2173
}
2274

23-
// correct for all inputs but not -0.0 and NaNs
24-
#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
2575

26-
// correct for all inputs but not NaNs
27-
#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \
28-
kh_float64_hash_func_0_NAN(0.0) : \
29-
kh_float64_hash_func_0_NAN(key))
76+
#define ZERO_HASH 0
77+
#define NAN_HASH 0
78+
79+
// Hash function for double keys.
//  - every NaN (val != val) hashes to NAN_HASH
//  - every value comparing equal to 0.0 (so both +0.0 and -0.0) hashes to
//    ZERO_HASH
//  - any other value is hashed via murmur2 over its 64-bit representation
khint32_t PANDAS_INLINE kh_float64_hash_func(double val){
    // A NaN is the only value that compares unequal to itself.
    if(val!=val){
        return NAN_HASH;
    }
    // Handled separately so that all values equal to 0.0 share one hash.
    if(val==0.0){
        return ZERO_HASH;
    }
    return murmur2_64to32(asint64(val));
}
3089

31-
// correct for all
32-
#define kh_float64_hash_func(key) ((key) != (key) ? \
33-
kh_float64_hash_func_NAN(Py_NAN) : \
34-
kh_float64_hash_func_NAN(key))
3590

3691
#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
3792

@@ -121,4 +176,4 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
121176

122177
void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khint_t val) {
123178
kh_resize_str(table->table, val);
124-
}
179+
}

0 commit comments

Comments
 (0)