Skip to content

Commit fea3536

Browse files
committed
PERF: using murmur2 hash for float64 khash-tables
1 parent 27aae22 commit fea3536

File tree

2 files changed

+123
-15
lines changed

2 files changed

+123
-15
lines changed
+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import numpy as np
2+
3+
import pandas as pd
4+
5+
6+
class IsinWithArange:
    """Benchmark ``Series.isin`` against a contiguous ``np.arange`` of values.

    The class follows the ``params`` / ``param_names`` / ``setup`` /
    ``time_*`` benchmark layout; every combination of the three parameter
    lists below is passed positionally to ``setup`` and ``time_isin``.

    Parameters
    ----------
    dtype : type
        Dtype that both the series and the looked-up values are cast to.
    M : int
        Number of values handed to ``isin`` (``np.arange(M)``).
    offset_factor : int
        The series is drawn from ``[M * offset_factor, M * offset_factor + M)``.
        A factor of 0 makes that interval coincide with the values; with
        -2 or 2 the two intervals do not overlap at all.
    """

    params = [
        # Builtin ``object`` rather than ``np.object``: the NumPy alias was
        # deprecated in 1.20 and removed in 1.24.
        [np.float64, np.int64, object],
        [
            1_000,
            2_000,
            8_000,
        ],  # problem when quadratic behavior is triggered: [100_000, 10_000_000],
        [-2, 0, 2],
    ]
    param_names = ["dtype", "M", "offset_factor"]

    def setup(self, dtype, M, offset_factor):
        """Build a seeded one-million-element series and the ``isin`` values."""
        offset = int(M * offset_factor)
        # Fixed seed so every run hashes the same data.
        np.random.seed(42)
        self.s = pd.Series(np.random.randint(offset, M + offset, 10 ** 6)).astype(dtype)
        self.values = np.arange(M).astype(dtype)

    def time_isin(self, dtype, M, offset_factor):
        """Timed section: membership test of the series against ``values``."""
        self.s.isin(self.values)
26+
27+
28+
class GH28303Example:
    """Benchmark a groupby whose keys are float64 values (case from GH 28303).

    One million evenly spaced timestamps covering a single day are turned
    into a frame; the group key is the index converted to integers, divided
    by 1e9 and rounded, which leaves many rows per distinct float key.
    """

    def setup(self):
        """Create the frame and the float group keys used by the timing."""
        # ``periods`` is a count and must be an integer: float values such
        # as 1e6 are deprecated by pandas.
        self.df = pd.date_range(
            start="1/1/2018", end="1/2/2018", periods=10 ** 6
        ).to_frame()
        # Ask for int64 explicitly so the result does not depend on what the
        # platform maps plain ``int`` to. Dividing by 1e9 yields floats
        # (whole seconds when the index holds nanoseconds).
        self.group_index = np.round(self.df.index.astype("int64") / 1e9)

    def time_groupby(self):
        """Timed section: group by the float keys and take the last row."""
        self.df.groupby(self.group_index).last()
37+
38+
39+
class UniqueAndFactorizeArange:
    """Time ``pd.factorize`` and ``pd.unique`` on shifted float64 ranges.

    For each ``exponent`` the data is the 10,000 consecutive floats starting
    at ``10 ** exponent``, with every value repeated 100 times in a row.
    """

    params = range(4, 16)
    param_names = ["exponent"]

    def setup(self, exponent):
        """Build the one-million-element float64 array stored in ``self.a2``."""
        shift = 10 ** exponent
        base = np.arange(10_000, dtype=np.float64)
        self.a2 = np.repeat(base + shift, 100)

    def time_factorize(self, exponent):
        """Timed section: factorize the prepared array."""
        pd.factorize(self.a2)

    def time_unique(self, exponent):
        """Timed section: de-duplicate the prepared array."""
        pd.unique(self.a2)

pandas/_libs/src/klib/khash_python.h

+72-15
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,82 @@
1313
// is 64 bits the truncation causes collision issues. Given all that, we use our own
1414
// simple hash, viewing the double bytes as an int64 and using khash's default
1515
// hash for 64 bit integers.
16-
// GH 13436
16+
// GH 13436 showed that _Py_HashDouble doesn't work well with khash
17+
// GH 28303 showed that the simple xoring-version isn't good enough
18+
// thus murmur2-hash is used
19+
20+
21+
// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
22+
// it is possible to have a special x64-version, which would need less operations, but
23+
// always using the 32bit version also has some benefits:
24+
// - one code for 32bit and 64bit builds
25+
// - the same case for 32bit and 64bit builds
26+
// - no performance difference could be measured compared to a possible x64-version
27+
28+
// MurmurHash2-style mix of an 8-byte key supplied as two 32-bit words.
// k1 is folded into the state first, then k2; the mixed state is returned.
khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){
    // Mixing constants: a multiplier and a right-shift distance.
    const khint32_t mult = 0x5bd1e995;
    const int shift = 24;
    const khint32_t seed = 0xc70f6907UL;

    // Starting state: the fixed seed xor-ed with the constant 4.
    khint32_t hash = seed ^ 4;

    // Scramble the first word, then fold it into the state.
    k1 *= mult;
    k1 ^= k1 >> shift;
    k1 *= mult;
    hash = (hash * mult) ^ k1;

    // Same treatment for the second word.
    k2 *= mult;
    k2 ^= k2 >> shift;
    k2 *= mult;
    hash = (hash * mult) ^ k2;

    // Closing rounds so the most recently added bits spread through
    // the whole result.
    hash ^= hash >> 13;
    hash *= mult;
    hash ^= hash >> 15;
    return hash;
}
61+
62+
// Hash a 64-bit key down to 32 bits: bits 0..31 and bits 32..63 of k are
// passed to murmur2_32_32to32 as its first and second word respectively.
khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){
    const khint32_t low_word = (khint32_t)k;
    const khint32_t high_word = (khint32_t)(k >> 32);
    return murmur2_32_32to32(low_word, high_word);
}
68+
1769
// Return the bytes of a double reinterpreted as a khint64_t; the value is
// copied bit-for-bit, not numerically converted.
khint64_t PANDAS_INLINE asint64(double key) {
    khint64_t bits;
    // Byte-wise copy instead of reading the double through an integer pointer.
    memcpy(&bits, &key, sizeof(double));
    return bits;
}
2274

23-
// Xor/shift hash over the raw bits of the double:
// (bits >> 33) ^ bits ^ (bits << 11), cast to khint32_t.
// Correct for all inputs except -0.0 and NaNs: -0.0 compares equal to 0.0
// but has a different bit pattern, and NaNs that kh_float64_hash_equal
// treats as equal may differ in their bits.
// The argument is evaluated three times.
24-
#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
2575

26-
// Correct for all inputs except NaNs: any key that compares equal to 0.0
// (so both 0.0 and -0.0) is hashed as the literal 0.0; every other key
// goes straight to kh_float64_hash_func_0_NAN.
27-
#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \
28-
kh_float64_hash_func_0_NAN(0.0) : \
29-
kh_float64_hash_func_0_NAN(key))
76+
#define ZERO_HASH 0
77+
#define NAN_HASH 0
78+
79+
khint32_t PANDAS_INLINE kh_float64_hash_func(double val){
80+
// 0.0 and -0.0 should have the same hash:
81+
if (val == 0.0){
82+
return ZERO_HASH;
83+
}
84+
// all nans should have the same hash:
85+
if ( val!=val ){
86+
return NAN_HASH;
87+
}
88+
khint64_t as_int = asint64(val);
89+
return murmur2_64to32(as_int);
90+
}
3091

31-
// Correct for all inputs: a NaN key (key != key) is replaced by Py_NAN
// before hashing, so every NaN gets the same hash; all other keys are
// passed to kh_float64_hash_func_NAN unchanged.
32-
#define kh_float64_hash_func(key) ((key) != (key) ? \
33-
kh_float64_hash_func_NAN(Py_NAN) : \
34-
kh_float64_hash_func_NAN(key))
3592

3693
// Equality test for two doubles used as keys: true when a == b, or when
// both are NaN (x != x holds only for NaN), so any two NaNs count as equal.
// Each argument can be evaluated more than once; pass side-effect-free
// expressions.
#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
3794

@@ -121,4 +178,4 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
121178

122179
// Resize helper: forwards the inner `table->table` member and `val`
// unchanged to kh_resize_str. Returns nothing.
void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khint_t val) {
kh_resize_str(table->table, val);
124-
}
181+
}

0 commit comments

Comments
 (0)