
Commit 7923c63

Shuxin Yang authored and agentzh committed

optimize: lj_str_new: uses randomized hash functions based on crc32 when -msse4.2 is specified.
security wise:
--------------
o. crc32 covers up to 128 bytes, so it is difficult to attack with len <= 128.
o. for len >= 128, 128 bytes at random positions are crc32-ed, so it is less vulnerable.

performance wise:
-----------------
o. performance is measured by 'make -C src/x64/test benchmark'.
o. the new hash function is relatively cheaper to compute if len < 120, and about 1.8x as slow if len >= 120.
o. for len in [1-3], the original hash function has a better distribution; need to understand why it is so.

Signed-off-by: Yichun Zhang (agentzh) <[email protected]>
1 parent 46ed47e commit 7923c63
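
The whole patch hinges on the SSE 4.2 crc32 instruction, which GCC exposes as the _mm_crc32_u32/_mm_crc32_u64 intrinsics used throughout the new header below: each _mm_crc32_u64 call folds 8 bytes into a running checksum in a single instruction. A minimal standalone illustration of these intrinsics (the file name and test string are ours, not part of the commit):

/* crc32_demo.c -- build: gcc -O2 -msse4.2 crc32_demo.c */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <smmintrin.h>   /* SSE 4.2 crc32 intrinsics */

int main(void)
{
  const char buf[] = "hello, luajit";
  uint64_t v, h = _mm_crc32_u32(0, (uint32_t)(sizeof(buf) - 1)); /* fold in the length */
  memcpy(&v, buf, 8);       /* first 8 bytes of the string */
  h = _mm_crc32_u64(h, v);  /* fold 8 bytes in one instruction */
  printf("h = %u\n", (uint32_t)h);
  return 0;
}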

File tree

9 files changed: +856 -10 lines changed


src/lj_str.c: +34 -10
@@ -128,17 +128,16 @@ void lj_str_resize(lua_State *L, MSize newmask)
   g->strhash = newhash;
 }
 
-/* Intern a string and return string object. */
-GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
-{
-  global_State *g;
-  GCstr *s;
-  GCobj *o;
+#include "x64/src/lj_str_hash_x64.h"
+
+#if defined(LJ_ARCH_STR_HASH)
+#define LJ_STR_HASH LJ_ARCH_STR_HASH
+#else
+static MSize
+lj_str_original_hash(const char *str, size_t lenx) {
   MSize len = (MSize)lenx;
   MSize a, b, h = len;
-  if (lenx >= LJ_MAX_STR)
-    lj_err_msg(L, LJ_ERR_STROV);
-  g = G(L);
+
   /* Compute string hash. Constants taken from lookup3 hash by Bob Jenkins. */
   if (len >= 4) {  /* Caveat: unaligned access! */
     a = lj_getu32(str);
@@ -152,11 +151,36 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
     b = *(const uint8_t *)(str+(len>>1));
     h ^= b; h -= lj_rol(b, 14);
   } else {
-    return &g->strempty;
+    return 0;
   }
+
   a ^= h; a -= lj_rol(h, 11);
   b ^= a; b -= lj_rol(a, 25);
   h ^= b; h -= lj_rol(b, 16);
+
+  return h;
+}
+#define LJ_STR_HASH lj_str_original_hash
+#endif
+
+/* Intern a string and return string object. */
+GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
+{
+  global_State *g;
+  GCstr *s;
+  GCobj *o;
+  MSize len = (MSize)lenx;
+  MSize h;
+
+  if (lenx >= LJ_MAX_STR)
+    lj_err_msg(L, LJ_ERR_STROV);
+  g = G(L);
+  if (LJ_UNLIKELY(lenx == 0)) {
+    return &g->strempty;
+  }
+
+  h = LJ_STR_HASH(str, lenx);
+
   /* Check if the string has already been interned. */
   o = gcref(g->strhash[h & g->strmask]);
 #ifndef LUAJIT_USE_VALGRIND
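
After this hunk, length validation and the empty-string fast path live in lj_str_new, while hashing goes through the LJ_STR_HASH macro: the SSE 4.2 header either defines LJ_ARCH_STR_HASH or leaves it undefined, selecting the CRC32 path or the original lookup3-fragment fallback at compile time. As a sanity sketch, the fallback's short-string branch (fully visible above, and repeated as the #else branch of lj_str_hash_1_4 below) can be lifted into a self-contained program; rol stands in for LuaJIT's lj_rol macro and the main() harness is hypothetical:

#include <stdio.h>
#include <stdint.h>

/* rotate-left, standing in for LuaJIT's lj_rol macro */
static uint32_t rol(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* the len in [1, 4) branch of lj_str_original_hash, as in the hunk above */
static uint32_t hash_short(const char *str, uint32_t len)
{
  uint32_t a, b, h = len;
  a = *(const uint8_t *)str;
  h ^= *(const uint8_t *)(str + len - 1);
  b = *(const uint8_t *)(str + (len >> 1));
  h ^= b; h -= rol(b, 14);
  a ^= h; a -= rol(h, 11);
  b ^= a; b -= rol(a, 25);
  h ^= b; h -= rol(b, 16);
  return h;
}

int main(void)
{
  printf("%08x %08x\n", hash_short("ab", 2), hash_short("ba", 2));
  return 0;
}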

src/x64/Makefile: +13 -0
@@ -0,0 +1,13 @@
+.PHONY: default test benchmark clean
+
+default:
+	@echo "make targets include: test benchmark clean"
+
+test:
+	$(MAKE) -C test test
+
+benchmark:
+	$(MAKE) -C test benchmark
+
+clean:
+	$(MAKE) -C test clean

src/x64/src/lj_str_hash_x64.h: +266 -0
@@ -0,0 +1,266 @@
+/*
+ * This file defines a string hash function using CRC32. It takes advantage
+ * of Intel hardware support (the crc32 instruction, SSE 4.2) to speed up
+ * the CRC32 computation. The hash functions compute the CRC32 of the string
+ * length and of up to 128 bytes of the given string.
+ */
+
+#ifndef _LJ_STR_HASH_X64_H_
+#define _LJ_STR_HASH_X64_H_
+
+#if defined(__SSE4_2__) && defined(__x86_64) && defined(__GNUC__)
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <time.h>
+#include <smmintrin.h>
+
+#include "../../lj_def.h"
+
+#undef LJ_AINLINE
+#define LJ_AINLINE
+
+static const uint64_t* cast_uint64p(const char* str)
+{
+  return (const uint64_t*)(void*)str;
+}
+
+static const uint32_t* cast_uint32p(const char* str)
+{
+  return (const uint32_t*)(void*)str;
+}
+
+/* hash a string with len in [1, 4) */
+static LJ_AINLINE uint32_t lj_str_hash_1_4(const char* str, uint32_t len)
+{
+#if 0
+  /* TODO: The #else part (i.e. the original algorithm) works better when
+   * the load factor is high, as revealed by the conflict benchmark (via
+   * the 'make benchmark' command); need to understand why it is so.
+   */
+  uint32_t v = str[0];
+  v = (v << 8) | str[len >> 1];
+  v = (v << 8) | str[len - 1];
+  v = (v << 8) | len;
+  return _mm_crc32_u32(0, v);
+#else
+  uint32_t a, b, h = len;
+
+  a = *(const uint8_t *)str;
+  h ^= *(const uint8_t *)(str+len-1);
+  b = *(const uint8_t *)(str+(len>>1));
+  h ^= b; h -= lj_rol(b, 14);
+
+  a ^= h; a -= lj_rol(h, 11);
+  b ^= a; b -= lj_rol(a, 25);
+  h ^= b; h -= lj_rol(b, 16);
+
+  return h;
+#endif
+}
+
+/* hash a string with len in [4, 16) */
+static LJ_AINLINE uint32_t lj_str_hash_4_16(const char* str, uint32_t len)
+{
+  uint64_t v1, v2, h;
+
+  if (len >= 8) {
+    v1 = *cast_uint64p(str);
+    v2 = *cast_uint64p(str + len - 8);
+  } else {
+    v1 = *cast_uint32p(str);
+    v2 = *cast_uint32p(str + len - 4);
+  }
+
+  h = _mm_crc32_u32(0, len);
+  h = _mm_crc32_u64(h, v1);
+  h = _mm_crc32_u64(h, v2);
+  return h;
+}
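
A detail worth noting in lj_str_hash_4_16: the two words deliberately overlap rather than leaving uncovered bytes. For len = 10, v1 reads offsets [0, 8) and v2 reads offsets [2, 10), so every byte of the string feeds into the CRC at least once. Folding in len first also matters: for runs of a single repeated byte, lengths 8 and 9 would otherwise produce identical v1 and v2 and thus identical hashes.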
+
+/* hash a string with length in [16, 128) */
+static uint32_t lj_str_hash_16_128(const char* str, uint32_t len)
+{
+  uint64_t h1, h2;
+  uint32_t i;
+
+  h1 = _mm_crc32_u32(0, len);
+  h2 = 0;
+
+  for (i = 0; i < len - 16; i += 16) {
+    h1 += _mm_crc32_u64(h1, *cast_uint64p(str + i));
+    h2 += _mm_crc32_u64(h2, *cast_uint64p(str + i + 8));
+  };
+
+  h1 = _mm_crc32_u64(h1, *cast_uint64p(str + len - 16));
+  h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));
+
+  return _mm_crc32_u32(h1, h2);
+}
+
+/* **************************************************************************
+ *
+ * The following code hashes strings with length >= 128.
+ *
+ * **************************************************************************
+ */
+static uint32_t random_pos[32][2];
+static const int8_t log2_tab[128] = { -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,
+  4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 };
+
+/* return floor(log2(n)) */
+static LJ_AINLINE uint32_t log2_floor(uint32_t n)
+{
+  if (n <= 127) {
+    return log2_tab[n];
+  }
+
+  if ((n >> 8) <= 127) {
+    return log2_tab[n >> 8] + 8;
+  }
+
+  if ((n >> 16) <= 127) {
+    return log2_tab[n >> 16] + 16;
+  }
+
+  if ((n >> 24) <= 127) {
+    return log2_tab[n >> 24] + 24;
+  }
+
+  return 31;
+}
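
A worked example of the table-driven floor(log2): for n = 300, the first test fails (300 > 127), the second succeeds since 300 >> 8 = 1, giving log2_tab[1] + 8 = 0 + 8 = 8; indeed 2^8 = 256 <= 300 < 512 = 2^9. The -1 entry for n = 0 is never consulted from the hashing path, because lj_str_hash_128_above below only calls this with chunk_sz >= 8.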
+
+#define POW2_MASK(n) ((1L << (n)) - 1)
+
+/* This function populates `random_pos` such that random_pos[i][*]
+ * contains a random value in the range [0, 2**(i+1)).
+ */
+static void x64_init_random(void)
+{
+  int i, seed, rml;
+
+  /* Calculate ceil(log2(RAND_MAX)) */
+  rml = log2_floor(RAND_MAX);
+  if (RAND_MAX & (RAND_MAX - 1)) {
+    rml += 1;
+  }
+
+  /* Init seed */
+  seed = _mm_crc32_u32(0, getpid());
+  seed = _mm_crc32_u32(seed, time(NULL));
+  srandom(seed);
+
+  /* Now start to populate the random_pos[][]. */
+  for (i = 0; i < 3; i++) {
+    /* No need to provide random values for chunks smaller than 8 bytes */
+    random_pos[i][0] = random_pos[i][1] = 0;
+  }
+
+  for (; i < rml; i++) {
+    random_pos[i][0] = random() & POW2_MASK(i+1);
+    random_pos[i][1] = random() & POW2_MASK(i+1);
+  }
+
+  for (; i < 31; i++) {
+    int j;
+    for (j = 0; j < 2; j++) {
+      uint32_t v, scale;
+      scale = random_pos[i - rml][0];
+      if (scale == 0) {
+        scale = 1;
+      }
+      v = (random() * scale) & POW2_MASK(i+1);
+      random_pos[i][j] = v;
+    }
+  }
+}
+#undef POW2_MASK
+
+void __attribute__((constructor)) x64_init_random_constructor()
+{
+  x64_init_random();
+}
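
Since x64_init_random is registered as a GCC constructor, random_pos is populated before main() runs, so each process samples long strings at different offsets; precomputing a set of colliding strings offline is useless without the pid/time-derived seed. One way to observe the per-process randomization (a hypothetical test file placed in src/x64/test, reusing the include paths from the test Makefile; run it twice and the printed hashes should differ):

/* seed_demo.c -- build from src/x64/test, assuming the header resolves
 * its relative include of lj_def.h from there:
 *   gcc -O2 -msse4.2 -I../src -I../../../src seed_demo.c
 */
#include <stdio.h>
#include "../src/lj_str_hash_x64.h"

int main(void)
{
  char buf[256];
  int i;
  for (i = 0; i < 256; i++)
    buf[i] = (char)('a' + (i % 26));   /* 256 bytes -> the >= 128 path */
  printf("%u\n", lj_str_hash(buf, sizeof(buf)));
  return 0;
}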
+
+/* Return a pre-computed random number in the range [0, 2**(chunk_sz_order+1)).
+ * It is "unsafe" in the sense that the return value may be greater than the
+ * chunk size; it is up to the caller to make sure
+ * "chunk-base + return-value-of-this-func" is a valid virtual address.
+ */
+static LJ_AINLINE uint32_t get_random_pos_unsafe(uint32_t chunk_sz_order,
+                                                 uint32_t idx)
+{
+  uint32_t pos = random_pos[chunk_sz_order][idx & 1];
+  return pos;
+}
+
+static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str,
+                                                  uint32_t len)
+{
+  uint32_t chunk_num, chunk_sz, chunk_sz_log2, i, pos1, pos2;
+  uint64_t h1, h2, v;
+  const char* chunk_ptr;
+
+  chunk_num = 16;
+  chunk_sz = len / chunk_num;
+  chunk_sz_log2 = log2_floor(chunk_sz);
+
+  pos1 = get_random_pos_unsafe(chunk_sz_log2, 0);
+  pos2 = get_random_pos_unsafe(chunk_sz_log2, 1);
+
+  h1 = _mm_crc32_u32(0, len);
+  h2 = 0;
+
+  /* loop over 14 chunks, 2 chunks at a time */
+  for (i = 0, chunk_ptr = str; i < (chunk_num / 2 - 1);
+       chunk_ptr += chunk_sz, i++) {
+
+    v = *cast_uint64p(chunk_ptr + pos1);
+    h1 = _mm_crc32_u64(h1, v);
+
+    v = *cast_uint64p(chunk_ptr + chunk_sz + pos2);
+    h2 = _mm_crc32_u64(h2, v);
+  }
+
+  /* the last two chunks */
+  v = *cast_uint64p(chunk_ptr + pos1);
+  h1 = _mm_crc32_u64(h1, v);
+
+  v = *cast_uint64p(chunk_ptr + chunk_sz - 8 - pos2);
+  h2 = _mm_crc32_u64(h2, v);
+
+  /* process the trailing part */
+  h1 = _mm_crc32_u64(h1, *cast_uint64p(str));
+  h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));
+
+  h1 = _mm_crc32_u32(h1, h2);
+  return h1;
+}
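
To make the sampling concrete: for len = 256, chunk_sz = 256/16 = 16 and chunk_sz_log2 = 4, so pos1 and pos2 come from random_pos[4][*]; the function then CRCs one 8-byte word per chunk at those fixed random offsets, plus the first and last 8 bytes of the whole string. That is the "random 128 bytes" of the commit message: 16 chunks times 8 sampled bytes, at positions an attacker cannot predict.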
+
+/* NOTE: "len" must not be zero */
+static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len)
+{
+  if (len < 128) {
+    if (len >= 16) { /* [16, 128) */
+      return lj_str_hash_16_128(str, len);
+    }
+
+    if (len >= 4) { /* [4, 16) */
+      return lj_str_hash_4_16(str, len);
+    }
+
+    /* [1, 4) */
+    return lj_str_hash_1_4(str, len);
+  }
+  /* [128, inf) */
+  return lj_str_hash_128_above(str, len);
+}
+
+#define LJ_ARCH_STR_HASH lj_str_hash
+#else
+#undef LJ_ARCH_STR_HASH
+#endif
+#endif /*_LJ_STR_HASH_X64_H_*/
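
Note how the fallback is wired: if the compiler does not advertise SSE 4.2 (no -msse4.2, a non-x86_64 target, or a non-GCC compiler), the guard fails, LJ_ARCH_STR_HASH ends up undefined, and lj_str.c compiles the original lookup3-based lj_str_original_hash instead. Per the commit message, passing -msse4.2 in the C flags when building LuaJIT (for example via the build's CFLAGS/XCFLAGS knobs; the exact variable depends on the build setup) is what enables the randomized CRC32 path.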

src/x64/test/Makefile: +47 -0
@@ -0,0 +1,47 @@
+.PHONY: default test benchmark clean
+
+default: test benchmark
+
+COMMON_OBJ := test_util.o
+
+TEST_PROGRAM := ht_test
+BENCHMARK_PROGRAM := ht_benchmark
+
+TEST_PROGRAM_OBJ := $(COMMON_OBJ) test.o
+BENCHMARK_PROGRAM_OBJ := $(COMMON_OBJ) benchmark.o
+
+ifeq ($(WITH_VALGRIND), 1)
+VALGRIND := valgrind --leak-check=full
+else
+VALGRIND :=
+endif
+
+CXXFLAGS := -O3 -MD -g -msse4.2 -Wall -I../src -I../../../src
+
+%.o: %.cxx
+	$(CXX) $(CXXFLAGS) -MD -c $<
+
+test: $(TEST_PROGRAM)
+	@echo "some unit test"
+	$(VALGRIND) ./$(TEST_PROGRAM)
+
+	@echo "smoke test"
+	../../luajit test_str_comp.lua
+
+benchmark: $(BENCHMARK_PROGRAM)
+	# micro benchmark
+	./$(BENCHMARK_PROGRAM)
+
+$(TEST_PROGRAM) : $(TEST_PROGRAM_OBJ)
+	cat $(TEST_PROGRAM_OBJ:.o=.d) > dep1.txt
+	$(CXX) $+ $(CXXFLAGS) -lm -o $@
+
+$(BENCHMARK_PROGRAM): $(BENCHMARK_PROGRAM_OBJ)
+	cat $(BENCHMARK_PROGRAM_OBJ:.o=.d) > dep2.txt
+	$(CXX) $+ $(CXXFLAGS) -o $@
+
+-include dep1.txt
+-include dep2.txt
+
+clean:
+	-rm -f *.o *.d dep*.txt $(BENCHMARK_PROGRAM) $(TEST_PROGRAM)
