-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlib.rs
339 lines (309 loc) · 10.5 KB
/
lib.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#![feature(test, str_internals, core_intrinsics)]
extern crate test;
use core::mem;
// strings pulled from rust-lang.org and localizations. Both should be 700b,
// although it won't matter a ton if this varies a bit.
const EN_MEDIUM: &str = "\
A language empowering everyone to build reliable and efficient software. \
Rust is blazingly fast and memory-efficient: with no runtime or garbage \
collector, it can power performance-critical services, run on embedded \
devices, and easily integrate with other languages. Rust’s rich type system \
and ownership model guarantee memory-safety and thread-safety — enabling you \
to eliminate many classes of bugs at compile-time. Rust has great \
documentation, a friendly compiler with useful error messages, and top-notch \
tooling — an integrated package manager and build tool, smart multi-editor \
support with auto-completion and type inspections, an auto-formatter, and \
more. Performance, reliability
";
const ZH_MEDIUM: &str ="\
讓每個人都能打造出可靠又高效軟體的程式語言Rust不僅速度驚人,而且節省記憶\
體。由於不需要執行時函式庫或垃圾回收機制,Rust可以加速高效能需求的服務、\
執行在嵌入式裝置,並且輕鬆地與其他語言整合。Rust 豐富的型別系統與所有權模型\
確保了記憶體以及執行緒的安全,讓您在編譯時期就能夠解決各式各樣的錯誤。\
Rust 擁有完整的技術文件、友善的編譯器與清晰的錯誤訊息,還整合了一流的工具 —\
包含套件管理工具、建構工具、支援多種編輯器的自動補齊、型別檢測、自動格式化\
程式碼,以及更多等等。讓每個人都能\
";
// Both 30b
const EN_SMALL: &str = "A language empowering everyone";
const ZH_SMALL: &str = "讓每個人都能打造出。";
// libcore's old impl. LLVM autovectorizes this, although the result has a ton
// of code bloat
fn char_count_old(s: &str) -> usize {
#[inline]
fn utf8_is_cont_byte(byte: u8) -> bool {
const CONT_MASK: u8 = 0b0011_1111;
const TAG_CONT_U8: u8 = 0b1000_0000;
(byte & !CONT_MASK) == TAG_CONT_U8
}
let bytes_len = s.len();
let mut cont_bytes = 0;
for &byte in s.as_bytes() {
//(b as i8) >= -0x40
cont_bytes += utf8_is_cont_byte(byte) as usize;
}
bytes_len - cont_bytes
}
fn iter_ignore(s: &str) -> usize {
let mut c = 0;
for _ in s.chars() {
c += 1;
}
c
}
fn manual_utf8_char_width(s: &str) -> usize {
let s = s.as_bytes();
let mut c = 0;
let mut i = 0;
let l = s.len();
while i < l {
let b = s[i];
if b < 0x80 {
i += 1;
} else if b < 0xe0 {
i += 2;
} else if b < 0xf0 {
i += 3;
} else {
i += 4;
}
c += 1;
}
c
}
fn core_utf8_char_width_lut(s: &str) -> usize {
let s = s.as_bytes();
let mut c = 0;
let mut i = 0;
let l = s.len();
while i < l {
// uses a lookup table
let step = core::str::utf8_char_width(s[i]);
debug_assert_ne!(step, 0);
i += step;
c += 1;
}
c
}
fn core_utf8_char_width_lut2(s: &str) -> usize {
let s = s.as_bytes();
let mut c = 0;
let mut i = 0;
let l = s.len();
while i < l {
let b = s[i];
if b < 192 { i += 1; c += 1; continue; }
// uses a lookup table
i += core::str::utf8_char_width(b);
c += 1;
}
c
}
fn core_utf8_char_width_lut3(s: &str) -> usize {
let s = s.as_bytes();
let mut c = 0;
let mut i = 0;
let l = s.len();
'outer:
while i < l {
let mut b: u8;
while { b = *unsafe { s.get_unchecked(i) }; b < 192 } {
c += (b < 0x80) as usize;
i += 1;
if i >= l {
break 'outer;
}
}
// uses a lookup table
i += core::str::utf8_char_width(b);
c += 1;
}
c
}
fn char_count_swar_usize(s: &str) -> usize {
fn count_noncontinuation_bytes(s: &[u8]) -> usize {
let mut c = 0;
for &byte in s {
c += (byte as i8 >= -0x40) as usize;
}
c
}
// so that I can quickly chek perf diff vs
type Word = usize;
#[inline]
fn is_noncontinuation_byte_swar(w: Word) -> Word {
const LSB: Word = 0x0101_0101_0101_0101u64 as Word;
// We want a 1 in the LSB of each byte matching `0b10_??_??_??`,
((!w >> 7) | (w >> 6)) & LSB
}
#[inline]
fn hsum_bytes_in_word(values: Word) -> usize {
const LSB_SHORTS: Word = 0x0001_0001_0001_0001_u64 as Word;
const SKIP_BYTES: Word = 0x00ff_00ff_00ff_00ff_u64 as Word;
let pair_sum = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES);
(pair_sum.wrapping_mul(LSB_SHORTS) >> ((mem::size_of::<Word>() - 2) * 8)) as usize
}
// Experimentally determined to be the sweet spot. (If you change this you
// still need to manually change the inner loop -- this is mostly here for
// clarity).
const UNROLL: usize = 4;
// CHUNK_SIZE needs to be:
// - Less than or equal to 255 (otherwise we'll overflow bytes in `leads`).
// - A multiple of UNROLL
// - Relatively cheap to % against (although 192 seems be better on all
// benches than 128)
// - Large enough to reduce the cost of the hsum, which is not the cheapest
// thing here.
const CHUNK_SIZE: usize = 192;
let (head, body, tail) = unsafe { s.as_bytes().align_to::<Word>() };
let mut total = count_noncontinuation_bytes(head) + count_noncontinuation_bytes(tail);
for chunk in body.chunks(CHUNK_SIZE) {
let mut counts = 0;
for words in chunk.chunks_exact(UNROLL) {
counts += is_noncontinuation_byte_swar(words[0]);
counts += is_noncontinuation_byte_swar(words[1]);
counts += is_noncontinuation_byte_swar(words[2]);
counts += is_noncontinuation_byte_swar(words[3]);
}
total += hsum_bytes_in_word(counts);
if (chunk.len() % UNROLL) != 0 {
let mut counts = 0;
let end = &chunk[(chunk.len() - (chunk.len() % UNROLL))..];
for &word in end {
counts += is_noncontinuation_byte_swar(word);
}
total += hsum_bytes_in_word(counts);
// The break helps LLVM out, but is only correct if:
const _: [(); 0] = [(); CHUNK_SIZE % UNROLL];
break;
}
}
total
}
fn char_count_sse2(s: &str) -> usize {
#[cfg(all(
target_feature = "sse2",
any(target_arch = "x86", target_arch = "x86_64")
))]
{
unsafe { char_count_sse2_impl(s) }
}
#[cfg(not(target_feature = "sse2"))]
{
char_count_swar_usize(s)
}
}
#[cfg(all(
target_feature = "sse2",
any(target_arch = "x86", target_arch = "x86_64")
))]
unsafe fn char_count_sse2_impl(s: &str) -> usize {
fn count_noncontinuation_bytes(s: &[u8]) -> usize {
let mut c = 0;
for &byte in s {
c += (byte as i8 >= -0x40) as usize;
}
c
}
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
// const VEC_SIZE: usize = mem::size_of::<__m128i>();
const UNROLL: usize = 4;
const CHUNK_SIZE: usize = 192;
let (head, body, tail) = s.as_bytes().align_to::<__m128i>();
let mut total = count_noncontinuation_bytes(head) + count_noncontinuation_bytes(tail);
let cont_test = _mm_set1_epi8(-0x40);
for chunk in body.chunks(CHUNK_SIZE) {
let mut counts = _mm_setzero_si128();
for vecs in chunk.chunks_exact(UNROLL) {
counts = _mm_sub_epi8(counts, _mm_cmplt_epi8(cont_test, vecs[0]));
counts = _mm_sub_epi8(counts, _mm_cmplt_epi8(cont_test, vecs[1]));
counts = _mm_sub_epi8(counts, _mm_cmplt_epi8(cont_test, vecs[2]));
counts = _mm_sub_epi8(counts, _mm_cmplt_epi8(cont_test, vecs[3]));
}
let sums = _mm_sad_epu8(counts, _mm_setzero_si128());
total += (_mm_extract_epi32(sums, 0) + _mm_extract_epi32(sums, 2)) as usize;
if (chunk.len() % UNROLL) != 0 {
let mut counts = _mm_setzero_si128();
let end = &chunk[(chunk.len() - (chunk.len() % UNROLL))..];
for &vec in end {
counts = _mm_sub_epi8(counts, _mm_cmplt_epi8(cont_test, vec));
}
let sums = _mm_sad_epu8(counts, _mm_setzero_si128());
total += (_mm_extract_epi32(sums, 0) + _mm_extract_epi32(sums, 2)) as usize;
// The break helps LLVM out, but is only correct if:
const _: [(); 0] = [(); CHUNK_SIZE % UNROLL];
break;
}
}
total
}
macro_rules! bench_matrix {
(
@funcs: $funcs:tt $(,)?
@inputs: {$($input_mod_name:ident : $exp:expr),+ $(,)?} $(,)?
) => {
$(
bench_matrix!{
@inner
@funcs: $funcs
@inputs: { $input_mod_name : $exp }
}
)*
};
(
@inner
@funcs: [$($func:ident),+ $(,)?] $(,)?
@inputs: {$input_mod_name:ident : $exp:expr}
) => {
mod $input_mod_name {
use super::*;
// Note: needs to be outside inner $(...)+
fn bench_input() -> String {
$exp.into()
}
$(
#[bench]
fn $func(bencher: &mut test::Bencher) {
let mut input: String = bench_input();
bencher.bytes = input.len() as u64;
bencher.iter(|| {
let slice = test::black_box(&mut input);
test::black_box(super::$func(slice))
});
}
)+
}
};
}
bench_matrix! {
@funcs: [
char_count_old,
iter_ignore,
manual_utf8_char_width,
core_utf8_char_width_lut,
core_utf8_char_width_lut2,
core_utf8_char_width_lut3,
char_count_swar_usize,
char_count_sse2,
],
@inputs: {
en_30b: EN_SMALL,
zh_30b: ZH_SMALL,
en_600b: EN_MEDIUM,
zh_600b: ZH_MEDIUM,
en_5kb: EN_MEDIUM.repeat(8),
zh_5kb: ZH_MEDIUM.repeat(8),
en_300kb: EN_MEDIUM.repeat(512),
zh_300kb: ZH_MEDIUM.repeat(512),
mixed_6kb: format!(
"{}{}{}{}",
EN_SMALL,
ZH_SMALL,
EN_MEDIUM,
ZH_MEDIUM,
).repeat(4),
},
}