Skip to content

Commit f61ae49

Browse files
committed
Tweaked how xmul is exposed, added p*::widening_mul
Multiplication, and carry-less multiplication, are inherently widening operations. Unfortunately, at the time of writing, the types in Rust don't capture this well, being built around fixed-width wrapping multiplication. Rust's stdlib can rely on compiler-level optimizations to clean up performance issues from unnecessarily-wide multiplications, but this becomes a bit of an issue for our library, especially for u64 types, since we rely on intrinsics, which may be hard for compilers to optimize around. This commit adds widening_mul, based on a proposal to add widening_mul to Rust's primitive types (rust-lang/rust#85532), as well as several other tweaks to how xmul is provided, moving more arch-level details into xmul, but still limiting when it is emitted.
1 parent 344170f commit f61ae49

File tree

12 files changed

+692
-279
lines changed

12 files changed

+692
-279
lines changed

Cargo.toml

Lines changed: 6 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -42,46 +42,15 @@ harness = false
4242
# tracking issue:
4343
# https://github.com/rust-lang/rust/issues/48556
4444
#
45-
use-nightly-features = []
45+
nightly = []
4646

47-
# Makes p* types use a naive carry-less multiplication implementation
48-
# using shifts and xors. Mainly useful for testing/benchmarking.
47+
# Disable carry-less multiplication instructions, forcing the use
48+
# of naive bitwise implementations
4949
#
50-
# By default hardware xmul is used if available, falling back to a naive
51-
# implementation.
50+
# This is mostly available for testing, and in the case that hardware
51+
# xmul is bugged (or more likely this crate is bugged).
5252
#
53-
use-naive-xmul = ["gf256-macros/use-naive-xmul"]
54-
55-
# Makes p* types require hardware-accelerated carry-less multiplication,
56-
# causing a compile error if carry-less multiplication instructions aren't
57-
# available in the current architecture.
58-
#
59-
# By default hardware xmul is used if available, falling back to a naive
60-
# implementation.
61-
#
62-
use-hardware-xmul = ["gf256-macros/use-hardware-xmul"]
63-
64-
# Make gf* types use a naive multiplication implementation using shifts
65-
# and xors. Mainly useful for testing/benchmarking.
66-
#
67-
# By default log/antilog tables are used.
68-
#
69-
use-naive-gfmul = ["gf256-macros/use-naive-gfmul"]
70-
71-
# Make gf* types use precompiled log/antilog tables.
72-
#
73-
# By default log/antilog tables are used.
74-
#
75-
use-table-gfmul = ["gf256-macros/use-table-gfmul"]
76-
77-
# Makes gf* types use (potentially hardware accelerated) polynomial
78-
# multiplication with Barret reduction. This is generally slower than using
79-
# log/antilog tables, but may be useful if constant-time operations are
80-
# required.
81-
#
82-
# By default log/antilog tables are used.
83-
#
84-
use-barret-gfmul = ["gf256-macros/use-barret-gfmul"]
53+
no-xmul = ["gf256-macros/no-xmul"]
8554

8655
[dev-dependencies]
8756
criterion = {version="0.3", features=["html_reports"]}

Makefile

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,21 @@ override ENV += RUSTFLAGS="-Ctarget-cpu=native"
33

44
.PHONY: all build
55
all build:
6-
$(ENV) cargo build
6+
$(ENV) cargo +nightly build --features nightly
77

88
.PHONY: test
99
test:
10-
$(ENV) cargo test --lib
11-
$(ENV) cargo test --example find-p
12-
$(ENV) cargo run --example crc
13-
$(ENV) cargo run --example shamir
14-
$(ENV) cargo run --example raid
15-
$(ENV) cargo run --example rs
10+
$(ENV) cargo +nightly test --features nightly --lib
11+
$(ENV) cargo +nightly test --features nightly --example find-p
12+
$(ENV) cargo +nightly run --features nightly --example find-p -- -w9 -n4 -m1
13+
$(ENV) cargo +nightly run --features nightly --example crc
14+
$(ENV) cargo +nightly run --features nightly --example shamir
15+
$(ENV) cargo +nightly run --features nightly --example raid
16+
$(ENV) cargo +nightly run --features nightly --example rs
1617

1718
.PHONY: bench
1819
bench:
19-
$(ENV) cargo +nightly bench --features use-nightly-features
20+
$(ENV) cargo +nightly bench --features nightly
2021

2122
.PHONY: clean
2223
clean:

benches/crc.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,20 @@ fn bench_crc(c: &mut Criterion) {
7878
|data| crc::word_barret_crc(data),
7979
BatchSize::SmallInput
8080
));
81+
82+
let mut xs = xorshift64(42).map(|x| x as u8);
83+
group.bench_function("reversed_barret_crc", |b| b.iter_batched_ref(
84+
|| (&mut xs).take(SIZE).collect::<Vec<u8>>(),
85+
|data| crc::reversed_barret_crc(data),
86+
BatchSize::SmallInput
87+
));
88+
89+
let mut xs = xorshift64(42).map(|x| x as u8);
90+
group.bench_function("word_reversed_barret_crc", |b| b.iter_batched_ref(
91+
|| (&mut xs).take(SIZE).collect::<Vec<u8>>(),
92+
|data| crc::word_reversed_barret_crc(data),
93+
BatchSize::SmallInput
94+
));
8195
}
8296

8397
criterion_group!(benches, bench_crc);

examples/crc.rs

Lines changed: 161 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -164,17 +164,32 @@ pub fn small_table_crc(data: &[u8]) -> u32 {
164164
/// compile-time.
165165
///
166166
pub fn barret_crc(data: &[u8]) -> u32 {
167-
const BARRET_CONSTANT: p64 = {
168-
p64(p128(0x10000000000000000)
169-
.naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64)
167+
// Normally this would be 0x10000000000000000 / __polynomial, but
168+
// we eagerly do one step of division so we avoid needing a 4x wide
169+
// type. We can also drop the highest bit if we add the high bits
170+
// manually when we use this constant.
171+
//
172+
// = x % p
173+
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
174+
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
175+
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
176+
// \-----+-----/
177+
// '-- Barret constant
178+
//
179+
// Note that the shifts and masks can go away if we operate on u32s,
180+
// leaving 2 xmuls and 2 xors.
181+
//
182+
const BARRET_CONSTANT: p32 = {
183+
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
170184
};
171185

172186
let mut crc = p32(0xffffffff);
173187

174188
for b in data {
175189
crc = crc ^ (p32::from(b.reverse_bits()) << 24);
176-
let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32;
177-
crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8);
190+
crc = (crc << 8)
191+
+ ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32))
192+
.wrapping_mul(p32::from_lossy(POLYNOMIAL));
178193
}
179194

180195
u32::from(crc).reverse_bits() ^ 0xffffffff
@@ -184,9 +199,23 @@ pub fn barret_crc(data: &[u8]) -> u32 {
184199
/// barret_crc, but operating on a 32-bit word at a time
185200
///
186201
pub fn word_barret_crc(data: &[u8]) -> u32 {
187-
const BARRET_CONSTANT: p64 = {
188-
p64(p128(0x10000000000000000)
189-
.naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64)
202+
// Normally this would be 0x10000000000000000 / __polynomial, but
203+
// we eagerly do one step of division so we avoid needing a 4x wide
204+
// type. We can also drop the highest bit if we add the high bits
205+
// manually when we use this constant.
206+
//
207+
// = x % p
208+
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
209+
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
210+
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
211+
// \-----+-----/
212+
// '-- Barret constant
213+
//
214+
// Note that the shifts and masks can go away if we operate on u32s,
215+
// leaving 2 xmuls and 2 xors.
216+
//
217+
const BARRET_CONSTANT: p32 = {
218+
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
190219
};
191220

192221
let mut crc = p32(0xffffffff);
@@ -196,19 +225,118 @@ pub fn word_barret_crc(data: &[u8]) -> u32 {
196225
for word in &mut words {
197226
let word = <[u8; 4]>::try_from(word).unwrap();
198227
crc = crc ^ p32::from_le_bytes(word).reverse_bits();
199-
let q = (p64::from(crc)*BARRET_CONSTANT) >> 32;
200-
crc = p32::from_lossy(q*POLYNOMIAL);
228+
crc = (crc.widening_mul(BARRET_CONSTANT).1 + crc)
229+
.wrapping_mul(p32::from_lossy(POLYNOMIAL));
201230
}
202231

203232
for b in words.remainder() {
204233
crc = crc ^ (p32::from(b.reverse_bits()) << 24);
205-
let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32;
206-
crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8);
234+
crc = (crc << 8)
235+
+ ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32))
236+
.wrapping_mul(p32::from_lossy(POLYNOMIAL));
207237
}
208238

209239
u32::from(crc).reverse_bits() ^ 0xffffffff
210240
}
211241

242+
/// A hardware-accelerated CRC implementation using Barret reduction without
243+
/// needing to bit-reverse the internal representation
244+
///
245+
/// CRC32 and polynomial multiplication instructions unfortunately are defined
246+
/// with different bit-endianness. This would normally mean we need to
247+
/// bit-reverse the incoming data before we can use polynomial multiplication.
248+
///
249+
/// However, polynomial multiplication has the odd property that it is
250+
/// symmetric, brev(a) * brev(b) = brev((a * b) << 1)
251+
///
252+
/// This means we can rewrite our Barret reduction CRC to operate entirely
253+
/// on a bit-reversed representation, shaving off several instructions.
254+
///
255+
/// In theory this should be faster, but measurements show this as actually
256+
/// being slightly slower, perhaps the extra 1-bit shift costs more on
257+
/// machines with bit-reverse instructions?
258+
///
259+
pub fn reversed_barret_crc(data: &[u8]) -> u32 {
260+
// Normally this would be 0x10000000000000000 / __polynomial, but
261+
// we eagerly do one step of division so we avoid needing a 4x wide
262+
// type. We can also drop the highest bit if we add the high bits
263+
// manually when we use this constant.
264+
//
265+
// = x % p
266+
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
267+
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
268+
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
269+
// \-----+-----/
270+
// '-- Barret constant
271+
//
272+
// Note that the shifts and masks can go away if we operate on u32s,
273+
// leaving 2 xmuls and 2 xors.
274+
//
275+
const BARRET_CONSTANT: p32 = {
276+
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
277+
};
278+
const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits();
279+
const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits();
280+
281+
let mut crc = p32(0xffffffff);
282+
283+
for b in data {
284+
crc = crc ^ p32::from(*b);
285+
let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV);
286+
let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV);
287+
crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32));
288+
}
289+
290+
u32::from(crc) ^ 0xffffffff
291+
}
292+
293+
/// A hardware-accelerated CRC implementation using the same technique as
294+
/// reversed_barret_crc, but operating on a 32-bit word at a time
295+
///
296+
pub fn word_reversed_barret_crc(data: &[u8]) -> u32 {
297+
// Normally this would be 0x10000000000000000 / __polynomial, but
298+
// we eagerly do one step of division so we avoid needing a 4x wide
299+
// type. We can also drop the highest bit if we add the high bits
300+
// manually when we use this constant.
301+
//
302+
// = x % p
303+
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
304+
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
305+
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
306+
// \-----+-----/
307+
// '-- Barret constant
308+
//
309+
// Note that the shifts and masks can go away if we operate on u32s,
310+
// leaving 2 xmuls and 2 xors.
311+
//
312+
const BARRET_CONSTANT: p32 = {
313+
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
314+
};
315+
const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits();
316+
const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits();
317+
318+
let mut crc = p32(0xffffffff);
319+
320+
// iterate over 4-byte words
321+
let mut words = data.chunks_exact(4);
322+
for word in &mut words {
323+
let word = <[u8; 4]>::try_from(word).unwrap();
324+
crc = crc ^ p32::from_le_bytes(word);
325+
let (lo, _) = crc.widening_mul(BARRET_CONSTANT_REV);
326+
let (lo, hi) = ((lo << 1u32) + crc).widening_mul(POLYNOMIAL_REV);
327+
crc = (hi << 1u32) | (lo >> 31u32);
328+
}
329+
330+
for b in words.remainder() {
331+
crc = crc ^ p32::from(*b);
332+
let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV);
333+
let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV);
334+
crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32));
335+
}
336+
337+
u32::from(crc) ^ 0xffffffff
338+
}
339+
212340

213341
fn main() {
214342
let input = b"Hello World!";
@@ -217,31 +345,39 @@ fn main() {
217345
println!("testing crc({:?})", String::from_utf8_lossy(input));
218346

219347
let output = naive_crc(input);
220-
println!("{:<19} => 0x{:08x}", "naive_crc", output);
348+
println!("{:<24} => 0x{:08x}", "naive_crc", output);
221349
assert_eq!(output, expected);
222350

223-
let output = naive_crc(input);
224-
println!("{:<19} => 0x{:08x}", "less_naive_crc", output);
351+
let output = less_naive_crc(input);
352+
println!("{:<24} => 0x{:08x}", "less_naive_crc", output);
225353
assert_eq!(output, expected);
226354

227-
let output = naive_crc(input);
228-
println!("{:<19} => 0x{:08x}", "word_less_naive_crc", output);
355+
let output = word_less_naive_crc(input);
356+
println!("{:<24} => 0x{:08x}", "word_less_naive_crc", output);
229357
assert_eq!(output, expected);
230358

231-
let output = naive_crc(input);
232-
println!("{:<19} => 0x{:08x}", "table_crc", output);
359+
let output = table_crc(input);
360+
println!("{:<24} => 0x{:08x}", "table_crc", output);
233361
assert_eq!(output, expected);
234362

235-
let output = naive_crc(input);
236-
println!("{:<19} => 0x{:08x}", "small_table_crc", output);
363+
let output = small_table_crc(input);
364+
println!("{:<24} => 0x{:08x}", "small_table_crc", output);
237365
assert_eq!(output, expected);
238366

239-
let output = naive_crc(input);
240-
println!("{:<19} => 0x{:08x}", "barret_crc", output);
367+
let output = barret_crc(input);
368+
println!("{:<24} => 0x{:08x}", "barret_crc", output);
241369
assert_eq!(output, expected);
242370

243-
let output = naive_crc(input);
244-
println!("{:<19} => 0x{:08x}", "word_barret_crc", output);
371+
let output = word_barret_crc(input);
372+
println!("{:<24} => 0x{:08x}", "word_barret_crc", output);
373+
assert_eq!(output, expected);
374+
375+
let output = reversed_barret_crc(input);
376+
println!("{:<24} => 0x{:08x}", "reversed_barret_crc", output);
377+
assert_eq!(output, expected);
378+
379+
let output = word_reversed_barret_crc(input);
380+
println!("{:<24} => 0x{:08x}", "word_reversed_barret_crc", output);
245381
assert_eq!(output, expected);
246382

247383
println!();

examples/find-p.rs

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -86,20 +86,10 @@ pub fn is_generator(g: p128, p: p128) -> bool {
8686
//
8787
let width = (128-p.leading_zeros()) - 1;
8888

89-
// We're going to do a lot of multiplications, so it helps to precalculate
90-
// Barret's constant for Barret reduction. This trades a modulus operation
91-
// for 2 multiplication, but means we can leverage carry-less multiplication
92-
// hardware instructions.
93-
//
94-
// normally this is just (1 << (2*width)) / p, but we can precompute
95-
// one step of division to avoid needing a 4x wide type
96-
//
97-
let mask = (1u128 << width) - 1;
98-
let barret_constant = (((mask & p) << width) / p) + (p128(1) << width);
89+
// Multiplication uses carry-less multiplication modulo our irreducible
90+
// polynomial
9991
let gfmul = |a: p128, b: p128| -> p128 {
100-
let x = a * b;
101-
let q = ((x >> width) * barret_constant) >> width;
102-
mask & ((q * p) + x)
92+
(a * b) % p
10393
};
10494

10595
// Exponentiation via squaring

gf256-macros/Cargo.toml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,7 @@ proc-macro = true
1010

1111
[features]
1212
# See gf256/Cargo.toml for documentation on these features
13-
use-naive-xmul = []
14-
use-hardware-xmul = []
15-
use-naive-gfmul = []
16-
use-table-gfmul = []
17-
use-barret-gfmul = []
13+
no-xmul = []
1814

1915
[dependencies]
2016
syn = {version="1.0.73", features=["full"]}

0 commit comments

Comments
 (0)