
Commit 5de9a1e

feature(float_mul_add_fast): expose llvm.fmuladd.* semantics
Add the intrinsics `fmuladd{f32,f64}` and the methods `{f32,f64}::mul_add_fast`. These compute `(self * a) + b`, which is fused if the code generator determines that (a) the target instruction set supports a fused operation, and (b) the fused operation is more efficient than the equivalent, separate pair of `mul` and `add` instructions. See https://llvm.org/docs/LangRef.html#llvm-fmuladd-intrinsic.

The codegen_cranelift backend uses the `fma` function from libc, which is a correct implementation but lacks the desired performance semantics. I think this requires an update to Cranelift to expose a suitable instruction in its IR. I have not tested codegen_gcc, but it should behave the same way (it also uses `fma` from libc).
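As an illustration of the intended surface, here is a small sketch (assuming a nightly toolchain that includes the `float_mul_add_fast` feature added by this commit) contrasting the new method with the existing `mul_add`:

```rust
// Sketch only: assumes a nightly toolchain with the `float_mul_add_fast`
// feature from this commit.
#![feature(float_mul_add_fast)]

fn main() {
    let (m, x, b) = (10.0_f32, 4.0_f32, 60.0_f32);

    // `mul_add` always behaves as a single fused operation (one rounding),
    // even when the target has no FMA instruction and a libm call is emitted.
    let fused = m.mul_add(x, b);

    // `mul_add_fast` computes (m * x) + b and lets the code generator fuse it
    // only when fusion is supported and actually faster; the result may be
    // rounded once or twice depending on the target.
    let maybe_fused = m.mul_add_fast(x, b);

    assert!((fused - 100.0).abs() <= f32::EPSILON);
    assert!((maybe_fused - 100.0).abs() <= f32::EPSILON);
}
```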
1 parent: 0f40f14

14 files changed: +145, -1 lines

compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs (+3, -1)

@@ -328,6 +328,8 @@ fn codegen_float_intrinsic_call<'tcx>(
         sym::fabsf64 => ("fabs", 1, fx.tcx.types.f64, types::F64),
         sym::fmaf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32),
         sym::fmaf64 => ("fma", 3, fx.tcx.types.f64, types::F64),
+        sym::fmuladdf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32), // NOTE: pessimal without FMA target feature
+        sym::fmuladdf64 => ("fma", 3, fx.tcx.types.f64, types::F64), // NOTE: pessimal without FMA target feature
         sym::copysignf32 => ("copysignf", 2, fx.tcx.types.f32, types::F32),
         sym::copysignf64 => ("copysign", 2, fx.tcx.types.f64, types::F64),
         sym::floorf32 => ("floorf", 1, fx.tcx.types.f32, types::F32),
@@ -381,7 +383,7 @@ fn codegen_float_intrinsic_call<'tcx>(
 
     let layout = fx.layout_of(ty);
     let res = match intrinsic {
-        sym::fmaf32 | sym::fmaf64 => {
+        sym::fmaf32 | sym::fmaf64 | sym::fmuladdf32 | sym::fmuladdf64 => {
             CValue::by_val(fx.bcx.ins().fma(args[0], args[1], args[2]), layout)
         }
         sym::copysignf32 | sym::copysignf64 => {

compiler/rustc_codegen_gcc/src/intrinsic/mod.rs (+2)

@@ -66,6 +66,8 @@ fn get_simple_intrinsic<'gcc, 'tcx>(
         sym::log2f64 => "log2",
         sym::fmaf32 => "fmaf",
         sym::fmaf64 => "fma",
+        sym::fmuladdf32 => "fmaf", // NOTE: pessimal without FMA target feature
+        sym::fmuladdf64 => "fma", // NOTE: pessimal without FMA target feature
         sym::fabsf32 => "fabsf",
         sym::fabsf64 => "fabs",
         sym::minnumf32 => "fminf",

compiler/rustc_codegen_llvm/src/context.rs (+5)

@@ -783,6 +783,11 @@ impl<'ll> CodegenCx<'ll, '_> {
         ifn!("llvm.fma.f64", fn(t_f64, t_f64, t_f64) -> t_f64);
         ifn!("llvm.fma.f128", fn(t_f128, t_f128, t_f128) -> t_f128);
 
+        ifn!("llvm.fmuladd.f16", fn(t_f16, t_f16, t_f16) -> t_f16);
+        ifn!("llvm.fmuladd.f32", fn(t_f32, t_f32, t_f32) -> t_f32);
+        ifn!("llvm.fmuladd.f64", fn(t_f64, t_f64, t_f64) -> t_f64);
+        ifn!("llvm.fmuladd.f128", fn(t_f128, t_f128, t_f128) -> t_f128);
+
         ifn!("llvm.fabs.f16", fn(t_f16) -> t_f16);
         ifn!("llvm.fabs.f32", fn(t_f32) -> t_f32);
         ifn!("llvm.fabs.f64", fn(t_f64) -> t_f64);

compiler/rustc_codegen_llvm/src/intrinsic.rs (+5)

@@ -83,6 +83,11 @@ fn get_simple_intrinsic<'ll>(
         sym::fmaf64 => "llvm.fma.f64",
         sym::fmaf128 => "llvm.fma.f128",
 
+        sym::fmuladdf16 => "llvm.fmuladd.f16",
+        sym::fmuladdf32 => "llvm.fmuladd.f32",
+        sym::fmuladdf64 => "llvm.fmuladd.f64",
+        sym::fmuladdf128 => "llvm.fmuladd.f128",
+
         sym::fabsf16 => "llvm.fabs.f16",
         sym::fabsf32 => "llvm.fabs.f32",
         sym::fabsf64 => "llvm.fabs.f64",

compiler/rustc_hir_analysis/src/check/intrinsic.rs (+13)

@@ -350,6 +350,19 @@ pub fn check_intrinsic_type(
             (0, 0, vec![tcx.types.f128, tcx.types.f128, tcx.types.f128], tcx.types.f128)
         }
 
+        sym::fmuladdf16 => {
+            (0, 0, vec![tcx.types.f16, tcx.types.f16, tcx.types.f16], tcx.types.f16)
+        }
+        sym::fmuladdf32 => {
+            (0, 0, vec![tcx.types.f32, tcx.types.f32, tcx.types.f32], tcx.types.f32)
+        }
+        sym::fmuladdf64 => {
+            (0, 0, vec![tcx.types.f64, tcx.types.f64, tcx.types.f64], tcx.types.f64)
+        }
+        sym::fmuladdf128 => {
+            (0, 0, vec![tcx.types.f128, tcx.types.f128, tcx.types.f128], tcx.types.f128)
+        }
+
         sym::fabsf16 => (0, 0, vec![tcx.types.f16], tcx.types.f16),
         sym::fabsf32 => (0, 0, vec![tcx.types.f32], tcx.types.f32),
         sym::fabsf64 => (0, 0, vec![tcx.types.f64], tcx.types.f64),

compiler/rustc_span/src/symbol.rs (+4)

@@ -865,6 +865,10 @@ symbols! {
         fmt,
         fmul_algebraic,
         fmul_fast,
+        fmuladdf128,
+        fmuladdf16,
+        fmuladdf32,
+        fmuladdf64,
         fn_align,
         fn_delegation,
         fn_must_use,

library/core/src/intrinsics.rs (+19)

@@ -1724,6 +1724,25 @@ extern "rust-intrinsic" {
     #[rustc_nounwind]
     pub fn fmaf64(a: f64, b: f64, c: f64) -> f64;
 
+    /// Returns `a * b + c` for `f32` values.
+    ///
+    /// The operation is fused if the code generator determines that target
+    /// instruction set has support for a fused operation, and that the fused
+    /// operation is more efficient than the equivalent, separate pair of mul
+    /// and add instructions.
+    #[rustc_nounwind]
+    #[cfg(not(bootstrap))]
+    pub fn fmuladdf32(a: f32, b: f32, c: f32) -> f32;
+    /// Returns `a * b + c` for `f64` values.
+    ///
+    /// The operation is fused if the code generator determines that target
+    /// instruction set has support for a fused operation, and that the fused
+    /// operation is more efficient than the equivalent, separate pair of mul
+    /// and add instructions.
+    #[rustc_nounwind]
+    #[cfg(not(bootstrap))]
+    pub fn fmuladdf64(a: f64, b: f64, c: f64) -> f64;
+
     /// Returns the absolute value of an `f32`.
     ///
     /// The stabilized version of this intrinsic is
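For illustration, the raw intrinsic can also be called directly. The sketch below assumes a nightly toolchain containing this commit; it uses the unstable (internal) `core_intrinsics` feature, whereas ordinary code should use the `mul_add_fast` methods added further down:

```rust
// Sketch only: assumes a nightly toolchain with this commit applied.
// `core_intrinsics` is an internal feature and not meant for general use.
#![feature(core_intrinsics)]
#![allow(internal_features)]

fn main() {
    // Computes (2.0 * 3.0) + 1.0; the backend may fuse it into one FMA
    // if that is supported and profitable on the target.
    let r = unsafe { std::intrinsics::fmuladdf64(2.0, 3.0, 1.0) };
    assert_eq!(r, 7.0); // exact in either rounding mode
}
```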

library/std/src/f32.rs (+28)

@@ -293,6 +293,34 @@ impl f32 {
         unsafe { intrinsics::fmaf32(self, a, b) }
     }
 
+    /// Possibly-fused multiply-add. Computes `(self * a) + b` that can be
+    /// fused if the code generator determines that (a) the target instruction
+    /// set has support for a fused operation, and (b) that the fused operation
+    /// is more efficient than the equivalent, separate pair of mul and add
+    /// instructions.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(float_mul_add_fast)]
+    /// let m = 10.0_f32;
+    /// let x = 4.0_f32;
+    /// let b = 60.0_f32;
+    ///
+    /// // 100.0
+    /// let abs_difference = (m.mul_add_fast(x, b) - ((m * x) + b)).abs();
+    ///
+    /// assert!(abs_difference <= f32::EPSILON);
+    /// ```
+    #[cfg(not(bootstrap))]
+    #[cfg_attr(not(bootstrap), rustc_allow_incoherent_impl)]
+    #[must_use = "method returns a new number and does not mutate the original value"]
+    #[inline]
+    #[unstable(feature = "float_mul_add_fast", issue = "none")]
+    pub fn mul_add_fast(self, a: f32, b: f32) -> f32 {
+        unsafe { intrinsics::fmuladdf32(self, a, b) }
+    }
+
     /// Calculates Euclidean division, the matching method for `rem_euclid`.
     ///
     /// This computes the integer `n` such that
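The intended use case is code where exact single rounding per element is not required but throughput matters. A sketch (again assuming the `float_mul_add_fast` feature from this commit) of a dot product written against the new method:

```rust
// Sketch only: assumes a nightly toolchain with `float_mul_add_fast`.
#![feature(float_mul_add_fast)]

/// Dot product accumulated with `mul_add_fast`: the backend may emit FMA
/// where it is supported and faster, or plain mul + add otherwise.
fn dot(xs: &[f32], ys: &[f32]) -> f32 {
    xs.iter()
        .zip(ys)
        .fold(0.0_f32, |acc, (&x, &y)| x.mul_add_fast(y, acc))
}

fn main() {
    let a = [1.0_f32, 2.0, 3.0];
    let b = [4.0_f32, 5.0, 6.0];
    assert_eq!(dot(&a, &b), 32.0); // 1*4 + 2*5 + 3*6, exact in f32
}
```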

library/std/src/f32/tests.rs (+16)

@@ -409,6 +409,22 @@ fn test_mul_add() {
     assert_eq!((-3.2f32).mul_add(2.4, neg_inf), neg_inf);
 }
 
+#[test]
+fn test_mul_add_fast() {
+    let nan: f32 = f32::NAN;
+    let inf: f32 = f32::INFINITY;
+    let neg_inf: f32 = f32::NEG_INFINITY;
+    assert_approx_eq!(12.3f32.mul_add_fast(4.5, 6.7), 62.05);
+    assert_approx_eq!((-12.3f32).mul_add_fast(-4.5, -6.7), 48.65);
+    assert_approx_eq!(0.0f32.mul_add_fast(8.9, 1.2), 1.2);
+    assert_approx_eq!(3.4f32.mul_add_fast(-0.0, 5.6), 5.6);
+    assert!(nan.mul_add_fast(7.8, 9.0).is_nan());
+    assert_eq!(inf.mul_add_fast(7.8, 9.0), inf);
+    assert_eq!(neg_inf.mul_add_fast(7.8, 9.0), neg_inf);
+    assert_eq!(8.9f32.mul_add_fast(inf, 3.2), inf);
+    assert_eq!((-3.2f32).mul_add_fast(2.4, neg_inf), neg_inf);
+}
+
 #[test]
 fn test_recip() {
     let nan: f32 = f32::NAN;

library/std/src/f64.rs (+28)

@@ -293,6 +293,34 @@ impl f64 {
         unsafe { intrinsics::fmaf64(self, a, b) }
     }
 
+    /// Possibly-fused multiply-add. Computes `(self * a) + b` that can be
+    /// fused if the code generator determines that (a) the target instruction
+    /// set has support for a fused operation, and (b) that the fused operation
+    /// is more efficient than the equivalent, separate pair of mul and add
+    /// instructions.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(float_mul_add_fast)]
+    /// let m = 10.0_f64;
+    /// let x = 4.0_f64;
+    /// let b = 60.0_f64;
+    ///
+    /// // 100.0
+    /// let abs_difference = (m.mul_add_fast(x, b) - ((m * x) + b)).abs();
+    ///
+    /// assert!(abs_difference < 1e-10);
+    /// ```
+    #[cfg(not(bootstrap))]
+    #[cfg_attr(not(bootstrap), rustc_allow_incoherent_impl)]
+    #[must_use = "method returns a new number and does not mutate the original value"]
+    #[inline]
+    #[unstable(feature = "float_mul_add_fast", issue = "none")]
+    pub fn mul_add_fast(self, a: f64, b: f64) -> f64 {
+        unsafe { intrinsics::fmuladdf64(self, a, b) }
+    }
+
     /// Calculates Euclidean division, the matching method for `rem_euclid`.
     ///
     /// This computes the integer `n` such that

library/std/src/f64/tests.rs (+16)

@@ -397,6 +397,22 @@ fn test_mul_add() {
     assert_eq!((-3.2f64).mul_add(2.4, neg_inf), neg_inf);
 }
 
+#[test]
+fn test_mul_add_fast() {
+    let nan: f64 = f64::NAN;
+    let inf: f64 = f64::INFINITY;
+    let neg_inf: f64 = f64::NEG_INFINITY;
+    assert_approx_eq!(12.3f64.mul_add_fast(4.5, 6.7), 62.05);
+    assert_approx_eq!((-12.3f64).mul_add_fast(-4.5, -6.7), 48.65);
+    assert_approx_eq!(0.0f64.mul_add_fast(8.9, 1.2), 1.2);
+    assert_approx_eq!(3.4f64.mul_add_fast(-0.0, 5.6), 5.6);
+    assert!(nan.mul_add_fast(7.8, 9.0).is_nan());
+    assert_eq!(inf.mul_add_fast(7.8, 9.0), inf);
+    assert_eq!(neg_inf.mul_add_fast(7.8, 9.0), neg_inf);
+    assert_eq!(8.9f64.mul_add_fast(inf, 3.2), inf);
+    assert_eq!((-3.2f64).mul_add_fast(2.4, neg_inf), neg_inf);
+}
+
 #[test]
 fn test_recip() {
     let nan: f64 = f64::NAN;

library/std/src/lib.rs (+1)

@@ -318,6 +318,7 @@
 //
 // Library features (core):
 // tidy-alphabetical-start
+#![cfg_attr(not(bootstrap), feature(float_mul_add_fast))]
 #![feature(c_str_module)]
 #![feature(char_internals)]
 #![feature(core_intrinsics)]

src/tools/miri/src/lib.rs (+1)

@@ -2,6 +2,7 @@
 #![feature(cell_update)]
 #![feature(const_option)]
 #![feature(float_gamma)]
+#![feature(float_mul_add_fast)]
 #![feature(map_try_insert)]
 #![feature(never_type)]
 #![feature(try_blocks)]

tests/ui/intrinsics/intrinsics-math.rs (+4)

@@ -1,5 +1,6 @@
 //@ run-pass
 //@ ignore-emscripten fma not implemented in emscripten
+#![feature(float_mul_add_fast)]
 
 macro_rules! assert_approx_eq {
     ($a:expr, $b:expr) => ({
@@ -46,6 +47,9 @@ pub fn main() {
     assert_approx_eq!(1.0f32.mul_add(2.0f32, 5.0f32), 7.0f32);
     assert_approx_eq!(0.0f64.mul_add(-2.0f64, f64::consts::E), f64::consts::E);
 
+    assert_approx_eq!(1.0f32.mul_add_fast(2.0f32, 5.0f32), 7.0f32);
+    assert_approx_eq!(0.0f64.mul_add_fast(-2.0f64, f64::consts::E), f64::consts::E);
+
     assert_approx_eq!((-1.0f32).abs(), 1.0f32);
     assert_approx_eq!(34.2f64.abs(), 34.2f64);
