
Commit 5de9a1e

feature(float_mul_add_fast): expose llvm.fmuladd.* semantics
Add the intrinsics `fmuladd{f32,f64}` and the methods `{f32,f64}::mul_add_fast`. These compute `(self * a) + b`, which is fused if the code generator determines that (a) the target instruction set supports a fused operation, and (b) the fused operation is more efficient than the equivalent, separate pair of `mul` and `add` instructions. See https://llvm.org/docs/LangRef.html#llvm-fmuladd-intrinsic.

The codegen_cranelift backend uses the `fma` function from libc, which is a correct implementation but lacks the desired performance semantics. I think this requires an update to Cranelift to expose a suitable instruction in its IR. I have not tested codegen_gcc, but it should behave the same way (it also uses `fma` from libc).
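As an illustration of the intended surface, here is a small sketch (assuming a nightly toolchain that includes the `float_mul_add_fast` feature added by this commit) contrasting the new method with the existing `mul_add`:

```rust
// Sketch only: assumes a nightly toolchain with the `float_mul_add_fast`
// feature from this commit.
#![feature(float_mul_add_fast)]

fn main() {
    let (m, x, b) = (10.0_f32, 4.0_f32, 60.0_f32);

    // `mul_add` always behaves as a single fused operation (one rounding),
    // even when the target has no FMA instruction and a libm call is emitted.
    let fused = m.mul_add(x, b);

    // `mul_add_fast` computes (m * x) + b and lets the code generator fuse it
    // only when fusion is supported and actually faster; the result may be
    // rounded once or twice depending on the target.
    let maybe_fused = m.mul_add_fast(x, b);

    assert!((fused - 100.0).abs() <= f32::EPSILON);
    assert!((maybe_fused - 100.0).abs() <= f32::EPSILON);
}
```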
1 parent: 0f40f14

14 files changed: +145, -1 lines

compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs (+3, -1)

@@ -328,6 +328,8 @@ fn codegen_float_intrinsic_call<'tcx>(
         sym::fabsf64 => ("fabs", 1, fx.tcx.types.f64, types::F64),
         sym::fmaf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32),
         sym::fmaf64 => ("fma", 3, fx.tcx.types.f64, types::F64),
+        sym::fmuladdf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32), // NOTE: pessimal without FMA target feature
+        sym::fmuladdf64 => ("fma", 3, fx.tcx.types.f64, types::F64), // NOTE: pessimal without FMA target feature
         sym::copysignf32 => ("copysignf", 2, fx.tcx.types.f32, types::F32),
         sym::copysignf64 => ("copysign", 2, fx.tcx.types.f64, types::F64),
         sym::floorf32 => ("floorf", 1, fx.tcx.types.f32, types::F32),
@@ -381,7 +383,7 @@ fn codegen_float_intrinsic_call<'tcx>(
 
     let layout = fx.layout_of(ty);
     let res = match intrinsic {
-        sym::fmaf32 | sym::fmaf64 => {
+        sym::fmaf32 | sym::fmaf64 | sym::fmuladdf32 | sym::fmuladdf64 => {
             CValue::by_val(fx.bcx.ins().fma(args[0], args[1], args[2]), layout)
         }
         sym::copysignf32 | sym::copysignf64 => {

compiler/rustc_codegen_gcc/src/intrinsic/mod.rs (+2)

@@ -66,6 +66,8 @@ fn get_simple_intrinsic<'gcc, 'tcx>(
         sym::log2f64 => "log2",
         sym::fmaf32 => "fmaf",
         sym::fmaf64 => "fma",
+        sym::fmuladdf32 => "fmaf", // NOTE: pessimal without FMA target feature
+        sym::fmuladdf64 => "fma", // NOTE: pessimal without FMA target feature
         sym::fabsf32 => "fabsf",
         sym::fabsf64 => "fabs",
         sym::minnumf32 => "fminf",

compiler/rustc_codegen_llvm/src/context.rs (+5)

@@ -783,6 +783,11 @@ impl<'ll> CodegenCx<'ll, '_> {
         ifn!("llvm.fma.f64", fn(t_f64, t_f64, t_f64) -> t_f64);
         ifn!("llvm.fma.f128", fn(t_f128, t_f128, t_f128) -> t_f128);
 
+        ifn!("llvm.fmuladd.f16", fn(t_f16, t_f16, t_f16) -> t_f16);
+        ifn!("llvm.fmuladd.f32", fn(t_f32, t_f32, t_f32) -> t_f32);
+        ifn!("llvm.fmuladd.f64", fn(t_f64, t_f64, t_f64) -> t_f64);
+        ifn!("llvm.fmuladd.f128", fn(t_f128, t_f128, t_f128) -> t_f128);
+
         ifn!("llvm.fabs.f16", fn(t_f16) -> t_f16);
         ifn!("llvm.fabs.f32", fn(t_f32) -> t_f32);
         ifn!("llvm.fabs.f64", fn(t_f64) -> t_f64);

compiler/rustc_codegen_llvm/src/intrinsic.rs (+5)

@@ -83,6 +83,11 @@ fn get_simple_intrinsic<'ll>(
         sym::fmaf64 => "llvm.fma.f64",
         sym::fmaf128 => "llvm.fma.f128",
 
+        sym::fmuladdf16 => "llvm.fmuladd.f16",
+        sym::fmuladdf32 => "llvm.fmuladd.f32",
+        sym::fmuladdf64 => "llvm.fmuladd.f64",
+        sym::fmuladdf128 => "llvm.fmuladd.f128",
+
         sym::fabsf16 => "llvm.fabs.f16",
         sym::fabsf32 => "llvm.fabs.f32",
         sym::fabsf64 => "llvm.fabs.f64",

compiler/rustc_hir_analysis/src/check/intrinsic.rs (+13)

@@ -350,6 +350,19 @@ pub fn check_intrinsic_type(
             (0, 0, vec![tcx.types.f128, tcx.types.f128, tcx.types.f128], tcx.types.f128)
         }
 
+        sym::fmuladdf16 => {
+            (0, 0, vec![tcx.types.f16, tcx.types.f16, tcx.types.f16], tcx.types.f16)
+        }
+        sym::fmuladdf32 => {
+            (0, 0, vec![tcx.types.f32, tcx.types.f32, tcx.types.f32], tcx.types.f32)
+        }
+        sym::fmuladdf64 => {
+            (0, 0, vec![tcx.types.f64, tcx.types.f64, tcx.types.f64], tcx.types.f64)
+        }
+        sym::fmuladdf128 => {
+            (0, 0, vec![tcx.types.f128, tcx.types.f128, tcx.types.f128], tcx.types.f128)
+        }
+
         sym::fabsf16 => (0, 0, vec![tcx.types.f16], tcx.types.f16),
         sym::fabsf32 => (0, 0, vec![tcx.types.f32], tcx.types.f32),
         sym::fabsf64 => (0, 0, vec![tcx.types.f64], tcx.types.f64),

compiler/rustc_span/src/symbol.rs (+4)

@@ -865,6 +865,10 @@ symbols! {
         fmt,
         fmul_algebraic,
         fmul_fast,
+        fmuladdf128,
+        fmuladdf16,
+        fmuladdf32,
+        fmuladdf64,
         fn_align,
         fn_delegation,
         fn_must_use,

library/core/src/intrinsics.rs (+19)

@@ -1724,6 +1724,25 @@ extern "rust-intrinsic" {
     #[rustc_nounwind]
     pub fn fmaf64(a: f64, b: f64, c: f64) -> f64;
 
+    /// Returns `a * b + c` for `f32` values.
+    ///
+    /// The operation is fused if the code generator determines that target
+    /// instruction set has support for a fused operation, and that the fused
+    /// operation is more efficient than the equivalent, separate pair of mul
+    /// and add instructions.
+    #[rustc_nounwind]
+    #[cfg(not(bootstrap))]
+    pub fn fmuladdf32(a: f32, b: f32, c: f32) -> f32;
+    /// Returns `a * b + c` for `f64` values.
+    ///
+    /// The operation is fused if the code generator determines that target
+    /// instruction set has support for a fused operation, and that the fused
+    /// operation is more efficient than the equivalent, separate pair of mul
+    /// and add instructions.
+    #[rustc_nounwind]
+    #[cfg(not(bootstrap))]
+    pub fn fmuladdf64(a: f64, b: f64, c: f64) -> f64;
+
     /// Returns the absolute value of an `f32`.
     ///
     /// The stabilized version of this intrinsic is
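For illustration, the raw intrinsic can also be called directly. The sketch below assumes a nightly toolchain containing this commit; it uses the unstable (internal) `core_intrinsics` feature, whereas ordinary code should use the `mul_add_fast` methods added further down:

```rust
// Sketch only: assumes a nightly toolchain with this commit applied.
// `core_intrinsics` is an internal feature and not meant for general use.
#![feature(core_intrinsics)]
#![allow(internal_features)]

fn main() {
    // Computes (2.0 * 3.0) + 1.0; the backend may fuse it into one FMA
    // if that is supported and profitable on the target.
    let r = unsafe { std::intrinsics::fmuladdf64(2.0, 3.0, 1.0) };
    assert_eq!(r, 7.0); // exact in either rounding mode
}
```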

library/std/src/f32.rs (+28)

@@ -293,6 +293,34 @@ impl f32 {
         unsafe { intrinsics::fmaf32(self, a, b) }
     }
 
+    /// Possibly-fused multiply-add. Computes `(self * a) + b` that can be
+    /// fused if the code generator determines that (a) the target instruction
+    /// set has support for a fused operation, and (b) that the fused operation
+    /// is more efficient than the equivalent, separate pair of mul and add
+    /// instructions.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(float_mul_add_fast)]
+    /// let m = 10.0_f32;
+    /// let x = 4.0_f32;
+    /// let b = 60.0_f32;
+    ///
+    /// // 100.0
+    /// let abs_difference = (m.mul_add_fast(x, b) - ((m * x) + b)).abs();
+    ///
+    /// assert!(abs_difference <= f32::EPSILON);
+    /// ```
+    #[cfg(not(bootstrap))]
+    #[cfg_attr(not(bootstrap), rustc_allow_incoherent_impl)]
+    #[must_use = "method returns a new number and does not mutate the original value"]
+    #[inline]
+    #[unstable(feature = "float_mul_add_fast", issue = "none")]
+    pub fn mul_add_fast(self, a: f32, b: f32) -> f32 {
+        unsafe { intrinsics::fmuladdf32(self, a, b) }
+    }
+
     /// Calculates Euclidean division, the matching method for `rem_euclid`.
     ///
     /// This computes the integer `n` such that
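The intended use case is code where exact single rounding per element is not required but throughput matters. A sketch (again assuming the `float_mul_add_fast` feature from this commit) of a dot product written against the new method:

```rust
// Sketch only: assumes a nightly toolchain with `float_mul_add_fast`.
#![feature(float_mul_add_fast)]

/// Dot product accumulated with `mul_add_fast`: the backend may emit FMA
/// where it is supported and faster, or plain mul + add otherwise.
fn dot(xs: &[f32], ys: &[f32]) -> f32 {
    xs.iter()
        .zip(ys)
        .fold(0.0_f32, |acc, (&x, &y)| x.mul_add_fast(y, acc))
}

fn main() {
    let a = [1.0_f32, 2.0, 3.0];
    let b = [4.0_f32, 5.0, 6.0];
    assert_eq!(dot(&a, &b), 32.0); // 1*4 + 2*5 + 3*6, exact in f32
}
```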

library/std/src/f32/tests.rs (+16)

@@ -409,6 +409,22 @@ fn test_mul_add() {
     assert_eq!((-3.2f32).mul_add(2.4, neg_inf), neg_inf);
 }
 
+#[test]
+fn test_mul_add_fast() {
+    let nan: f32 = f32::NAN;
+    let inf: f32 = f32::INFINITY;
+    let neg_inf: f32 = f32::NEG_INFINITY;
+    assert_approx_eq!(12.3f32.mul_add_fast(4.5, 6.7), 62.05);
+    assert_approx_eq!((-12.3f32).mul_add_fast(-4.5, -6.7), 48.65);
+    assert_approx_eq!(0.0f32.mul_add_fast(8.9, 1.2), 1.2);
+    assert_approx_eq!(3.4f32.mul_add_fast(-0.0, 5.6), 5.6);
+    assert!(nan.mul_add_fast(7.8, 9.0).is_nan());
+    assert_eq!(inf.mul_add_fast(7.8, 9.0), inf);
+    assert_eq!(neg_inf.mul_add_fast(7.8, 9.0), neg_inf);
+    assert_eq!(8.9f32.mul_add_fast(inf, 3.2), inf);
+    assert_eq!((-3.2f32).mul_add_fast(2.4, neg_inf), neg_inf);
+}
+
 #[test]
 fn test_recip() {
     let nan: f32 = f32::NAN;

library/std/src/f64.rs (+28)

@@ -293,6 +293,34 @@ impl f64 {
         unsafe { intrinsics::fmaf64(self, a, b) }
     }
 
+    /// Possibly-fused multiply-add. Computes `(self * a) + b` that can be
+    /// fused if the code generator determines that (a) the target instruction
+    /// set has support for a fused operation, and (b) that the fused operation
+    /// is more efficient than the equivalent, separate pair of mul and add
+    /// instructions.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(float_mul_add_fast)]
+    /// let m = 10.0_f64;
+    /// let x = 4.0_f64;
+    /// let b = 60.0_f64;
+    ///
+    /// // 100.0
+    /// let abs_difference = (m.mul_add_fast(x, b) - ((m * x) + b)).abs();
+    ///
+    /// assert!(abs_difference < 1e-10);
+    /// ```
+    #[cfg(not(bootstrap))]
+    #[cfg_attr(not(bootstrap), rustc_allow_incoherent_impl)]
+    #[must_use = "method returns a new number and does not mutate the original value"]
+    #[inline]
+    #[unstable(feature = "float_mul_add_fast", issue = "none")]
+    pub fn mul_add_fast(self, a: f64, b: f64) -> f64 {
+        unsafe { intrinsics::fmuladdf64(self, a, b) }
+    }
+
     /// Calculates Euclidean division, the matching method for `rem_euclid`.
     ///
     /// This computes the integer `n` such that

library/std/src/f64/tests.rs (+16)

@@ -397,6 +397,22 @@ fn test_mul_add() {
     assert_eq!((-3.2f64).mul_add(2.4, neg_inf), neg_inf);
 }
 
+#[test]
+fn test_mul_add_fast() {
+    let nan: f64 = f64::NAN;
+    let inf: f64 = f64::INFINITY;
+    let neg_inf: f64 = f64::NEG_INFINITY;
+    assert_approx_eq!(12.3f64.mul_add_fast(4.5, 6.7), 62.05);
+    assert_approx_eq!((-12.3f64).mul_add_fast(-4.5, -6.7), 48.65);
+    assert_approx_eq!(0.0f64.mul_add_fast(8.9, 1.2), 1.2);
+    assert_approx_eq!(3.4f64.mul_add_fast(-0.0, 5.6), 5.6);
+    assert!(nan.mul_add_fast(7.8, 9.0).is_nan());
+    assert_eq!(inf.mul_add_fast(7.8, 9.0), inf);
+    assert_eq!(neg_inf.mul_add_fast(7.8, 9.0), neg_inf);
+    assert_eq!(8.9f64.mul_add_fast(inf, 3.2), inf);
+    assert_eq!((-3.2f64).mul_add_fast(2.4, neg_inf), neg_inf);
+}
+
 #[test]
 fn test_recip() {
     let nan: f64 = f64::NAN;

library/std/src/lib.rs (+1)

@@ -318,6 +318,7 @@
 //
 // Library features (core):
 // tidy-alphabetical-start
+#![cfg_attr(not(bootstrap), feature(float_mul_add_fast))]
 #![feature(c_str_module)]
 #![feature(char_internals)]
 #![feature(core_intrinsics)]

src/tools/miri/src/lib.rs (+1)

@@ -2,6 +2,7 @@
 #![feature(cell_update)]
 #![feature(const_option)]
 #![feature(float_gamma)]
+#![feature(float_mul_add_fast)]
 #![feature(map_try_insert)]
 #![feature(never_type)]
 #![feature(try_blocks)]

tests/ui/intrinsics/intrinsics-math.rs (+4)

@@ -1,5 +1,6 @@
 //@ run-pass
 //@ ignore-emscripten fma not implemented in emscripten
+#![feature(float_mul_add_fast)]
 
 macro_rules! assert_approx_eq {
     ($a:expr, $b:expr) => ({
@@ -46,6 +47,9 @@ pub fn main() {
     assert_approx_eq!(1.0f32.mul_add(2.0f32, 5.0f32), 7.0f32);
     assert_approx_eq!(0.0f64.mul_add(-2.0f64, f64::consts::E), f64::consts::E);
 
+    assert_approx_eq!(1.0f32.mul_add_fast(2.0f32, 5.0f32), 7.0f32);
+    assert_approx_eq!(0.0f64.mul_add_fast(-2.0f64, f64::consts::E), f64::consts::E);
+
     assert_approx_eq!((-1.0f32).abs(), 1.0f32);
     assert_approx_eq!(34.2f64.abs(), 34.2f64);
