Use f64::mul_add if the std feature is enabled

- May give better performance on some target architectures
ajtribick · Nov 24, 2024 · 3e96bbd · 3e96bbd
1 parent bde0398
commit 3e96bbd
Show file tree

Hide file tree

Showing 5 changed files with 36 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## Version 0.8.1
+
+* Re-enable use of `std::mul_add` if the `std` feature is enabled (except on
+  MinGW).
+
 ## Version 0.8
 
 * Always use libm functions.

diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "twofloat"
-version = "0.8.0"
+version = "0.8.1"
 authors = ["Andrew Tribick", "Individual contributors"]
 keywords = ["float", "precision", "numerics", "floating-point", "arithmetic"]
 categories = ["algorithms", "mathematics", "science"]

diff --git a/src/arithmetic.rs b/src/arithmetic.rs
@@ -6,6 +6,19 @@ use core::ops::{
 
 use crate::TwoFloat;
 
+// MinGW FMA seems to be inaccurate, use libm even if std is enabled.
+#[cfg(all(feature = "std", not(all(windows, target_env = "gnu"))))]
+#[inline(always)]
+fn fma(x: f64, y: f64, z: f64) -> f64 {
+    f64::mul_add(x, y, z)
+}
+
+#[cfg(not(all(feature = "std", not(all(windows, target_env = "gnu")))))]
+#[inline(always)]
+fn fma(x: f64, y: f64, z: f64) -> f64 {
+    libm::fma(x, y, z)
+}
+
 pub(crate) fn fast_two_sum(a: f64, b: f64) -> TwoFloat {
     // Joldes et al. (2017) Algorithm 1
     let s = a + b;
@@ -43,7 +56,7 @@ impl TwoFloat {
         let p = a * b;
         Self {
             hi: p,
-            lo: libm::fma(a, b, -p),
+            lo: fma(a, b, -p),
         }
     }
 
@@ -128,15 +141,15 @@ binary_ops! {
     /// (2017) Algorithm 9.
     fn Mul::mul<'a, 'b>(self: &'a TwoFloat, rhs: &'b f64) -> TwoFloat {
         let (ch, cl1) = TwoFloat::new_mul(self.hi, *rhs).into();
-        let cl3 = libm::fma(self.lo, *rhs, cl1);
+        let cl3 = fma(self.lo, *rhs, cl1);
         fast_two_sum(ch, cl3)
     }
 
     /// Implements multiplication of `TwoFloat` and `f64` using Joldes et al.
     /// (2017) Algorithm 9.
     fn Mul::mul<'a, 'b>(self: &'a f64, rhs: &'b TwoFloat) -> TwoFloat {
         let (ch, cl1) = TwoFloat::new_mul(rhs.hi, *self).into();
-        let cl3 = libm::fma(rhs.lo, *self, cl1);
+        let cl3 = fma(rhs.lo, *self, cl1);
         fast_two_sum(ch, cl3)
     }
 
@@ -145,8 +158,8 @@ binary_ops! {
     fn Mul::mul<'a, 'b>(self: &'a TwoFloat, rhs: &'b TwoFloat) -> TwoFloat {
         let (ch, cl1) = TwoFloat::new_mul(self.hi, rhs.hi).into();
         let tl0 = self.lo * rhs.lo;
-        let tl1 = libm::fma(self.hi, rhs.lo, tl0);
-        let cl2 = libm::fma(self.lo, rhs.hi, tl1);
+        let tl1 = fma(self.hi, rhs.lo, tl0);
+        let cl2 = fma(self.lo, rhs.hi, tl1);
         let cl3 = cl1 + cl2;
         fast_two_sum(ch, cl3)
     }
@@ -175,7 +188,7 @@ binary_ops! {
         let d = e * th;
         let m = d + th;
         let (ch, cl1) = TwoFloat::new_mul(m.hi, *self).into();
-        let cl3 = libm::fma(m.lo, *self, cl1);
+        let cl3 = fma(m.lo, *self, cl1);
         fast_two_sum(ch, cl3)
     }
 
@@ -253,7 +266,7 @@ assign_ops! {
     /// (2017) Algorithm 9.
     fn MulAssign::mul_assign<'a>(self: &mut TwoFloat, rhs: &'a f64) {
         let (ch, cl1) = TwoFloat::new_mul(self.hi, *rhs).into();
-        let cl3 = libm::fma(self.lo, *rhs, cl1);
+        let cl3 = fma(self.lo, *rhs, cl1);
         *self = fast_two_sum(ch, cl3);
     }
 
@@ -262,8 +275,8 @@ assign_ops! {
     fn MulAssign::mul_assign<'a>(self: &mut TwoFloat, rhs: &'a TwoFloat) {
         let (ch, cl1) = TwoFloat::new_mul(self.hi, rhs.hi).into();
         let tl0 = self.lo * rhs.lo;
-        let tl1 = libm::fma(self.hi, rhs.lo, tl0);
-        let cl2 = libm::fma(self.lo, rhs.hi, tl1);
+        let tl1 = fma(self.hi, rhs.lo, tl0);
+        let cl2 = fma(self.lo, rhs.hi, tl1);
         let cl3 = cl1 + cl2;
         *self = fast_two_sum(ch, cl3)
     }

diff --git a/src/lib.rs b/src/lib.rs
@@ -32,13 +32,19 @@ as preliminary.
 Operations on non-finite values are not supported. At the moment this is not
 automatically checked. The `is_valid()` method is provided for this purpose.
 
+If the `std` feature is enabled (as it is by default), the fused multiply-add
+operation from the standard library is used. This *may* be more performant if
+the target architecture has a dedicated instruction for this. See the
+documentation of [`f64::mul_add`] for details. Otherwise the libm
+implementation is used.
+
 If the `serde` feature is enabled, serialization and deserialization is
 possible through the Serde library.
 
 ## Known issues
 
 * The MinGW `fma` implementation appears to give incorrect results in some
-  cases, so the libm implementation is always used on this platform.
+  cases, so the libm function is always used on this platform.
 
 ## References
 

diff --git a/tests/common/mod.rs b/tests/common/mod.rs
@@ -7,7 +7,7 @@ use twofloat::{TwoFloat, TwoFloatError};
 // This runs substantially slower than the other CI targets, so reduce the
 // number of iterations
 #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
-const TEST_ITERS: usize = 1000;
+const TEST_ITERS: usize = 10000;
 
 #[cfg(not(all(target_arch = "aarch64", target_os = "linux")))]
 const TEST_ITERS: usize = 100000;