@@ -13,7 +13,7 @@ using namespace intx;
1313namespace
1414{
1515// / Adds y to x: x[] += y[]. The result is truncated to the size of x.
16- constexpr void add (std::span<uint64_t > x, std::span<const uint64_t > y) noexcept
16+ constexpr bool add (std::span<uint64_t > x, std::span<const uint64_t > y) noexcept
1717{
1818 assert (x.size () >= y.size ());
1919
@@ -22,6 +22,7 @@ constexpr void add(std::span<uint64_t> x, std::span<const uint64_t> y) noexcept
2222 std::tie (x[i], carry) = addc (x[i], y[i], carry);
2323 for (size_t i = y.size (); carry && i < x.size (); ++i)
2424 std::tie (x[i], carry) = addc (x[i], uint64_t {0 }, carry);
25+ return carry;
2526}
2627
2728// / Subtracts y from x: x[] -= y[]. The result is truncated to the size of x.
@@ -81,17 +82,6 @@ constexpr void mul(
8182 addmul (r.subspan (j), r.subspan (j), x.first (r.size () - j), y[j]);
8283}
8384
84- // / Computes x[] = 2 - x[].
85- constexpr void neg_add2 (std::span<uint64_t > x) noexcept
86- {
87- assert (!x.empty ());
88- bool c = false ;
89-
90- std::tie (x[0 ], c) = intx::subc (2 , x[0 ]);
91- for (auto it = x.begin () + 1 ; it != x.end (); ++it)
92- std::tie (*it, c) = intx::subc (0 , *it, c);
93- }
94-
9585// / Trims a little-endian word array to significant words.
9686template <typename T>
9787constexpr std::span<T> trim (std::span<T> x) noexcept
@@ -489,25 +479,26 @@ void modinv_pow2(
489479 assert (!r.empty ());
490480 assert (scratch.size () >= 2 * r.size ());
491481
492- r[0 ] = evmmax::modinv (x[0 ]); // Good start: 64 correct bits.
482+ r[0 ] = evmmax::modinv (x[0 ]); // Good start: 64 correct bits.
483+ std::ranges::fill (r.subspan (1 ), uint64_t {0 }); // Zero the rest for correct final subtraction.
493484
494- // Each iteration doubles the number of correct bits in the inverse. See evmmax::modinv().
485+ // Newton-Raphson iteration for modular inverse: inv' = inv * (2 - x * inv).
486+ // Rearranged as: inv' = 2 * inv - x * inv^2, which avoids the (2 - x) negation helper
487+ // and computes the result directly into r (no copy needed).
488+ // Each iteration doubles the number of correct bits. See evmmax::modinv().
495489 for (size_t i = 1 ; i < r.size (); i *= 2 )
496490 {
497- // At the start of the iteration we have i-word correct inverse in r[0-i].
498- // The iteration performs the Newton-Raphson step with double the precision (n=2i).
491+ // We have i-word correct inverse in r[0..i). Double the precision to n = min(2i, r.size()).
499492 const auto n = std::min (i * 2 , r.size ());
493+ assert (n > i);
500494 const auto t1 = scratch.subspan (0 , n);
501495 const auto t2 = scratch.subspan (n, n);
502496
503- // Clamp x to available words: high words beyond x.size() are implicitly zero.
504- mul (t1, x.subspan (0 , std::min (n, x.size ())), r.subspan (0 , i)); // t1 = x * inv
505- neg_add2 (t1); // t1 = 2 - x * inv
506- mul (t2, t1, r.subspan (0 , i)); // t2 = inv * (2 - x * inv)
507- // TODO: Consider implementing the step as (inv << 1) - (x * inv * inv).
508-
509- // TODO: Avoid copy by swapping buffers.
510- std::ranges::copy (t2, r.begin ());
497+ const auto inv = r.first (i);
498+ mul (t1, inv, inv); // t1 = inv^2
499+ mul (t2, t1, x.first (std::min (n, x.size ()))); // t2 = inv^2 * x, x clamped to n words.
500+ r[i] = uint64_t {add (inv, inv)}; // r[0..i] = 2 * inv (carry into r[i])
501+ sub (r.first (n), t2); // r[0..n) = 2 * inv - x * inv^2
511502 }
512503}
513504
0 commit comments