Skip to content

Commit 6e5a998

Browse files
committed
crypto: Use local accumulator t[] in mul_amm_256 to avoid aliasing
Use a local array t[4] instead of writing directly to the output span r, which may alias the inputs x or y. This allows the compiler to keep the accumulator in registers without reloading after stores. The result is copied to r at the end. ~6% speedup on 256-bit modexp benchmarks (25846 → 24256 cycles).
1 parent 27a16a1 commit 6e5a998

File tree

1 file changed

+23
-18
lines changed

1 file changed

+23
-18
lines changed

lib/evmone_precompiles/mulmod.cpp

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,40 +10,45 @@ void mul_amm_256(std::span<uint64_t, 4> r, std::span<const uint64_t, 4> x,
1010
std::span<const uint64_t, 4> y, std::span<const uint64_t, 4> mod, uint64_t mod_inv) noexcept
1111
{
1212
static constexpr size_t N = 4;
13-
const auto r_lo = r.subspan<0, 3>();
14-
const auto r_hi = r.subspan<1>();
13+
14+
// Local accumulator t[] avoids aliasing penalties when r overlaps x or y.
15+
std::array<uint64_t, N> t; // NOLINT(*-pro-type-member-init)
16+
const auto t_lo = std::span{t}.subspan<0, N - 1>();
17+
const auto t_hi = std::span{t}.subspan<1>();
1518
const auto mod_hi = mod.subspan<1>();
1619

17-
// First iteration: r is uninitialized, so use mul instead of addmul.
18-
bool r_carry = false;
20+
// First iteration: t is uninitialized, so use mul instead of addmul.
21+
bool t_carry = false;
1922
{
20-
const auto c1 = mul(r, x, y[0]);
23+
const auto c1 = mul(t, x, y[0]);
2124

22-
const auto m = r[0] * mod_inv;
23-
const auto c2 = (umul(mod[0], m) + r[0])[1];
25+
const auto m = t[0] * mod_inv;
26+
const auto c2 = (umul(mod[0], m) + t[0])[1];
2427

25-
const auto c3 = addmul(r_lo, r_hi, mod_hi, m, c2);
26-
std::tie(r[N - 1], r_carry) = addc(c1, c3);
28+
const auto c3 = addmul(t_lo, t_hi, mod_hi, m, c2);
29+
std::tie(t[N - 1], t_carry) = addc(c1, c3);
2730
}
2831

2932
// Remaining 3 iterations.
3033
#pragma GCC unroll N - 1
3134
for (size_t i = 1; i != N; ++i)
3235
{
33-
const auto c1 = addmul(r, r, x, y[i]);
34-
const auto [sum1, d1] = addc(c1, uint64_t{r_carry});
36+
const auto c1 = addmul(t, t, x, y[i]);
37+
const auto [sum1, d1] = addc(c1, uint64_t{t_carry});
3538

36-
const auto m = r[0] * mod_inv;
37-
const auto c2 = (umul(mod[0], m) + r[0])[1];
39+
const auto m = t[0] * mod_inv;
40+
const auto c2 = (umul(mod[0], m) + t[0])[1];
3841

39-
const auto c3 = addmul(r_lo, r_hi, mod_hi, m, c2);
42+
const auto c3 = addmul(t_lo, t_hi, mod_hi, m, c2);
4043
const auto [sum2, d2] = addc(sum1, c3);
41-
r[N - 1] = sum2;
44+
t[N - 1] = sum2;
4245
assert(!(d1 && d2));
43-
r_carry = d1 || d2;
46+
t_carry = d1 || d2;
4447
}
4548

46-
if (r_carry)
47-
sub(r, mod);
49+
if (t_carry)
50+
sub(t, mod);
51+
52+
std::ranges::copy(t, r.begin());
4853
}
4954
} // namespace evmone::crypto

0 commit comments

Comments
 (0)