Skip to content

Commit a274b30

Browse files
committed
crypto/internal/bigmod: provide assembly addMulVVW* for riscv64
This provides an assembly implementation of addMulVVW* for riscv64,
processing four words per loop, resulting in a performance gain
of 23%+ for RSA decryption/signing on a StarFive VisionFive 2:

                       │    rsa1     │                rsa2                 │
                       │   sec/op    │   sec/op     vs base                │
DecryptPKCS1v15/2048-4   24.29m ± 0%   18.65m ± 0%  -23.24% (p=0.000 n=10)
DecryptPKCS1v15/3072-4   73.28m ± 0%   54.08m ± 0%  -26.20% (p=0.000 n=10)
DecryptPKCS1v15/4096-4   163.5m ± 0%   119.1m ± 0%  -27.17% (p=0.000 n=10)
EncryptPKCS1v15/2048-4   1.505m ± 0%   1.446m ± 0%   -3.93% (p=0.000 n=10)
DecryptOAEP/2048-4       24.37m ± 0%   18.72m ± 0%  -23.17% (p=0.000 n=10)
EncryptOAEP/2048-4       1.570m ± 0%   1.510m ± 0%   -3.84% (p=0.000 n=10)
SignPKCS1v15/2048-4      24.52m ± 0%   18.80m ± 0%  -23.36% (p=0.000 n=10)
VerifyPKCS1v15/2048-4    1.491m ± 0%   1.431m ± 0%   -4.00% (p=0.000 n=10)
SignPSS/2048-4           24.60m ± 0%   18.89m ± 0%  -23.21% (p=0.000 n=10)
VerifyPSS/2048-4         1.565m ± 0%   1.504m ± 0%   -3.87% (p=0.000 n=10)
geomean                  10.90m        9.066m       -16.79%

Change-Id: I8414ba0028b0781a945610abe02c285d2387aef3
Reviewed-on: https://go-review.googlesource.com/c/go/+/516536
Reviewed-by: Mark Ryan <[email protected]>
Reviewed-by: Filippo Valsorda <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
Reviewed-by: M Zhuo <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
Run-TryBot: Joel Sing <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
1 parent f278122 commit a274b30

File tree

3 files changed

+93
-2
lines changed

3 files changed

+93
-2
lines changed

src/crypto/internal/bigmod/nat_asm.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !purego && (386 || amd64 || arm || arm64 || ppc64 || ppc64le || s390x)
5+
//go:build !purego && (386 || amd64 || arm || arm64 || ppc64 || ppc64le || riscv64 || s390x)
66

77
package bigmod
88

src/crypto/internal/bigmod/nat_noasm.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build purego || !(386 || amd64 || arm || arm64 || ppc64 || ppc64le || s390x)
5+
//go:build purego || !(386 || amd64 || arm || arm64 || ppc64 || ppc64le || riscv64 || s390x)
66

77
package bigmod
88

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Copyright 2023 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build !purego
6+
7+
#include "textflag.h"
8+
9+
// func addMulVVW1024(z, x *uint, y uint) (c uint) -- 1024-bit entry point: selects a 16-word run of the shared addMulVVWx loop
10+
TEXT ·addMulVVW1024(SB),$0-32 // $0-32: no local frame, 32 bytes of arguments plus result
11+
MOV $16, X30 // X30 = word count read by addMulVVWx (16 words * 64 bits = 1024 bits)
12+
JMP addMulVVWx(SB) // tail-jump into the shared loop, which stores c and returns
13+
14+
// func addMulVVW1536(z, x *uint, y uint) (c uint) -- 1536-bit entry point: selects a 24-word run of the shared addMulVVWx loop
15+
TEXT ·addMulVVW1536(SB),$0-32 // $0-32: no local frame, 32 bytes of arguments plus result
16+
MOV $24, X30 // X30 = word count read by addMulVVWx (24 words * 64 bits = 1536 bits)
17+
JMP addMulVVWx(SB) // tail-jump into the shared loop, which stores c and returns
18+
19+
// func addMulVVW2048(z, x *uint, y uint) (c uint) -- 2048-bit entry point: selects a 32-word run of the shared addMulVVWx loop
20+
TEXT ·addMulVVW2048(SB),$0-32 // $0-32: no local frame, 32 bytes of arguments plus result
21+
MOV $32, X30 // X30 = word count read by addMulVVWx (32 words * 64 bits = 2048 bits)
22+
JMP addMulVVWx(SB) // tail-jump into the shared loop, which stores c and returns
23+
24+
TEXT addMulVVWx(SB),NOFRAME|NOSPLIT,$0 // shared body: z[i] += x[i]*y with carry, four words per pass; X30 = word count, set by the addMulVVW* wrappers before they JMP here
25+
MOV z+0(FP), X5 // X5 = z, the accumulator pointer (read and written)
26+
MOV x+8(FP), X7 // X7 = x, the multiplicand pointer (read only)
27+
MOV y+16(FP), X6 // X6 = y, the single-word multiplier
28+
MOV $0, X29 // X29 = running carry c, starts at zero
29+
30+
BEQZ X30, done // zero words requested: skip the loop and return c = 0
31+
loop:
32+
MOV 0*8(X5), X10 // z[0]
33+
MOV 1*8(X5), X13 // z[1]
34+
MOV 2*8(X5), X16 // z[2]
35+
MOV 3*8(X5), X19 // z[3]
36+
37+
MOV 0*8(X7), X8 // x[0]
38+
MOV 1*8(X7), X11 // x[1]
39+
MOV 2*8(X7), X14 // x[2]
40+
MOV 3*8(X7), X17 // x[3]
41+
42+
MULHU X8, X6, X9 // z_hi[0] = high 64 bits of unsigned x[0] * y
43+
MUL X8, X6, X8 // z_lo[0] = low 64 bits of x[0] * y
44+
ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0]
45+
SLTU X8, X21, X22 // X22 = 1 if the sum wrapped (sum < z_lo[0]), else 0
46+
ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0] (carry from adding z[0] folded in)
47+
ADD X21, X29, X10 // z_lo[0] = x[0] * y + z[0] + c
48+
SLTU X21, X10, X22 // X22 = 1 if adding the incoming carry wrapped, else 0
49+
ADD X9, X22, X29 // next c = z_hi[0] plus that second carry
50+
51+
MULHU X11, X6, X12 // z_hi[1] = high 64 bits of unsigned x[1] * y
52+
MUL X11, X6, X11 // z_lo[1] = low 64 bits of x[1] * y
53+
ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1]
54+
SLTU X11, X21, X22 // X22 = 1 if the sum wrapped (sum < z_lo[1]), else 0
55+
ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1] (carry from adding z[1] folded in)
56+
ADD X21, X29, X13 // z_lo[1] = x[1] * y + z[1] + c
57+
SLTU X21, X13, X22 // X22 = 1 if adding the incoming carry wrapped, else 0
58+
ADD X12, X22, X29 // next c = z_hi[1] plus that second carry
59+
60+
MULHU X14, X6, X15 // z_hi[2] = high 64 bits of unsigned x[2] * y
61+
MUL X14, X6, X14 // z_lo[2] = low 64 bits of x[2] * y
62+
ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2]
63+
SLTU X14, X21, X22 // X22 = 1 if the sum wrapped (sum < z_lo[2]), else 0
64+
ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2] (carry from adding z[2] folded in)
65+
ADD X21, X29, X16 // z_lo[2] = x[2] * y + z[2] + c
66+
SLTU X21, X16, X22 // X22 = 1 if adding the incoming carry wrapped, else 0
67+
ADD X15, X22, X29 // next c = z_hi[2] plus that second carry
68+
69+
MULHU X17, X6, X18 // z_hi[3] = high 64 bits of unsigned x[3] * y
70+
MUL X17, X6, X17 // z_lo[3] = low 64 bits of x[3] * y
71+
ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3]
72+
SLTU X17, X21, X22 // X22 = 1 if the sum wrapped (sum < z_lo[3]), else 0
73+
ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3] (carry from adding z[3] folded in)
74+
ADD X21, X29, X19 // z_lo[3] = x[3] * y + z[3] + c
75+
SLTU X21, X19, X22 // X22 = 1 if adding the incoming carry wrapped, else 0
76+
ADD X18, X22, X29 // next c = z_hi[3] plus that second carry; carried into the next pass
77+
78+
MOV X10, 0*8(X5) // z[0]: store the updated low word
79+
MOV X13, 1*8(X5) // z[1]
80+
MOV X16, 2*8(X5) // z[2]
81+
MOV X19, 3*8(X5) // z[3]
82+
83+
ADDI $32, X5 // advance z by four 8-byte words
84+
ADDI $32, X7 // advance x by four 8-byte words
85+
86+
ADDI $-4, X30 // four words consumed this pass
87+
BNEZ X30, loop // exits only when the count reaches exactly zero, so X30 must be a multiple of 4
88+
89+
done:
90+
MOV X29, c+24(FP) // return the final carry word as c
91+
RET

0 commit comments

Comments
 (0)