Skip to content

Commit a304446

Browse files
qwe661234 authored and jserv committed
Reduce instruction dispatch by tail-call elimination (#95)
To meet the tail-call optimization (TCO) requirement, we must convert the function emulate into a recursive version. To accomplish this, we add a variable tailcall to the struct rv_insn_t to assist us in determining whether or not the basic block is terminated. As a result, we can rewrite function emulate into a self-recursive function using this variable. However, after performing performance analysis, we discovered that the emulator required a significant amount of time to calculate the jumping address. As a result, we stick with the wasm3 implementation, which separates all instruction emulations, and modify struct rv_insn_t so that we can directly assign instruction emulation to IR by adding member impl.

CoreMark results:
| Model        | Compiler | f2da162 | TCO     | Speedup |
|--------------+----------+---------+---------+---------|
| Core i7-8700 | clang-15 | 836.484 | 971.951 | +13.9%  |
|--------------+----------+---------+---------+---------|
| Core i7-8700 | gcc-12   | 888.342 | 963.336 | +7.8%   |
|--------------+----------+---------+---------+---------|
| eMAG 8180    | clang-15 | 286.000 | 335.396 | +20.5%  |
|--------------+----------+---------+---------+---------|
| eMAG 8180    | gcc-12   | 259.638 | 332.561 | +14.0%  |

Previously, when function "emulate" terminated, it returned to function "block_emulate" because the previous calling sequence was rv_step -> block_emulate -> emulate -> block_emulate -> emulate -> ... As a result, a function stack frame was created each time function "emulate" was invoked. In addition, the jumping address had to be calculated using a method such as switch-case or computed-goto in function "emulate". Now, however, we can invoke instruction emulation directly, and the current calling route is rv_step -> instruction emulation -> instruction emulation -> ... The instruction emulations can now use the same function stack frame due to TCO.
That is, every instruction emulation in a basic block can reuse the same function stack frame, saving the overhead of creating a new function stack frame for each one.
1 parent c762de5 commit a304446

File tree

4 files changed

+1071
-1164
lines changed

4 files changed

+1071
-1164
lines changed

Makefile

+5
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ OUT ?= build
55
BIN := $(OUT)/rv32emu
66

77
CFLAGS = -std=gnu99 -O2 -Wall -Wextra
8+
CFLAGS += -Wno-unused-label
89
CFLAGS += -include src/common.h
910

1011
# Set the default stack pointer
@@ -88,6 +89,10 @@ gdbstub-test: $(BIN)
8889
$(Q)tests/gdbstub.sh && $(call notice, [OK])
8990
endif
9091

92+
# For tail-call elimination, we need a specific set of build flags applied.
93+
# FIXME: On macOS + Apple Silicon, -fno-stack-protector might have a negative impact.
94+
$(OUT)/emulate.o: CFLAGS += -fomit-frame-pointer -fno-stack-check -fno-stack-protector
95+
9196
# Clear the .DEFAULT_GOAL special variable, so that the following turns
9297
# to the first target after .DEFAULT_GOAL is not set.
9398
.DEFAULT_GOAL :=

src/common.h

+10
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,16 @@
2424
#define __ALIGNED(x)
2525
#endif
2626

27+
/* There is no tail-call optimization(TCO) in non-optimized builds. To work
28+
* around this, we attempt to use a compiler attribute called musttail that
29+
* forces the compiler to TCO even when optimizations aren't on.
30+
*/
31+
#if defined(__has_attribute) && __has_attribute(musttail)
32+
#define MUST_TAIL __attribute__((musttail))
33+
#else
34+
#define MUST_TAIL
35+
#endif
36+
2737
/* Pattern Matching for C macros.
2838
* https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
2939
*/

src/decode.h

+145-127
Original file line numberDiff line numberDiff line change
@@ -8,158 +8,160 @@
88
#include <stdbool.h>
99
#include <stdint.h>
1010

11-
/* RISC-V instruction list */
11+
#include "riscv.h"
12+
13+
/* RISC-V instruction list in format _(instruction-name, can-branch) */
1214
/* clang-format off */
1315
#define RISCV_INSN_LIST \
14-
_(nop) \
16+
_(nop, 0) \
1517
/* RV32I Base Instruction Set */ \
16-
_(lui) \
17-
_(auipc) \
18-
_(jal) \
19-
_(jalr) \
20-
_(beq) \
21-
_(bne) \
22-
_(blt) \
23-
_(bge) \
24-
_(bltu) \
25-
_(bgeu) \
26-
_(lb) \
27-
_(lh) \
28-
_(lw) \
29-
_(lbu) \
30-
_(lhu) \
31-
_(sb) \
32-
_(sh) \
33-
_(sw) \
34-
_(addi) \
35-
_(slti) \
36-
_(sltiu) \
37-
_(xori) \
38-
_(ori) \
39-
_(andi) \
40-
_(slli) \
41-
_(srli) \
42-
_(srai) \
43-
_(add) \
44-
_(sub) \
45-
_(sll) \
46-
_(slt) \
47-
_(sltu) \
48-
_(xor) \
49-
_(srl) \
50-
_(sra) \
51-
_(or) \
52-
_(and) \
53-
_(ecall) \
54-
_(ebreak) \
18+
_(lui, 0) \
19+
_(auipc, 0) \
20+
_(jal, 1) \
21+
_(jalr, 1) \
22+
_(beq, 1) \
23+
_(bne, 1) \
24+
_(blt, 1) \
25+
_(bge, 1) \
26+
_(bltu, 1) \
27+
_(bgeu, 1) \
28+
_(lb, 0) \
29+
_(lh, 0) \
30+
_(lw, 0) \
31+
_(lbu, 0) \
32+
_(lhu, 0) \
33+
_(sb, 0) \
34+
_(sh, 0) \
35+
_(sw, 0) \
36+
_(addi, 0) \
37+
_(slti, 0) \
38+
_(sltiu, 0) \
39+
_(xori, 0) \
40+
_(ori, 0) \
41+
_(andi, 0) \
42+
_(slli, 0) \
43+
_(srli, 0) \
44+
_(srai, 0) \
45+
_(add, 0) \
46+
_(sub, 0) \
47+
_(sll, 0) \
48+
_(slt, 0) \
49+
_(sltu, 0) \
50+
_(xor, 0) \
51+
_(srl, 0) \
52+
_(sra, 0) \
53+
_(or, 0) \
54+
_(and, 0) \
55+
_(ecall, 1) \
56+
_(ebreak, 1) \
5557
/* RISC-V Privileged Instruction */ \
56-
_(wfi) \
57-
_(uret) \
58-
_(sret) \
59-
_(hret) \
60-
_(mret) \
58+
_(wfi, 0) \
59+
_(uret, 0) \
60+
_(sret, 0) \
61+
_(hret, 0) \
62+
_(mret, 1) \
6163
/* RV32 Zifencei Standard Extension */ \
6264
IIF(RV32_HAS(Zifencei))( \
63-
_(fencei) \
65+
_(fencei, 0) \
6466
) \
6567
/* RV32 Zicsr Standard Extension */ \
6668
IIF(RV32_HAS(Zicsr))( \
67-
_(csrrw) \
68-
_(csrrs) \
69-
_(csrrc) \
70-
_(csrrwi) \
71-
_(csrrsi) \
72-
_(csrrci) \
69+
_(csrrw, 0) \
70+
_(csrrs, 0) \
71+
_(csrrc, 0) \
72+
_(csrrwi, 0) \
73+
_(csrrsi, 0) \
74+
_(csrrci, 0) \
7375
) \
7476
/* RV32M Standard Extension */ \
7577
IIF(RV32_HAS(EXT_M))( \
76-
_(mul) \
77-
_(mulh) \
78-
_(mulhsu) \
79-
_(mulhu) \
80-
_(div) \
81-
_(divu) \
82-
_(rem) \
83-
_(remu) \
78+
_(mul, 0) \
79+
_(mulh, 0) \
80+
_(mulhsu, 0) \
81+
_(mulhu, 0) \
82+
_(div, 0) \
83+
_(divu, 0) \
84+
_(rem, 0) \
85+
_(remu, 0) \
8486
) \
8587
/* RV32A Standard Extension */ \
8688
IIF(RV32_HAS(EXT_A))( \
87-
_(lrw) \
88-
_(scw) \
89-
_(amoswapw) \
90-
_(amoaddw) \
91-
_(amoxorw) \
92-
_(amoandw) \
93-
_(amoorw) \
94-
_(amominw) \
95-
_(amomaxw) \
96-
_(amominuw) \
97-
_(amomaxuw) \
89+
_(lrw, 0) \
90+
_(scw, 0) \
91+
_(amoswapw, 0) \
92+
_(amoaddw, 0) \
93+
_(amoxorw, 0) \
94+
_(amoandw, 0) \
95+
_(amoorw, 0) \
96+
_(amominw, 0) \
97+
_(amomaxw, 0) \
98+
_(amominuw, 0) \
99+
_(amomaxuw, 0) \
98100
) \
99101
/* RV32F Standard Extension */ \
100102
IIF(RV32_HAS(EXT_F))( \
101-
_(flw) \
102-
_(fsw) \
103-
_(fmadds) \
104-
_(fmsubs) \
105-
_(fnmsubs) \
106-
_(fnmadds) \
107-
_(fadds) \
108-
_(fsubs) \
109-
_(fmuls) \
110-
_(fdivs) \
111-
_(fsqrts) \
112-
_(fsgnjs) \
113-
_(fsgnjns) \
114-
_(fsgnjxs) \
115-
_(fmins) \
116-
_(fmaxs) \
117-
_(fcvtws) \
118-
_(fcvtwus) \
119-
_(fmvxw) \
120-
_(feqs) \
121-
_(flts) \
122-
_(fles) \
123-
_(fclasss) \
124-
_(fcvtsw) \
125-
_(fcvtswu) \
126-
_(fmvwx) \
103+
_(flw, 0) \
104+
_(fsw, 0) \
105+
_(fmadds, 0) \
106+
_(fmsubs, 0) \
107+
_(fnmsubs, 0) \
108+
_(fnmadds, 0) \
109+
_(fadds, 0) \
110+
_(fsubs, 0) \
111+
_(fmuls, 0) \
112+
_(fdivs, 0) \
113+
_(fsqrts, 0) \
114+
_(fsgnjs, 0) \
115+
_(fsgnjns, 0) \
116+
_(fsgnjxs, 0) \
117+
_(fmins, 0) \
118+
_(fmaxs, 0) \
119+
_(fcvtws, 0) \
120+
_(fcvtwus, 0) \
121+
_(fmvxw, 0) \
122+
_(feqs, 0) \
123+
_(flts, 0) \
124+
_(fles, 0) \
125+
_(fclasss, 0) \
126+
_(fcvtsw, 0) \
127+
_(fcvtswu, 0) \
128+
_(fmvwx, 0) \
127129
) \
128130
/* RV32C Standard Extension */ \
129131
IIF(RV32_HAS(EXT_C))( \
130-
_(caddi4spn) \
131-
_(clw) \
132-
_(csw) \
133-
_(cnop) \
134-
_(caddi) \
135-
_(cjal) \
136-
_(cli) \
137-
_(caddi16sp) \
138-
_(clui) \
139-
_(csrli) \
140-
_(csrai) \
141-
_(candi) \
142-
_(csub) \
143-
_(cxor) \
144-
_(cor) \
145-
_(cand) \
146-
_(cj) \
147-
_(cbeqz) \
148-
_(cbnez) \
149-
_(cslli) \
150-
_(clwsp) \
151-
_(cjr) \
152-
_(cmv) \
153-
_(cebreak) \
154-
_(cjalr) \
155-
_(cadd) \
156-
_(cswsp) \
132+
_(caddi4spn, 0) \
133+
_(clw, 0) \
134+
_(csw, 0) \
135+
_(cnop, 0) \
136+
_(caddi, 0) \
137+
_(cjal, 1) \
138+
_(cli, 0) \
139+
_(caddi16sp, 0) \
140+
_(clui, 0) \
141+
_(csrli, 0) \
142+
_(csrai, 0) \
143+
_(candi, 0) \
144+
_(csub, 0) \
145+
_(cxor, 0) \
146+
_(cor, 0) \
147+
_(cand, 0) \
148+
_(cj, 1) \
149+
_(cbeqz, 1) \
150+
_(cbnez, 1) \
151+
_(cslli, 0) \
152+
_(clwsp, 0) \
153+
_(cjr, 1) \
154+
_(cmv, 0) \
155+
_(cebreak, 1) \
156+
_(cjalr, 1) \
157+
_(cadd, 0) \
158+
_(cswsp, 0) \
157159
)
158160
/* clang-format on */
159161

160162
/* IR list */
161163
enum {
162-
#define _(inst) rv_insn_##inst,
164+
#define _(inst, can_branch) rv_insn_##inst,
163165
RISCV_INSN_LIST
164166
#undef _
165167
};
@@ -226,7 +228,7 @@ enum {
226228
INSN_32 = 4,
227229
};
228230

229-
typedef struct {
231+
typedef struct rv_insn {
230232
union {
231233
int32_t imm;
232234
uint8_t rs3;
@@ -241,6 +243,22 @@ typedef struct {
241243

242244
/* instruction length */
243245
uint8_t insn_len;
246+
247+
/* According to tail-call optimization (TCO), if a C function ends with
248+
* a function call to another function or itself and simply returns that
249+
* function's result, the compiler can substitute a simple jump to the
250+
* other function for the 'call' and 'return' instructions. The
251+
* self-recursive function can therefore use the same function stack frame.
252+
*
253+
* Using member tailcall, we can tell whether an IR is the final IR in
254+
* a basic block. Additionally, member 'impl' allows us to invoke next
255+
* instruction emulation directly without computing the jumping address.
256+
* In order to enable the compiler to perform TCO, we can use these two
257+
* members to rewrite all instruction emulations into a self-recursive
258+
* version.
259+
*/
260+
bool tailcall;
261+
bool (*impl)(riscv_t *, const struct rv_insn *);
244262
} rv_insn_t;
245263

246264
/* decode the RISC-V instruction */

0 commit comments

Comments
 (0)