Skip to content

Commit 3f84ce5

Browse files
committed
Introduce macro operation fusion
Through our observations, we have identified certain recurring patterns in instruction sequences. By converting these specific RISC-V instruction patterns into faster, equivalent code, we can significantly improve execution efficiency. In our current analysis, we focus on a commonly used benchmark and have found the following frequently occurring instruction patterns: auipc + addi, auipc + add, multiple sw, and multiple lw.

| Metric    | commit fba5802            | macro fuse operation      | Speedup |
|-----------|---------------------------|---------------------------|---------|
| CoreMark  | 1351.065 (Iterations/Sec) | 1352.843 (Iterations/Sec) | +0.13%  |
| dhrystone | 1073 DMIPS                | 1146 DMIPS                | +6.8%   |
| nqueens   | 8295 msec                 | 7824 msec                 | +6.0%   |
1 parent fba5802 commit 3f84ce5

File tree

3 files changed

+151
-2
lines changed

3 files changed

+151
-2
lines changed

src/decode.h

+17-1
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,15 @@
156156
_(cjalr, 1) \
157157
_(cadd, 0) \
158158
_(cswsp, 0) \
159-
)
159+
) \
160+
/* macro operation fusion: convert specific RISC-V instruction patterns
161+
* into faster and equivalent code
162+
*/ \
163+
_(fuse1, 0) \
164+
_(fuse2, 0) \
165+
_(fuse3, 0) \
166+
_(fuse4, 0) \
167+
_(empty, 0)
160168
/* clang-format on */
161169

162170
/* IR list */
@@ -228,6 +236,11 @@ enum {
228236
INSN_32 = 4,
229237
};
230238

239+
/* Operand record for one member of a fused load/store run: the immediate
 * offset and the three register indices of the original instruction.
 *
 * NOTE(review): code that fills these records may copy raw bytes from the
 * head of an rv_insn_t, so keep the field order and sizes in step with
 * rv_insn_t unless that is confirmed not to be the case.
 */
typedef struct {
    int32_t imm;
    uint8_t rd;
    uint8_t rs1;
    uint8_t rs2;
} opcode_fuse_t;
243+
231244
typedef struct rv_insn {
232245
union {
233246
int32_t imm;
@@ -240,6 +253,9 @@ typedef struct rv_insn {
240253
#if RV32_HAS(EXT_C)
241254
uint8_t shamt;
242255
#endif
256+
/* fuse operation */
257+
int32_t imm2;
258+
opcode_fuse_t *fuse;
243259

244260
/* instruction length */
245261
uint8_t insn_len;

src/emulate.c

+130-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ extern struct target_ops gdbstub_ops;
3131
#include "decode.h"
3232
#include "riscv.h"
3333
#include "riscv_private.h"
34+
#include "state.h"
3435
#include "utils.h"
3536

3637
/* RISC-V exception code list */
@@ -1219,6 +1220,60 @@ RVOP(cswsp, {
12191220
})
12201221
#endif
12211222

1223+
/* auipc + addi */
1224+
RVOP(fuse1, {
1225+
rv->X[ir->rd] = (int32_t) (rv->PC + ir->imm + ir->imm2);
1226+
rv->PC += ir->insn_len;
1227+
})
1228+
1229+
/* auipc + add */
1230+
RVOP(fuse2, {
1231+
rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->PC + ir->imm);
1232+
rv->PC += ir->insn_len;
1233+
})
1234+
1235+
/* multiple sw */
1236+
RVOP(fuse3, {
1237+
opcode_fuse_t *fuse = ir->fuse;
1238+
uint32_t addr = rv->X[fuse[0].rs1] + fuse[0].imm;
1239+
/* the memory addresses of the sw instructions are contiguous, so we only
1240+
* need to check the first sw instruction to determine if its memory address
1241+
* is misaligned or if the memory chunk does not exist.
1242+
*/
1243+
RV_EXC_MISALIGN_HANDLER(3, store, false, 1);
1244+
rv->io.mem_write_w(rv, addr, rv->X[fuse[0].rs2]);
1245+
for (int i = 1; i < ir->imm2; i++) {
1246+
addr = rv->X[fuse[i].rs1] + fuse[i].imm;
1247+
rv->io.mem_write_w(rv, addr, rv->X[fuse[i].rs2]);
1248+
}
1249+
rv->PC += ir->insn_len * (ir->imm2 - 1);
1250+
})
1251+
1252+
/* multiple lw */
1253+
RVOP(fuse4, {
1254+
opcode_fuse_t *fuse = ir->fuse;
1255+
uint32_t addr = rv->X[fuse[0].rs1] + fuse[0].imm;
1256+
/* the memory addresses of the lw instructions are contiguous, so we only
1257+
* need to check the first lw instruction to determine if its memory address
1258+
* is misaligned or if the memory chunk does not exist.
1259+
*/
1260+
RV_EXC_MISALIGN_HANDLER(3, load, false, 1);
1261+
rv->X[fuse[0].rd] = rv->io.mem_read_w(rv, addr);
1262+
for (int i = 1; i < ir->imm2; i++) {
1263+
addr = rv->X[fuse[i].rs1] + fuse[i].imm;
1264+
rv->X[fuse[i].rd] = rv->io.mem_read_w(rv, addr);
1265+
}
1266+
rv->PC += ir->insn_len * (ir->imm2 - 1);
1267+
})
1268+
1269+
/* Handler for a slot whose instruction was absorbed into a fused operation.
 * It re-zeroes x0, counts one cycle, and hands control straight to the
 * handler of the following slot; PC is not touched here.
 */
static bool do_empty(riscv_t *rv, const rv_insn_t *ir)
{
    const rv_insn_t *const successor = &ir[1];

    rv->csr_cycle += 1;
    rv->X[rv_reg_zero] = 0;
    MUST_TAIL return successor->impl(rv, successor);
}
1276+
12221277
static const void *dispatch_table[] = {
12231278
#define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
12241279
RISCV_INSN_LIST
@@ -1337,7 +1392,6 @@ static void block_translate(riscv_t *rv, block_t *block)
13371392
/* compute the end of pc */
13381393
block->pc_end += ir->insn_len;
13391394
block->n_insn++;
1340-
13411395
/* stop on branch */
13421396
if (insn_is_branch(ir->opcode)) {
13431397
/* recursive jump translation */
@@ -1356,6 +1410,78 @@ static void block_translate(riscv_t *rv, block_t *block)
13561410
block->ir[block->n_insn - 1].tailcall = true;
13571411
}
13581412

1413+
/* Fuse a run of consecutive word loads (RW == 1) or word stores (RW == 0)
 * that share one base register and whose offsets step by exactly 4 bytes in
 * a single direction.
 *
 * This is a statement sequence meant to be expanded directly inside a
 * `case` of a switch. It relies on the enclosing scope providing `block`,
 * `i`, `ir`, `next_ir`, `count` and `sign`, and it always ends by breaking
 * out of that switch.
 *
 * On success the head instruction becomes rv_insn_fuse4 / rv_insn_fuse3,
 * `ir->fuse` owns a malloc'ed array of `count` operand records, `ir->imm2`
 * holds `count`, and every absorbed follower becomes rv_insn_empty.
 *
 * Details worth knowing:
 *  - the j-th follower must sit 4 * j bytes away from the head, so runs
 *    longer than two instructions are recognised;
 *  - a load that overwrites the shared base register ends the run, because
 *    later addresses would no longer be contiguous with the first one;
 *  - operand records are filled field by field, so nothing depends on the
 *    memory layout of rv_insn_t;
 *  - if the allocation fails the instructions are simply left unfused.
 */
#define pack_memory_operation(RW)                                             \
    count = 1;                                                                \
    next_ir = ir + 1;                                                         \
    if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw))                   \
        break;                                                                \
    /* +1 when the offsets descend, -1 when they ascend */                    \
    sign = (ir->imm - next_ir->imm) < 0 ? -1 : 1;                             \
    for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) {                    \
        /* loads only: stop once the previous member clobbered the base */    \
        if (IIF(RW)((ir + j - 1)->rd == ir->rs1, 0))                          \
            break;                                                            \
        next_ir = ir + j;                                                     \
        if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw) ||             \
            ir->rs1 != next_ir->rs1 ||                                        \
            ir->imm - next_ir->imm != 4 * sign * (int32_t) j)                 \
            break;                                                            \
        count++;                                                              \
    }                                                                         \
    if (count > 1) {                                                          \
        opcode_fuse_t *pack = malloc(count * sizeof(*pack));                  \
        /* out of memory: keep the original, unfused instructions */          \
        if (!pack)                                                            \
            break;                                                            \
        for (int32_t j = 0; j < count; j++) {                                 \
            next_ir = ir + j;                                                 \
            pack[j].imm = next_ir->imm;                                       \
            pack[j].rd = next_ir->rd;                                         \
            pack[j].rs1 = next_ir->rs1;                                       \
            pack[j].rs2 = next_ir->rs2;                                       \
            if (j > 0) {                                                      \
                next_ir->opcode = rv_insn_empty;                              \
                next_ir->impl = dispatch_table[next_ir->opcode];              \
            }                                                                 \
        }                                                                     \
        ir->opcode = IIF(RW)(rv_insn_fuse4, rv_insn_fuse3);                   \
        ir->fuse = pack;                                                      \
        ir->imm2 = count;                                                     \
        ir->impl = dispatch_table[ir->opcode];                                \
    }                                                                         \
    break;
1440+
1441+
1442+
/* examine whether instructions in a block match a specific pattern. If so,
1443+
* rewrite them into fused instructions. */
1444+
static void match_pattern(block_t *block)
1445+
{
1446+
for (uint32_t i = 0; i < block->n_insn - 1; i++) {
1447+
rv_insn_t *ir = block->ir + i, *next_ir = NULL;
1448+
int32_t count = 0, sign = 1;
1449+
switch (ir->opcode) {
1450+
case rv_insn_auipc:
1451+
next_ir = ir + 1;
1452+
if (next_ir->opcode == rv_insn_addi && ir->rd == next_ir->rs1) {
1453+
/* the destination register of instruction auipc is equal to the
1454+
* source register 1 of next instruction addi */
1455+
ir->opcode = rv_insn_fuse1;
1456+
ir->rd = next_ir->rd;
1457+
ir->imm2 = next_ir->imm;
1458+
ir->impl = dispatch_table[ir->opcode];
1459+
next_ir->opcode = rv_insn_empty;
1460+
next_ir->impl = dispatch_table[next_ir->opcode];
1461+
} else if (next_ir->opcode == rv_insn_add &&
1462+
ir->rd == next_ir->rs2) {
1463+
/* the destination register of instruction auipc is equal to the
1464+
* source register 2 of next instruction add */
1465+
ir->opcode = rv_insn_fuse2;
1466+
ir->rd = next_ir->rd;
1467+
ir->rs1 = next_ir->rs1;
1468+
ir->impl = dispatch_table[ir->opcode];
1469+
next_ir->opcode = rv_insn_empty;
1470+
next_ir->impl = dispatch_table[next_ir->opcode];
1471+
}
1472+
break;
1473+
case rv_insn_sw:
1474+
/* If the memory addresses of a sequence of store instructions for
1475+
* data are contiguous, pack these instructions. */
1476+
pack_memory_operation(0);
1477+
case rv_insn_lw:
1478+
/* If the memory addresses of a sequence of load instructions for
1479+
* data are contiguous, pack these instructions. */
1480+
pack_memory_operation(1);
1481+
}
1482+
}
1483+
}
1484+
13591485
static block_t *prev = NULL;
13601486
static block_t *block_find_or_translate(riscv_t *rv)
13611487
{
@@ -1375,6 +1501,9 @@ static block_t *block_find_or_translate(riscv_t *rv)
13751501
/* translate the basic block */
13761502
block_translate(rv, next);
13771503

1504+
/* macro operation fusion */
1505+
match_pattern(next);
1506+
13781507
/* insert the block into block map */
13791508
block_insert(&rv->block_map, next);
13801509

src/riscv.c

+4
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ void block_map_clear(block_map_t *map)
2525
block_t *block = map->map[i];
2626
if (!block)
2727
continue;
28+
for (uint32_t i = 0; i < block->n_insn; i++) {
29+
if (block->ir[i].fuse)
30+
free(block->ir[i].fuse);
31+
}
2832
free(block->ir);
2933
free(block);
3034
map->map[i] = NULL;

0 commit comments

Comments
 (0)