Skip to content

Commit bdac249

Browse files
author
Yen-Fu Chen
committed
Refine instruction fusion and add new one
1. Refine the original fused instructions by skipping the nop instruction and correctly updating the values written to registers. 2. Add a new fused instruction: lui + addi. The Dhrystone benchmark gains about 3% in performance based on this modification. Close: #177
1 parent 3baf584 commit bdac249

File tree

2 files changed

+89
-20
lines changed

2 files changed

+89
-20
lines changed

src/decode.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,8 @@
163163
_(fuse1, 0) \
164164
_(fuse2, 0) \
165165
_(fuse3, 0) \
166-
_(fuse4, 0)
166+
_(fuse4, 0) \
167+
_(fuse5, 0)
167168
/* clang-format on */
168169

169170
/* IR list */
@@ -253,7 +254,7 @@ typedef struct rv_insn {
253254
uint8_t shamt;
254255
#endif
255256
/* fuse operation */
256-
int16_t imm2;
257+
int32_t imm2;
257258
opcode_fuse_t *fuse;
258259

259260
/* instruction length */

src/emulate.c

+86-18
Original file line numberDiff line numberDiff line change
@@ -1251,15 +1251,38 @@ RVOP(cswsp, {
12511251
#endif
12521252

12531253
/* auipc + addi */
1254-
RVOP(fuse1, { rv->X[ir->rd] = (int32_t) (rv->PC + ir->imm + ir->imm2); })
1254+
static bool do_fuse1(riscv_t *rv, const rv_insn_t *ir)
1255+
{
1256+
rv->X[rv_reg_zero] = 0;
1257+
rv->csr_cycle += 2;
1258+
rv->X[ir->rd] = rv->PC + ir->imm;
1259+
rv->X[ir->rs1] = rv->X[ir->rd] + ir->imm2;
1260+
rv->PC += 2 * ir->insn_len;
1261+
if (unlikely(RVOP_NO_NEXT(ir)))
1262+
return true;
1263+
const rv_insn_t *next = ir + 2;
1264+
MUST_TAIL return next->impl(rv, next);
1265+
}
12551266

12561267
/* auipc + add */
1257-
RVOP(fuse2, {
1258-
rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->PC + ir->imm);
1259-
})
1268+
static bool do_fuse2(riscv_t *rv, const rv_insn_t *ir)
1269+
{
1270+
rv->X[rv_reg_zero] = 0;
1271+
rv->csr_cycle += 2;
1272+
rv->X[ir->rd] = rv->PC + ir->imm;
1273+
rv->X[ir->rs2] = rv->X[ir->rd] + rv->X[ir->rs1];
1274+
rv->PC += 2 * ir->insn_len;
1275+
if (unlikely(RVOP_NO_NEXT(ir)))
1276+
return true;
1277+
const rv_insn_t *next = ir + 2;
1278+
MUST_TAIL return next->impl(rv, next);
1279+
}
12601280

12611281
/* multiple sw */
1262-
RVOP(fuse3, {
1282+
static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir)
1283+
{
1284+
rv->X[rv_reg_zero] = 0;
1285+
rv->csr_cycle += ir->imm2;
12631286
opcode_fuse_t *fuse = ir->fuse;
12641287
uint32_t addr = rv->X[fuse[0].rs1] + fuse[0].imm;
12651288
/* the memory addresses of the sw instructions are contiguous, so we only
@@ -1272,10 +1295,18 @@ RVOP(fuse3, {
12721295
addr = rv->X[fuse[i].rs1] + fuse[i].imm;
12731296
rv->io.mem_write_w(addr, rv->X[fuse[i].rs2]);
12741297
}
1275-
})
1298+
rv->PC += ir->imm2 * ir->insn_len;
1299+
if (unlikely(RVOP_NO_NEXT(ir)))
1300+
return true;
1301+
const rv_insn_t *next = ir + ir->imm2;
1302+
MUST_TAIL return next->impl(rv, next);
1303+
}
12761304

12771305
/* multiple lw */
1278-
RVOP(fuse4, {
1306+
static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir)
1307+
{
1308+
rv->X[rv_reg_zero] = 0;
1309+
rv->csr_cycle += ir->imm2;
12791310
opcode_fuse_t *fuse = ir->fuse;
12801311
uint32_t addr = rv->X[fuse[0].rs1] + fuse[0].imm;
12811312
/* the memory addresses of the lw instructions are contiguous, so we only
@@ -1288,7 +1319,26 @@ RVOP(fuse4, {
12881319
addr = rv->X[fuse[i].rs1] + fuse[i].imm;
12891320
rv->X[fuse[i].rd] = rv->io.mem_read_w(addr);
12901321
}
1291-
})
1322+
rv->PC += ir->imm2 * ir->insn_len;
1323+
if (unlikely(RVOP_NO_NEXT(ir)))
1324+
return true;
1325+
const rv_insn_t *next = ir + ir->imm2;
1326+
MUST_TAIL return next->impl(rv, next);
1327+
}
1328+
1329+
/* lui + addi */
1330+
static bool do_fuse5(riscv_t *rv, const rv_insn_t *ir)
1331+
{
1332+
rv->X[rv_reg_zero] = 0;
1333+
rv->csr_cycle += 2;
1334+
rv->X[ir->rd] = ir->imm;
1335+
rv->X[ir->rs1] = ir->imm + ir->imm2;
1336+
rv->PC += 2 * ir->insn_len;
1337+
if (unlikely(RVOP_NO_NEXT(ir)))
1338+
return true;
1339+
const rv_insn_t *next = ir + 2;
1340+
MUST_TAIL return next->impl(rv, next);
1341+
}
12921342

12931343
static const void *dispatch_table[] = {
12941344
#define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
@@ -1448,9 +1498,8 @@ static void block_translate(riscv_t *rv, block_t *block)
14481498
for (int j = 1; j < count; j++) { \
14491499
next_ir = ir + j; \
14501500
memcpy(ir->fuse + j, next_ir, sizeof(opcode_fuse_t)); \
1451-
next_ir->opcode = rv_insn_nop; \
1452-
next_ir->impl = dispatch_table[next_ir->opcode]; \
14531501
} \
1502+
ir->tailcall = next_ir->tailcall; \
14541503
}
14551504

14561505
/* examine whether instructions in a block match a specific pattern. If so,
@@ -1469,25 +1518,32 @@ static void match_pattern(block_t *block)
14691518
next_ir = ir + 1;
14701519
if (next_ir->opcode == rv_insn_addi && ir->rd == next_ir->rs1) {
14711520
/* the destination register of instruction auipc is equal to the
1472-
* source register 1 of next instruction addi */
1521+
* source register 1 of next instruction addi.
1522+
*/
14731523
ir->opcode = rv_insn_fuse1;
1474-
ir->rd = next_ir->rd;
1524+
ir->rs1 = next_ir->rd;
14751525
ir->imm2 = next_ir->imm;
14761526
ir->impl = dispatch_table[ir->opcode];
1477-
next_ir->opcode = rv_insn_nop;
1478-
next_ir->impl = dispatch_table[next_ir->opcode];
1527+
ir->tailcall = next_ir->tailcall;
14791528
} else if (next_ir->opcode == rv_insn_add &&
14801529
ir->rd == next_ir->rs2) {
14811530
/* the destination register of instruction auipc is equal to the
14821531
* source register 2 of next instruction add */
14831532
ir->opcode = rv_insn_fuse2;
1484-
ir->rd = next_ir->rd;
1533+
ir->rs2 = next_ir->rd;
14851534
ir->rs1 = next_ir->rs1;
14861535
ir->impl = dispatch_table[ir->opcode];
1487-
next_ir->opcode = rv_insn_nop;
1488-
next_ir->impl = dispatch_table[next_ir->opcode];
1536+
} else if (next_ir->opcode == rv_insn_add &&
1537+
ir->rd == next_ir->rs1) {
1538+
/* the destination register of instruction auipc is equal to the
1539+
* source register 1 of next instruction add */
1540+
ir->opcode = rv_insn_fuse2;
1541+
ir->rs2 = next_ir->rd;
1542+
ir->rs1 = next_ir->rs2;
1543+
ir->impl = dispatch_table[ir->opcode];
14891544
}
14901545
break;
1546+
14911547
/* If the memory addresses of a sequence of store or load instructions
14921548
* are contiguous, combine these instructions.
14931549
*/
@@ -1497,7 +1553,19 @@ static void match_pattern(block_t *block)
14971553
case rv_insn_lw:
14981554
COMBINE_MEM_OPS(1);
14991555
break;
1500-
/* FIXME: lui + addi */
1556+
case rv_insn_lui:
1557+
next_ir = ir + 1;
1558+
if (next_ir->opcode == rv_insn_addi && ir->rd == next_ir->rs1) {
1559+
/* the destination register of instruction lui is equal to
1560+
* the source register 1 of next instruction addi.
1561+
*/
1562+
ir->opcode = rv_insn_fuse5;
1563+
ir->rs1 = next_ir->rd;
1564+
ir->imm2 = next_ir->imm;
1565+
ir->impl = dispatch_table[ir->opcode];
1566+
ir->tailcall = next_ir->tailcall;
1567+
}
1568+
break;
15011569
/* TODO: mixture of sw and lw */
15021570
/* TODO: reorder instructions to match pattern */
15031571
}

0 commit comments

Comments
 (0)