Skip to content

Commit 0031224

Browse files
authored
Reduce memory usage for instruction block (#232)
The original memory allocation strategy for instruction blocks was found to be inefficient, leading to excessive memory usage. In the previous approach, a fixed amount of memory was allocated for each block, resulting in significant waste. To address this issue, we have implemented a more efficient memory allocation scheme. Instead of allocating a fixed size for each block, we now maintain a pool of rv_insn_t objects and allocate memory only when needed. This new approach minimizes heap allocations and optimizes memory usage. We have introduced a parameter, BLOCK_POOL_SIZE, which allows us to control the balance between the number of calloc calls and memory consumption. With this parameter in place, a new heap allocation occurs only when the pool is depleted. As a result of these changes, heap memory usage has improved significantly. For example, when running puzzle.elf, we observed a reduction in heap memory allocation from 20,306,989 bytes to just 313,461 bytes. While this design means that consecutive instructions may no longer be contiguous in memory, the impact on random access is minimal, as random access is primarily required for certain fuse operations. In cases where random access is needed, we can employ a linear search. The potential cache locality issues resulting from the discontiguous memory can also be mitigated by adjusting the BLOCK_POOL_SIZE parameter for better performance.
1 parent a207574 commit 0031224

File tree

5 files changed

+116
-60
lines changed

5 files changed

+116
-60
lines changed

src/decode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,8 @@ typedef struct rv_insn {
298298
* specific IR array without the need for additional copying.
299299
*/
300300
struct rv_insn *branch_taken, *branch_untaken;
301+
302+
struct rv_insn *next;
301303
} rv_insn_t;
302304

303305
/* decode the RISC-V instruction */

src/emulate.c

Lines changed: 79 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include <assert.h>
77
#include <stdbool.h>
8+
#include <stdint.h>
89
#include <stdio.h>
910
#include <stdlib.h>
1011
#include <string.h>
@@ -30,6 +31,7 @@ extern struct target_ops gdbstub_ops;
3031
#endif
3132

3233
#include "decode.h"
34+
#include "mpool.h"
3335
#include "riscv.h"
3436
#include "riscv_private.h"
3537
#include "state.h"
@@ -277,16 +279,17 @@ static inline uint32_t hash(size_t k)
277279
return k;
278280
}
279281

282+
static void block_translate(riscv_t *rv, block_map_t *map, block_t *block);
280283
/* allocate a basic block */
281-
static block_t *block_alloc(const uint8_t bits)
284+
static block_t *block_alloc(riscv_t *rv, block_map_t *map)
282285
{
283-
block_t *block = malloc(sizeof(struct block));
286+
block_t *block = mpool_alloc(map->block_mp);
284287
assert(block);
285-
block->insn_capacity = 1 << bits;
286288
block->n_insn = 0;
287289
block->predict = NULL;
288-
block->ir = malloc(block->insn_capacity * sizeof(rv_insn_t));
289-
assert(block->ir);
290+
291+
/* Initialize remaining part of block_t */
292+
block_translate(rv, map, block);
290293
return block;
291294
}
292295

@@ -366,7 +369,7 @@ static uint32_t last_pc = 0;
366369
rv->PC += ir->insn_len; \
367370
if (unlikely(RVOP_NO_NEXT(ir))) \
368371
return true; \
369-
const rv_insn_t *next = ir + 1; \
372+
const rv_insn_t *next = ir->next; \
370373
MUST_TAIL return next->impl(rv, next); \
371374
}
372375

@@ -395,36 +398,47 @@ enum {
395398
#undef _
396399
};
397400

401+
/* FIXME: This simply finds the n-th instruction by iterating over
402+
* the linked list linearly; we may want to find a better approach. */
403+
FORCE_INLINE rv_insn_t *next_nth_insn(rv_insn_t *ir, int32_t n)
404+
{
405+
rv_insn_t *tmp = ir;
406+
for (int32_t iter = 0; iter < n; iter++)
407+
tmp = tmp->next;
408+
return tmp;
409+
}
410+
398411
/* multiple lui */
399-
static bool do_fuse1(riscv_t *rv, const rv_insn_t *ir)
412+
static bool do_fuse1(riscv_t *rv, rv_insn_t *ir)
400413
{
401414
rv->csr_cycle += ir->imm2;
402-
for (int i = 0; i < ir->imm2; i++) {
403-
const rv_insn_t *cur_ir = ir + i;
415+
int i;
416+
rv_insn_t *cur_ir;
417+
for (i = 0, cur_ir = ir; i < ir->imm2; i++, cur_ir = cur_ir->next) {
404418
rv->X[cur_ir->rd] = cur_ir->imm;
405419
}
406420
rv->PC += ir->imm2 * ir->insn_len;
407421
if (unlikely(RVOP_NO_NEXT(ir)))
408422
return true;
409-
const rv_insn_t *next = ir + ir->imm2;
423+
const rv_insn_t *next = next_nth_insn(ir, ir->imm2);
410424
MUST_TAIL return next->impl(rv, next);
411425
}
412426

413427
/* LUI + ADD */
414-
static bool do_fuse2(riscv_t *rv, const rv_insn_t *ir)
428+
static bool do_fuse2(riscv_t *rv, rv_insn_t *ir)
415429
{
416430
rv->csr_cycle += 2;
417431
rv->X[ir->rd] = ir->imm;
418432
rv->X[ir->rs2] = rv->X[ir->rd] + rv->X[ir->rs1];
419433
rv->PC += 2 * ir->insn_len;
420434
if (unlikely(RVOP_NO_NEXT(ir)))
421435
return true;
422-
const rv_insn_t *next = ir + 2;
436+
const rv_insn_t *next = next_nth_insn(ir, 2);
423437
MUST_TAIL return next->impl(rv, next);
424438
}
425439

426440
/* multiple SW */
427-
static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir)
441+
static bool do_fuse3(riscv_t *rv, rv_insn_t *ir)
428442
{
429443
rv->csr_cycle += ir->imm2;
430444
opcode_fuse_t *fuse = ir->fuse;
@@ -442,12 +456,12 @@ static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir)
442456
rv->PC += ir->imm2 * ir->insn_len;
443457
if (unlikely(RVOP_NO_NEXT(ir)))
444458
return true;
445-
const rv_insn_t *next = ir + ir->imm2;
459+
const rv_insn_t *next = next_nth_insn(ir, ir->imm2);
446460
MUST_TAIL return next->impl(rv, next);
447461
}
448462

449463
/* multiple LW */
450-
static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir)
464+
static bool do_fuse4(riscv_t *rv, rv_insn_t *ir)
451465
{
452466
rv->csr_cycle += ir->imm2;
453467
opcode_fuse_t *fuse = ir->fuse;
@@ -465,7 +479,7 @@ static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir)
465479
rv->PC += ir->imm2 * ir->insn_len;
466480
if (unlikely(RVOP_NO_NEXT(ir)))
467481
return true;
468-
const rv_insn_t *next = ir + ir->imm2;
482+
const rv_insn_t *next = next_nth_insn(ir, ir->imm2);
469483
MUST_TAIL return next->impl(rv, next);
470484
}
471485

@@ -479,7 +493,7 @@ static bool do_fuse5(riscv_t *rv, const rv_insn_t *ir)
479493
rv->PC = rv->X[rv_reg_ra] & ~1U;
480494
if (unlikely(RVOP_NO_NEXT(ir)))
481495
return true;
482-
const rv_insn_t *next = ir + 1;
496+
const rv_insn_t *next = ir->next;
483497
MUST_TAIL return next->impl(rv, next);
484498
}
485499

@@ -493,7 +507,7 @@ static bool do_fuse6(riscv_t *rv, const rv_insn_t *ir)
493507
rv->PC = rv->X[rv_reg_ra] & ~1U;
494508
if (unlikely(RVOP_NO_NEXT(ir)))
495509
return true;
496-
const rv_insn_t *next = ir + 1;
510+
const rv_insn_t *next = ir->next;
497511
MUST_TAIL return next->impl(rv, next);
498512
}
499513

@@ -541,15 +555,21 @@ FORCE_INLINE bool insn_is_unconditional_branch(uint8_t opcode)
541555
return false;
542556
}
543557

544-
static void block_translate(riscv_t *rv, block_t *block)
558+
static void block_translate(riscv_t *rv, block_map_t *map, block_t *block)
545559
{
546560
block->pc_start = block->pc_end = rv->PC;
547561

562+
rv_insn_t *prev_ir = NULL;
563+
rv_insn_t *ir = mpool_alloc(map->block_ir_mp);
564+
block->ir_head = ir;
565+
548566
/* translate the basic block */
549-
while (block->n_insn < block->insn_capacity) {
550-
rv_insn_t *ir = block->ir + block->n_insn;
567+
while (true) {
551568
memset(ir, 0, sizeof(rv_insn_t));
552569

570+
if (prev_ir)
571+
prev_ir->next = ir;
572+
553573
/* fetch the next instruction */
554574
const uint32_t insn = rv->io.mem_ifetch(block->pc_end);
555575

@@ -564,21 +584,29 @@ static void block_translate(riscv_t *rv, block_t *block)
564584
/* compute the end of pc */
565585
block->pc_end += ir->insn_len;
566586
block->n_insn++;
587+
prev_ir = ir;
567588
/* stop on branch */
568589
if (insn_is_branch(ir->opcode))
569590
break;
591+
592+
ir = mpool_alloc(map->block_ir_mp);
570593
}
571-
block->ir[block->n_insn - 1].tailcall = true;
594+
595+
assert(prev_ir);
596+
block->ir_tail = prev_ir;
597+
block->ir_tail->tailcall = true;
572598
}
573599

574600
#define COMBINE_MEM_OPS(RW) \
575601
count = 1; \
576-
next_ir = ir + 1; \
602+
next_ir = ir->next; \
603+
tmp_ir = next_ir; \
577604
if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw)) \
578605
break; \
579606
sign = (ir->imm - next_ir->imm) >> 31 ? -1 : 1; \
580-
for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) { \
581-
next_ir = ir + j; \
607+
next_ir = tmp_ir; \
608+
for (uint32_t j = 1; j < block->n_insn - 1 - i; \
609+
j++, next_ir = next_ir->next) { \
582610
if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw) || \
583611
ir->rs1 != next_ir->rs1 || ir->imm - next_ir->imm != 4 * sign) \
584612
break; \
@@ -590,8 +618,8 @@ static void block_translate(riscv_t *rv, block_t *block)
590618
ir->imm2 = count; \
591619
memcpy(ir->fuse, ir, sizeof(opcode_fuse_t)); \
592620
ir->impl = dispatch_table[ir->opcode]; \
593-
for (int j = 1; j < count; j++) { \
594-
next_ir = ir + j; \
621+
next_ir = tmp_ir; \
622+
for (int j = 1; j < count; j++, next_ir = next_ir->next) { \
595623
memcpy(ir->fuse + j, next_ir, sizeof(opcode_fuse_t)); \
596624
} \
597625
ir->tailcall = next_ir->tailcall; \
@@ -825,7 +853,7 @@ static bool detect_memcpy(riscv_t *rv, int lib)
825853

826854
static bool libc_substitute(riscv_t *rv, block_t *block)
827855
{
828-
rv_insn_t *ir = block->ir, *next_ir = NULL;
856+
rv_insn_t *ir = block->ir_head, *next_ir = NULL;
829857
switch (ir->opcode) {
830858
case rv_insn_addi:
831859
/* Compare the target block with the first basic block of
@@ -835,10 +863,10 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
835863
* instruction sequence.
836864
*/
837865
if (ir->imm == 15 && ir->rd == rv_reg_t1 && ir->rs1 == rv_reg_zero) {
838-
next_ir = ir + 1;
866+
next_ir = ir->next;
839867
if (next_ir->opcode == rv_insn_addi && next_ir->rd == rv_reg_a4 &&
840868
next_ir->rs1 == rv_reg_a0 && next_ir->rs2 == rv_reg_zero) {
841-
next_ir = next_ir + 1;
869+
next_ir = next_ir->next;
842870
if (next_ir->opcode == rv_insn_bgeu && next_ir->imm == 60 &&
843871
next_ir->rs1 == rv_reg_t1 && next_ir->rs2 == rv_reg_a2) {
844872
if (detect_memset(rv, 1)) {
@@ -851,7 +879,7 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
851879
}
852880
} else if (ir->imm == 0 && ir->rd == rv_reg_t1 &&
853881
ir->rs1 == rv_reg_a0) {
854-
next_ir = ir + 1;
882+
next_ir = ir->next;
855883
if (next_ir->opcode == rv_insn_beq && next_ir->rs1 == rv_reg_a2 &&
856884
next_ir->rs2 == rv_reg_zero) {
857885
if (next_ir->imm == 20 && detect_memset(rv, 2)) {
@@ -876,14 +904,14 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
876904
*/
877905
if (ir->rd == rv_reg_a5 && ir->rs1 == rv_reg_a0 &&
878906
ir->rs2 == rv_reg_a1) {
879-
next_ir = ir + 1;
907+
next_ir = ir->next;
880908
if (next_ir->opcode == rv_insn_andi && next_ir->imm == 3 &&
881909
next_ir->rd == rv_reg_a5 && next_ir->rs1 == rv_reg_a5) {
882-
next_ir = next_ir + 1;
910+
next_ir = next_ir->next;
883911
if (next_ir->opcode == rv_insn_add &&
884912
next_ir->rd == rv_reg_a7 && next_ir->rs1 == rv_reg_a0 &&
885913
next_ir->rs2 == rv_reg_a2) {
886-
next_ir = next_ir + 1;
914+
next_ir = next_ir->next;
887915
if (next_ir->opcode == rv_insn_bne && next_ir->imm == 104 &&
888916
next_ir->rs1 == rv_reg_a5 &&
889917
next_ir->rs2 == rv_reg_zero) {
@@ -912,12 +940,15 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
912940
*/
913941
static void match_pattern(block_t *block)
914942
{
915-
for (uint32_t i = 0; i < block->n_insn - 1; i++) {
916-
rv_insn_t *ir = block->ir + i, *next_ir = NULL;
943+
uint32_t i;
944+
rv_insn_t *ir;
945+
for (i = 0, ir = block->ir_head; i < block->n_insn - 1;
946+
i++, ir = ir->next) {
947+
rv_insn_t *next_ir = NULL, *tmp_ir = NULL;
917948
int32_t count = 0, sign = 1;
918949
switch (ir->opcode) {
919950
case rv_insn_lui:
920-
next_ir = ir + 1;
951+
next_ir = ir->next;
921952
switch (next_ir->opcode) {
922953
case rv_insn_add:
923954
if (ir->rd == next_ir->rs2 || ir->rd == next_ir->rs1) {
@@ -940,7 +971,7 @@ static void match_pattern(block_t *block)
940971
count++;
941972
if (next_ir->tailcall)
942973
break;
943-
next_ir++;
974+
next_ir = next_ir->next;
944975
}
945976
ir->imm2 = count;
946977
ir->opcode = rv_insn_fuse1;
@@ -994,8 +1025,10 @@ static void optimize_constant(riscv_t *rv, block_t *block)
9941025
constopt_info_t constopt_info = {0};
9951026
constopt_info.is_constant[0] = true;
9961027
assert(rv->X[0] == 0);
997-
for (uint32_t i = 0; i < block->n_insn; i++) {
998-
rv_insn_t *ir = block->ir + i;
1028+
1029+
uint32_t i;
1030+
rv_insn_t *ir;
1031+
for (i = 0, ir = block->ir_head; i < block->n_insn; i++, ir = ir->next) {
9991032
((constopt_func_t) constopt_table[ir->opcode])(ir, &constopt_info);
10001033
}
10011034
}
@@ -1014,10 +1047,7 @@ static block_t *block_find_or_translate(riscv_t *rv)
10141047
}
10151048

10161049
/* allocate a new block */
1017-
next = block_alloc(10);
1018-
1019-
/* translate the basic block */
1020-
block_translate(rv, next);
1050+
next = block_alloc(rv, map);
10211051

10221052
if (!libc_substitute(rv, next)) {
10231053
optimize_constant(rv, next);
@@ -1075,27 +1105,27 @@ void rv_step(riscv_t *rv, int32_t cycles)
10751105
if (prev->pc_start != last_pc)
10761106
prev = block_find(&rv->block_map, last_pc);
10771107

1078-
rv_insn_t *last_ir = prev->ir + prev->n_insn - 1;
1108+
rv_insn_t *last_ir = prev->ir_tail;
10791109
/* chain block */
10801110
if (!insn_is_unconditional_branch(last_ir->opcode)) {
10811111
if (branch_taken && !last_ir->branch_taken)
1082-
last_ir->branch_taken = block->ir;
1112+
last_ir->branch_taken = block->ir_head;
10831113
else if (!last_ir->branch_untaken)
1084-
last_ir->branch_untaken = block->ir;
1114+
last_ir->branch_untaken = block->ir_head;
10851115
} else if (last_ir->opcode == rv_insn_jal
10861116
#if RV32_HAS(EXT_C)
10871117
|| last_ir->opcode == rv_insn_cj ||
10881118
last_ir->opcode == rv_insn_cjal
10891119
#endif
10901120
) {
10911121
if (!last_ir->branch_taken)
1092-
last_ir->branch_taken = block->ir;
1122+
last_ir->branch_taken = block->ir_head;
10931123
}
10941124
}
10951125
last_pc = rv->PC;
10961126

10971127
/* execute the block */
1098-
const rv_insn_t *ir = block->ir;
1128+
const rv_insn_t *ir = block->ir_head;
10991129
if (unlikely(!ir->impl(rv, ir)))
11001130
break;
11011131

0 commit comments

Comments
 (0)