From b24431ca25e71f7e8d9c409e83ad1d06e8404ebc Mon Sep 17 00:00:00 2001
From: Yen-Fu Chen
Date: Tue, 21 Nov 2023 17:11:52 +0800
Subject: [PATCH] Apply branch prediction for indirect jump

Previously, it was necessary to perform a block cache lookup at the end
of an indirect jump emulation; however, the associated overhead of this
operation proved to be substantial. To mitigate this overhead, we have
introduced a branch history table that captures the historical data of
indirect jump targets. Given the limited number of entries in the branch
history table, the lookup overhead is significantly reduced.

As shown in the performance analysis provided below, the branch history
table has demonstrably enhanced the overall performance.

| Metric    | original     | proposed     |
|-----------+--------------+--------------|
| Dhrystone | 2932.3 DMIPS | 2985.2 DMIPS |
| CoreMark  | 2231 iter/s  | 2236 iter/s  |
| Stream    | 76.04 sec    | 75.299 sec   |
| Nqueens   | 4.069 sec    | 3.933 sec    |
---
 src/decode.h        |  9 ++++++++-
 src/emulate.c       | 40 ++++++++++++++++++++++++----------------
 src/riscv.c         |  1 +
 src/rv32_template.c | 36 +++++++++++++++++++++++++++---------
 4 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/src/decode.h b/src/decode.h
index 874d8fe47..159e36e2b 100644
--- a/src/decode.h
+++ b/src/decode.h
@@ -252,6 +252,12 @@ typedef struct {
     uint8_t opcode;
 } opcode_fuse_t;
 
+#define HISTORY_SIZE 16 /* entries allocated per branch history table */
+typedef struct {
+    uint32_t PC;                   /* indirect jump target address */
+    struct rv_insn *branch_target; /* head IR of the block found at PC */
+} branch_history_entry_t;
+
 typedef struct rv_insn {
     union {
         int32_t imm;
@@ -294,7 +300,8 @@ typedef struct rv_insn {
      * specific IR array without the need for additional copying.
      */
     struct rv_insn *branch_taken, *branch_untaken;
-
+    uint8_t branch_table_count;           /* cursor, wraps at HISTORY_SIZE */
+    branch_history_entry_t *branch_table; /* indirect jump history, or NULL */
 } rv_insn_t;
 
 /* decode the RISC-V instruction */
diff --git a/src/emulate.c b/src/emulate.c
index b5c45e954..197b88c3b 100644
--- a/src/emulate.c
+++ b/src/emulate.c
@@ -371,21 +371,21 @@ static bool is_branch_taken = false;
 static uint32_t last_pc = 0;
 
 /* Interpreter-based execution path */
-#define RVOP(inst, code)                                                    \
-    static bool do_##inst(riscv_t *rv, const rv_insn_t *ir, uint64_t cycle, \
-                          uint32_t PC)                                      \
-    {                                                                       \
-        cycle++;                                                            \
-        code;                                                               \
-    nextop:                                                                 \
-        PC += __rv_insn_##inst##_len;                                       \
-        if (unlikely(RVOP_NO_NEXT(ir))) {                                   \
-            rv->csr_cycle = cycle;                                          \
-            rv->PC = PC;                                                    \
-            return true;                                                    \
-        }                                                                   \
-        const rv_insn_t *next = ir->next;                                   \
-        MUST_TAIL return next->impl(rv, next, cycle, PC);                   \
+#define RVOP(inst, code)                                              \
+    static bool do_##inst(riscv_t *rv, rv_insn_t *ir, uint64_t cycle, \
+                          uint32_t PC)                                \
+    {                                                                 \
+        cycle++;                                                      \
+        code;                                                         \
+    nextop:                                                           \
+        PC += __rv_insn_##inst##_len;                                 \
+        if (unlikely(RVOP_NO_NEXT(ir))) {                             \
+            rv->csr_cycle = cycle;                                    \
+            rv->PC = PC;                                              \
+            return true;                                              \
+        }                                                             \
+        const rv_insn_t *next = ir->next;                             \
+        MUST_TAIL return next->impl(rv, next, cycle, PC);             \
     }
 
 #include "rv32_template.c"
@@ -633,8 +633,16 @@ static void block_translate(riscv_t *rv, block_t *block)
         block->n_insn++;
         prev_ir = ir;
         /* stop on branch */
-        if (insn_is_branch(ir->opcode))
+        if (insn_is_branch(ir->opcode)) {
+            if (ir->opcode == rv_insn_jalr
+#if RV32_HAS(EXT_C)
+                || ir->opcode == rv_insn_cjalr || ir->opcode == rv_insn_cjr
+#endif
+            )
+                ir->branch_table =
+                    calloc(HISTORY_SIZE, sizeof(*ir->branch_table));
             break;
+        }
 
         ir = mpool_alloc(rv->block_ir_mp);
     }
diff --git a/src/riscv.c b/src/riscv.c
index 68104b3fa..f4807758f 100644
--- a/src/riscv.c
+++ b/src/riscv.c
@@ -34,6 +34,7 @@ void block_map_clear(riscv_t *rv)
         for (idx = 0, ir = block->ir_head; idx < block->n_insn;
              idx++, ir = next) {
             free(ir->fuse);
+            free(ir->branch_table);
             next = ir->next;
             mpool_free(rv->block_ir_mp, ir);
         }
diff --git a/src/rv32_template.c b/src/rv32_template.c
index f69ea96d5..8d08f0443 100644
--- a/src/rv32_template.c
+++ b/src/rv32_template.c
@@ -41,6 +41,30 @@ RVOP(jal, {
     return true;
 })
 
+/* The branch history table records historical data pertaining to indirect jump
+ * targets. This functionality alleviates the need to invoke block_find() and
+ * incurs overhead only when the indirect jump targets are not previously
+ * recorded. Additionally, the C code generator can reference the branch history
+ * table to link the indirect jump targets.
+ */
+#define LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE()                               \
+    /* lookup: a NULL target marks a slot that was never filled */            \
+    for (int i = 0; i < HISTORY_SIZE; i++) {                                  \
+        rv_insn_t *cached = ir->branch_table[i].branch_target;                \
+        if (cached && ir->branch_table[i].PC == PC) {                         \
+            MUST_TAIL return cached->impl(rv, cached, cycle, PC);             \
+        }                                                                     \
+    }                                                                         \
+    block_t *block = block_find(&rv->block_map, PC);                          \
+    if (block) {                                                              \
+        /* update: write at the cursor, then advance it (round-robin) */      \
+        ir->branch_table[ir->branch_table_count].PC = PC;                     \
+        ir->branch_table[ir->branch_table_count].branch_target =              \
+            block->ir_head;                                                   \
+        ir->branch_table_count = (ir->branch_table_count + 1) % HISTORY_SIZE; \
+        MUST_TAIL return block->ir_head->impl(rv, block->ir_head, cycle, PC); \
+    }
+
 /* The indirect jump instruction JALR uses the I-type encoding. The target
  * address is obtained by adding the sign-extended 12-bit I-immediate to the
  * register rs1, then setting the least-significant bit of the result to zero.
@@ -57,9 +81,7 @@ RVOP(jalr, {
     rv->X[ir->rd] = pc + 4;
     /* check instruction misaligned */
     RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0);
-    block_t *block = block_find(&rv->block_map, PC);
-    if (block)
-        MUST_TAIL return block->ir_head->impl(rv, block->ir_head, cycle, PC);
+    LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE();
     rv->csr_cycle = cycle;
     rv->PC = PC;
     return true;
@@ -1016,9 +1038,7 @@ RVOP(clwsp, {
 /* C.JR */
 RVOP(cjr, {
     PC = rv->X[ir->rs1];
-    block_t *block = block_find(&rv->block_map, PC);
-    if (block)
-        MUST_TAIL return block->ir_head->impl(rv, block->ir_head, cycle, PC);
+    LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE();
     rv->csr_cycle = cycle;
     rv->PC = PC;
     return true;
@@ -1043,9 +1063,7 @@ RVOP(cjalr, {
     rv->X[rv_reg_ra] = PC + 2;
     PC = jump_to;
     RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0);
-    block_t *block = block_find(&rv->block_map, PC);
-    if (block)
-        MUST_TAIL return block->ir_head->impl(rv, block->ir_head, cycle, PC);
+    LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE();
     rv->csr_cycle = cycle;
     rv->PC = PC;
     return true;