Skip to content

[RISC-V] codegen seems not ideal #361

Closed
@japaric

Description

@japaric

Disclaimer: I don't know much about the RISC-V ISA, so this may just be my
poor understanding.

Some things I observed while working with the hifive1 board.

NOTE: wherever the (compilation) target is not specified,
riscv32imac-unknown-none-elf was used

jalr instead of jal

All direct function calls seem to use some sort of relocatable form. Example
below:

#[no_mangle]
unsafe fn _start() -> ! {
    foo();

    loop {}
}

#[inline(never)]
#[no_mangle]
unsafe fn foo() {
    asm!("" : : : "memory" : "volatile")
}
$ cargo objdump --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 _start:
   11000:       addi    sp, sp, -16
   11002:       sw      ra, 12(sp)
   11004:       sw      s0, 8(sp)
   11006:       addi    s0, sp, 16
   11008:       auipc   ra, 0
   1100c:       jalr    ra, ra, 10
   11010:       j       0

0000000000011012 foo:
   11012:       addi    sp, sp, -16
   11014:       sw      ra, 12(sp)
   11016:       sw      s0, 8(sp)
   11018:       addi    s0, sp, 16
   1101a:       lw      s0, 8(sp)
   1101c:       lw      ra, 12(sp)
   1101e:       addi    sp, sp, 16
   11020:       ret

That auipc, jalr combination looks like relocatable code to me (not that I
know much about relocatable code). I was expecting to see jal function calls
of this form:

global_asm!(r#"
  .global _start
_start:
  addi    sp, sp, -16
  sw      ra, 12(sp)
  sw      s0, 8(sp)
  addi    s0, sp, 16
  jal     foo
  j       0
"#);

#[inline(never)]
#[no_mangle]
unsafe fn foo() {
    asm!("" : : : "memory" : "volatile")
}
$ cargo objdump --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 _start:
   11000:       addi    sp, sp, -16
   11002:       sw      ra, 12(sp)
   11004:       sw      s0, 8(sp)
   11006:       addi    s0, sp, 16
   11008:       jal     6
   1100c:       j       0

000000000001100e foo:
   1100e:       addi    sp, sp, -16
   11010:       sw      ra, 12(sp)
   11012:       sw      s0, 8(sp)
   11014:       addi    s0, sp, 16
   11016:       lw      s0, 8(sp)
   11018:       lw      ra, 12(sp)
   1101a:       addi    sp, sp, 16
   1101c:       ret

I expected the jal form because the target specification says
relocation-model: "static":

$ rustc -Z unstable-options --print target-spec-json --target riscv32imac-unknown-none-elf
(..)
  "relocation-model": "static",
(..)

Using jal instead of the auipc + jalr pair would save 4 bytes of .text per
function call.

atomic::compiler_fence produces an instruction

This program shows that atomic::compiler_fence produces a fence instruction:

#[no_mangle]
unsafe fn _start() -> ! {
    atomic::compiler_fence(Ordering::SeqCst);

    loop {}
}
$ cargo objdump --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 _start:
   11000:       addi    sp, sp, -16
   11002:       sw      ra, 12(sp)
   11004:       sw      s0, 8(sp)
   11006:       addi    s0, sp, 16
   11008:       fence   rw, rw
   1100c:       j       0

This happens even though the documentation of the function states that
"compiler_fence does not emit any machine code".

As it is, both atomic::fence and atomic::compiler_fence generate the same
machine code.

#[no_mangle]
unsafe fn _start() -> ! {
    atomic::fence(Ordering::SeqCst);

    loop {}
}
$ cargo objdump --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 _start:
   11000:       addi    sp, sp, -16
   11002:       sw      ra, 12(sp)
   11004:       sw      s0, 8(sp)
   11006:       addi    s0, sp, 16
   11008:       fence   rw, rw
   1100c:       j       0

You don't see this behavior with the ARM Cortex-M backend:

#[no_mangle]
unsafe fn _start() -> ! {
    atomic::compiler_fence(Ordering::SeqCst);

    loop {}
}
$ cargo objdump --target thumbv7m-none-eabi --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 _start:
   11000:       b       #-4 <_start>

intrinsics::abort != UNIMP

Calling intrinsics::abort produces a function call to the abort symbol even
though the UNIMP instruction exists.

#[no_mangle]
unsafe fn _start() -> ! {
    core::intrinsics::abort()
}

global_asm!(r#"
  .global abort
abort:
  UNIMP
"#);
$ cargo objdump --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 abort:
   11000:       unimp

0000000000011002 _start:
   11002:       addi    sp, sp, -16
   11004:       sw      ra, 12(sp)
   11006:       sw      s0, 8(sp)
   11008:       addi    s0, sp, 16
   1100a:       auipc   ra, 0
   1100e:       jalr    ra, ra, -10
   11012:       auipc   ra, 0
   11016:       jalr    ra, ra, -18

I was actually expecting something like this to be generated:

global_asm!(r#"
  .global _start
_start:
  UNIMP
"#);
$ cargo objdump --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 _start:
   11000:       unimp

noreturn nounwind & divergent functions

The riscv32imac-unknown-none-elf target uses panic-strategy: "abort"; that
tells the backend that functions never unwind, so divergent functions
(fn() -> !), which never return and are marked as noreturn nounwind in LLVM IR,
should not need to preserve the callee-saved registers on behalf of their
caller. However, one observes register stacking in the following program:

#[no_mangle]
unsafe fn _start() -> ! {
    // (RISC-V has so many registers)
    asm!("" : :
         "r"(0) "r"(1) "r"(2) "r"(3) "r"(4) "r"(5) "r"(6) "r"(7) "r"(8) "r"(9)
         "r"(10) "r"(11) "r"(12) "r"(13) "r"(14) "r"(15) "r"(16)
         : : "volatile");

    loop {}
}
$ cargo objdump --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 _start:
   11000:       addi    sp, sp, -16
   11002:       sw      ra, 12(sp)
   11004:       sw      s0, 8(sp)
   11006:       sw      s1, 4(sp)
   11008:       addi    s0, sp, 16
   1100a:       addi    a6, zero, 1
   1100c:       addi    a7, zero, 2
   1100e:       addi    t0, zero, 3
   11010:       addi    t1, zero, 4
   11012:       addi    t2, zero, 5
   11014:       addi    t3, zero, 6
   11016:       addi    t4, zero, 7
   11018:       addi    t5, zero, 8
   1101a:       addi    t6, zero, 9
   1101c:       addi    a3, zero, 10
   1101e:       addi    a4, zero, 11
   11020:       addi    a5, zero, 12
   11022:       addi    a0, zero, 13
   11024:       addi    a1, zero, 14
   11026:       addi    a2, zero, 15
   11028:       addi    s1, zero, 16
   1102a:       j       0

s0 and s1 are pushed onto the stack but never popped. Is that required by
the ISA / C ABI?

(off-topic: why is ra also being pushed onto the stack when _start performs
no function call?)

Compare the previous program to this ARM Cortex-M program:

#[no_mangle]
unsafe fn _start() -> ! {
    asm!("" : :
         "r"(0) "r"(1) "r"(2) "r"(3) "r"(4) "r"(5) "r"(6) "r"(7) "r"(8) "r"(9)
         "r"(10) "r"(11) "r"(12) "r"(13)
         : : "volatile");

    loop {}
}
$ cargo objdump --target thumbv7m-none-eabi --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 _start:
   11000:       mov.w   r12, #0
   11004:       mov.w   lr, #1
   11008:       mov.w   r8, #2
   1100c:       mov.w   r9, #3
   11010:       mov.w   r10, #4
   11014:       mov.w   r11, #5
   11018:       movs    r4, #6
   1101a:       movs    r5, #7
   1101c:       movs    r6, #8
   1101e:       movs    r7, #9
   11020:       movs    r2, #10
   11022:       movs    r3, #11
   11024:       movs    r0, #12
   11026:       movs    r1, #13
   11028:       b       #-4 <_start+0x28>

Registers are never pushed onto the stack.

Compare that machine code to the machine code generated for a non-divergent
function:

#[no_mangle]
unsafe fn _start() {
    asm!("" : :
         "r"(0) "r"(1) "r"(2) "r"(3) "r"(4) "r"(5) "r"(6) "r"(7) "r"(8) "r"(9)
         "r"(10) "r"(11) "r"(12) "r"(13)
         : : "volatile");
}
$ cargo objdump --target thumbv7m-none-eabi --bin app --release -- -C -d -no-show-raw-insn
0000000000011000 _start:
   11000:       push.w  {r4, r5, r6, r7, r8, r9, r10, r11, lr}
   11004:       mov.w   r12, #0
   11008:       mov.w   lr, #1
   1100c:       mov.w   r8, #2
   11010:       mov.w   r9, #3
   11014:       mov.w   r10, #4
   11018:       mov.w   r11, #5
   1101c:       movs    r4, #6
   1101e:       movs    r5, #7
   11020:       movs    r6, #8
   11022:       movs    r7, #9
   11024:       movs    r2, #10
   11026:       movs    r3, #11
   11028:       movs    r0, #12
   1102a:       movs    r1, #13
   1102c:       pop.w   {r4, r5, r6, r7, r8, r9, r10, r11, pc}

Registers are pushed in the prologue and then popped in the epilogue of the
function.


It would be good to check whether these backend issues have been fixed in the
latest version of LLVM (using llc), because rustc is using an LLVM that is
several months old. If a recent commit shows the same issues, then we may want
to submit bug reports to the LLVM project.

Metadata

$ rustc -Vv
rustc 1.37.0-nightly (04a3dd8a8 2019-06-18)
binary: rustc
commit-hash: 04a3dd8a872633ca1e4c217d11f741cc35cb19a5
commit-date: 2019-06-18
host: x86_64-unknown-linux-gnu
release: 1.37.0-nightly
LLVM version: 8.0

cc @rust-embedded/riscv @Disasm

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions