Assembly optimisations

I was tinkering with the generated assembly, and I had two ideas that I'd like to propose:

1. Mark the lazy evaluation as cold
2. Eager evaluation mode
 
The examples are from macOS aarch64, but they should be relevant for all supported platforms:

### 1. Mark the lazy evaluation as cold

https://github.com/oxidecomputer/usdt/blob/9a58fa8a730daabda2633382cb96eaa4f9d9a531/probe-test-attr/src/main.rs#L98 

This probe generates the following assembly:
```asm
    0x100001440 <+196>:  mov    x0, #0x0 ; =0
    0x100001444 <+200>:  cbz    w0, 0x10000172c ; <+944>
    ; this is the cold path of the code with a probe attached
    0x100001448 <+204>:  ldr    x8, [sp, #0x18]
    0x10000144c <+208>:  stur   x8, [x29, #-0x90]
    0x100001450 <+212>:  adrp   x8, 48
    0x100001454 <+216>:  add    x8, x8, #0x4b4 ; core::fmt::num::imp::_$LT$impl$u20$core..fmt..Display$u20$for$u20$u8$GT$::fmt::hcb95427a99b1dfea
    0x100001458 <+220>:  stur   x8, [x29, #-0x88]
    0x10000145c <+224>:  adrp   x8, 75
    0x100001460 <+228>:  add    x8, x8, #0x70 ; anon.2e6bf7caf09a98e22fab005eb5173d2b.17 + 48
    0x100001464 <+232>:  stp    x8, x21, [sp, #0xb0]
    0x100001468 <+236>:  sub    x8, x29, #0x90
    0x10000146c <+240>:  stp    x8, x21, [sp, #0xc0]
    0x100001470 <+244>:  str    xzr, [sp, #0xd0]
    0x100001474 <+248>:  add    x8, sp, #0x78
    0x100001478 <+252>:  add    x0, sp, #0xb0
    0x10000147c <+256>:  bl     0x100030778    ; alloc::fmt::format::format_inner::hfda0adfe3db50518
; ... 632 bytes later
    0x1000016f4 <+888>:  mov    x0, x26
    0x1000016f8 <+892>:  mov    x1, x25
    0x1000016fc <+896>:  nop
    0x100001700 <+900>:  cbz    x24, 0x100001714 ; <+920>
    0x100001704 <+904>:  mov    x0, x25
    0x100001708 <+908>:  mov    x1, x24
    0x10000170c <+912>:  mov    w2, #0x1 ; =1
    0x100001710 <+916>:  bl     0x1000029f4    ; __rustc::__rust_dealloc
    0x100001714 <+920>:  ldr    x1, [sp, #0x90]
    0x100001718 <+924>:  cbz    x1, 0x100001728 ; <+940>
    0x10000171c <+928>:  mov    x0, x26
    0x100001720 <+932>:  mov    w2, #0x1 ; =1
    0x100001724 <+936>:  bl     0x1000029f4    ; __rustc::__rust_dealloc
    0x100001728 <+940>:  str    x23, [sp, #0x20]
    ; now is the rest of the hot path code
```

This is quite a lot of code to generate for a no-op probe. Yes, the branch predictor should see right through the `mov x0, #0x0; cbz w0, 0x10000172c`, but the CPU still has to jump past the cold code and then load the instructions at the branch target.

We can create a `#[cold] fn cold() {}` and call this cold function on the probe-enabled path. This usually has the effect that the `cbz` becomes a `cbnz`, with the probe code moved out of line. Here's the code that is now generated for the same probe:

```asm
    0x10000143c <+192>:  mov    x0, #0x0 ; =0
    0x100001440 <+196>:  cbnz   w0, 0x1000014b8 ; <+316>
    ; now is the rest of the hot path code
; ... 120 bytes later
    ; this is the cold path of the code with a probe attached, much later on
    0x1000014b8 <+316>:  ldr    x8, [sp, #0x18]
    0x1000014bc <+320>:  stur   x8, [x29, #-0x90]
    0x1000014c0 <+324>:  adrp   x8, 48
    0x1000014c4 <+328>:  add    x8, x8, #0x4dc ; core::fmt::num::imp::_$LT$impl$u20$core..fmt..Display$u20$for$u20$u8$GT$::fmt::hcb95427a99b1dfea
    0x1000014c8 <+332>:  stur   x8, [x29, #-0x88]
    0x1000014cc <+336>:  adrp   x8, 75
    0x1000014d0 <+340>:  add    x8, x8, #0x70 ; anon.2e6bf7caf09a98e22fab005eb5173d2b.17 + 48
    0x1000014d4 <+344>:  stp    x8, x21, [sp, #0xb0]
    0x1000014d8 <+348>:  sub    x8, x29, #0x90
    0x1000014dc <+352>:  stp    x8, x21, [sp, #0xc0]
    0x1000014e0 <+356>:  str    xzr, [sp, #0xd0]
    0x1000014e4 <+360>:  add    x8, sp, #0x78
    0x1000014e8 <+364>:  add    x0, sp, #0xb0
    0x1000014ec <+368>:  bl     0x1000307a0    ; alloc::fmt::format::format_inner::hfda0adfe3db50518
; ... 564 bytes later
    0x100001720 <+932>:  bl     0x100002a1c    ; __rustc::__rust_dealloc
    0x100001724 <+936>:  ldp    x23, x24, [x29, #-0xe0]
    0x100001728 <+940>:  sub    x0, x29, #0x90
    0x10000172c <+944>:  bl     0x100002244    ; core::ptr::drop_in_place$LT$usdt_impl..Error$GT$::h423cc3b4e1ca91c8
    0x100001730 <+948>:  mov    x0, x24
    0x100001734 <+952>:  nop
    0x100001738 <+956>:  cbz    x23, 0x10000145c ; <+224>
    0x10000173c <+960>:  mov    x0, x24
    0x100001740 <+964>:  mov    x1, x23
    0x100001744 <+968>:  mov    w2, #0x1 ; =1
    0x100001748 <+972>:  bl     0x100002a1c    ; __rustc::__rust_dealloc
    0x10000174c <+976>:  b      0x10000145c    ; <+224>
```

### 2. Eager evaluation mode

https://github.com/oxidecomputer/usdt/blob/9a58fa8a730daabda2633382cb96eaa4f9d9a531/probe-test-attr/src/main.rs#L95

This probe generates the following assembly:

```asm
    0x100001410 <+148>:  mov    x0, #0x0 ; =0
    0x100001414 <+152>:  cbz    w0, 0x100001420 ; <+164>
    0x100001418 <+156>:  ldrb   w0, [sp, #0x40]
    0x10000141c <+160>:  nop
```

Nice and short. With the prior `#[cold]` optimisation, though, we get the following asm:

```asm
    0x100001410 <+148>:  mov    x0, #0x0 ; =0
    0x100001414 <+152>:  cbnz   w0, 0x1000014a4 ; <+296>
    ; now is the rest of the hot path code
; ... 144 bytes later
    ; this is the cold path of the code with a probe attached, much later on
    0x1000014a4 <+296>:  ldrb   w0, [sp, #0x40]
    0x1000014a8 <+300>:  nop
    0x1000014ac <+304>:  b      0x100001418    ; <+156>
```

This is definitely not ideal. But why do we even bother with the branch here? Yes, the branch predictor should make light work of it, but it seems more prudent to have the following:

```asm
    0x100001410 <+148>:  ldrb   w0, [sp, #0x40]
    0x100001414 <+152>:  nop
```

To accomplish this I'd suggest exposing "eager" macros for each probe.

---

None of this has been benchmarked yet. I am opening this issue to discuss the tradeoffs and to see if there's anything I am missing. You can see my experimentation code here https://github.com/conradludgate/usdt/commit/a3abbbdbbf1c1f276d945c72a2a0b085f4795b06.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Assembly optimisations #490

1. Mark the lazy evaluation as cold

2. Eager evaluation mode

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Assembly optimisations #490

Description

1. Mark the lazy evaluation as cold

2. Eager evaluation mode

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions