Skip to content

Conversation

@folkertdev
Copy link
Member

Turns out the default is really not as efficient as it could be; compare:

# rotate_right_1 (first listing): rotates a byte buffer right by one position.
# Register use inferred from the code below: rdi = base address of the buffer,
# rsi = its length in bytes (presumably the System V AMD64 argument registers
# -- TODO confirm the target ABI).
# Paths visible in this listing:
#   length 0      -> call to core::panicking::panic
#   length 1      -> return without touching the buffer
#   length 2..23  -> inline byte-by-byte loop
#   length >= 24  -> memmove of the first length-1 bytes up by one
.section .text.rotate_right_1,"ax",@progbits
	.globl	rotate_right_1
	.p2align	4, 0x90
	.type	rotate_right_1,@function
rotate_right_1:
	.cfi_startproc
	# Prologue runs unconditionally, before the length is inspected.
	# rbp and rbx are saved because they are overwritten below.
	push rbp
	.cfi_def_cfa_offset 16
	push rbx
	.cfi_def_cfa_offset 24
	# The pushed rax value is never read back (the epilogue drops it with
	# add rsp, 8); presumably this only keeps rsp 16-byte aligned at the calls.
	push rax
	.cfi_def_cfa_offset 32
	.cfi_offset rbx, -24
	.cfi_offset rbp, -16
	# length == 0: go to the panic block.
	test rsi, rsi
	je .LBB32_11
	# rdx = length - 1 (index of the last byte). The zero flag set by dec
	# means length == 1, in which case there is nothing to rotate.
	mov rdx, rsi
	dec rdx
	je .LBB32_5
	mov rbx, rdi
	# Lengths of 24 and above take the memmove path.
	cmp rsi, 24
	jae .LBB32_3
	# Small-length path setup:
	#   ebp = byte at index 0, the value carried into the next slot
	#   rdi = 1, the current index
	#   rcx = 1 - length, added to the index once the walk reaches the end
	#   rax = 1 and rsi = 0 are only used by the cmovb pair further down
	movzx ebp, byte ptr [rbx]
	mov eax, 1
	mov ecx, 1
	sub rcx, rsi
	xor esi, esi
	mov edi, 1
	jmp .LBB32_7
	.p2align	4, 0x90
.LBB32_8:
	inc rdi
.LBB32_7:
	# Exchange the carried byte with the byte at index rdi: the old value is
	# loaded into ebp after the carried value has been copied to r8d.
	mov r8d, ebp
	movzx ebp, byte ptr [rbx + rdi]
	mov byte ptr [rbx + rdi], r8b
	# Keep stepping while the index is below length - 1.
	cmp rdi, rdx
	jb .LBB32_8
	# rdi = rdi + 1 - length. A result of zero means the walk wrapped back to
	# index 0, so the carried byte is stored there at .LBB32_4.
	add rdi, rcx
	je .LBB32_4
	# NOTE(review): rdi starts at 1 and advances by 1 until it equals
	# length - 1, so the sum above looks like it is always zero here and this
	# cmovb/jmp sequence appears unreachable -- presumably residue of a more
	# general rotate algorithm; confirm.
	cmp rdi, rax
	cmovb rax, rsi
	cmovb rdi, rsi
	jmp .LBB32_7
.LBB32_3:
	# Large-length path: keep the last byte in ebp, then call
	# memmove(base + 1, base, length - 1); rdx still holds length - 1.
	lea rdi, [rbx + 1]
	movzx ebp, byte ptr [rbx + rdx]
	mov rsi, rbx
	call qword ptr [rip + memmove@GOTPCREL]
.LBB32_4:
	# Both paths finish by writing the byte held in ebp to index 0.
	mov byte ptr [rbx], bpl
.LBB32_5:
	# Epilogue: drop the rax slot, then restore rbx and rbp.
	add rsp, 8
	.cfi_def_cfa_offset 24
	pop rbx
	.cfi_def_cfa_offset 16
	pop rbp
	.cfi_def_cfa_offset 8
	ret
.LBB32_11:
	# Reached from the length == 0 check with all three pushes still on the
	# stack, hence the CFA offset is set back to 32 for this block.
	.cfi_def_cfa_offset 32
	# Arguments for the panic call: two addresses and the constant 33 --
	# presumably message pointer, message length and source location; verify
	# against the signature of core::panicking::panic.
	lea rdi, [rip + .L__unnamed_283]
	lea rdx, [rip + .L__unnamed_284]
	mov esi, 33
	# No instruction follows this call in the listing; presumably the callee
	# does not return.
	call qword ptr [rip + core::panicking::panic@GOTPCREL]

versus

# rotate_right_1 (second listing): rotates a byte buffer right by one position
# using a single memmove call.
# Register use inferred from the code below: rdi = base address of the buffer,
# rsi = its length in bytes (presumably the System V AMD64 argument registers
# -- TODO confirm the target ABI).
# Unlike the first listing, length 0 simply returns (no panic call is present)
# and there is no inline loop or length threshold: every non-zero length goes
# through memmove, including length 1 (where the byte count passed is 0).
.section .text.rotate_right_1,"ax",@progbits
	.globl	rotate_right_1
	.p2align	4, 0x90
	.type	rotate_right_1,@function
rotate_right_1:
	.cfi_startproc
	# length == 0: return immediately, before any stack adjustment.
	test rsi, rsi
	je .LBB32_2
	# Prologue only on the non-empty path. rbp and rbx are saved because they
	# are overwritten below.
	push rbp
	.cfi_def_cfa_offset 16
	push rbx
	.cfi_def_cfa_offset 24
	# The pushed rax value is never read back (dropped with add rsp, 8 below);
	# presumably this only keeps rsp 16-byte aligned at the call.
	push rax
	.cfi_def_cfa_offset 32
	.cfi_offset rbx, -24
	.cfi_offset rbp, -16
	# ebp = last byte of the buffer, rdx = length - 1 (the byte count to move).
	mov rdx, rsi
	movzx ebp, byte ptr [rdi + rsi - 1]
	dec rdx
	# Set up memmove(base + 1, base, length - 1); rbx keeps the base address
	# for the store after the call.
	lea rax, [rdi + 1]
	mov rbx, rdi
	mov rdi, rax
	mov rsi, rbx
	call qword ptr [rip + memmove@GOTPCREL]
	# Write the saved last byte to index 0, completing the rotation.
	mov byte ptr [rbx], bpl
	# Epilogue: drop the rax slot, then restore rbx and rbp.
	add rsp, 8
	.cfi_def_cfa_offset 24
	pop rbx
	.cfi_def_cfa_offset 16
	pop rbp
	.cfi_def_cfa_offset 8
	# The unwind info marks rbx and rbp as restored because .LBB32_2 is also
	# reached directly from the entry check, where they were never saved.
	.cfi_restore rbx
	.cfi_restore rbp
.LBB32_2:
	ret 

This gives a good speedup on some of the examples:

Benchmark 2 (8 runs): target/release/examples/decompress rs tests/input/bzip2-testfiles/commons-compress/zip64support.tar.bz2
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           697ms ± 1.69ms     695ms …  699ms          0 ( 0%)        ⚡-  2.2% ±  0.6%
  peak_rss            116MB ± 97.5KB     116MB …  116MB          0 ( 0%)          +  0.1% ±  0.1%
  cpu_cycles         2.99G  ± 8.43M     2.98G  … 3.00G           0 ( 0%)        ⚡-  2.3% ±  0.6%
  instructions       5.99G  ±  250      5.99G  … 5.99G           0 ( 0%)        ⚡-  4.1% ±  0.0%
  cache_references   91.1M  ±  173K     90.8M  … 91.3M           0 ( 0%)          -  0.5% ±  0.3%
  cache_misses       17.3M  ±  164K     17.0M  … 17.5M           1 (13%)        ⚡-  3.2% ±  0.9%
  branch_misses      7.41M  ± 19.4K     7.39M  … 7.45M           0 ( 0%)        ⚡-  5.9% ±  0.3%

@folkertdev folkertdev requested a review from bjorn3 May 30, 2025 08:45
@codecov
Copy link

codecov bot commented May 30, 2025

Codecov Report

All modified and coverable lines are covered by tests ✅

Flag Coverage Δ
fuzz-compress 51.00% <100.00%> (+0.06%) ⬆️
fuzz-decompress 30.17% <100.00%> (-1.99%) ⬇️
test-aarch64-apple-darwin 90.06% <100.00%> (+<0.01%) ⬆️
test-x86_64-apple-darwin 90.16% <100.00%> (+<0.01%) ⬆️
test-x86_64-unknown-linux-gnu 90.06% <100.00%> (+<0.01%) ⬆️

Flags with carried forward coverage won't be shown. Click here to find out more.

Files with missing lines Coverage Δ
libbz2-rs-sys/src/decompress.rs 97.61% <100.00%> (-0.52%) ⬇️

... and 1 file with indirect coverage changes

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@folkertdev folkertdev merged commit 40b32c6 into main May 30, 2025
19 of 23 checks passed
@folkertdev folkertdev deleted the specialize-rotate-right branch May 30, 2025 11:29
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants