Skip to content

Commit 0b8324b

Browse files
committed
Re-implement case set as a macro
1 parent 412cd4c commit 0b8324b

File tree

6 files changed

+598
-367
lines changed

6 files changed

+598
-367
lines changed

lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,12 @@ pub mod src {
3737
mod cdf;
3838
mod const_fn;
3939
pub mod cpu;
40-
mod ctx;
40+
pub mod ctx;
4141
mod cursor;
4242
mod data;
4343
mod decode;
4444
mod dequant_tables;
45-
pub(crate) mod disjoint_mut;
45+
pub mod disjoint_mut;
4646
pub(crate) mod enum_map;
4747
mod env;
4848
pub(crate) mod error;

src/ctx.rs

Lines changed: 235 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
//! The [`CaseSet`] API below is a safe and simplified version of the `case_set*` macros in `ctx.h`.
1+
//! The [`case_set!`] macro is a safe and simplified version of the `case_set*`
2+
//! macros in `ctx.h`.
23
//!
34
//! The `case_set*` macros themselves replaced `memset`s in order to further optimize them
45
//! (in e3b5d4d044506f9e0e95e79b3de42fd94386cc61,
@@ -13,135 +14,249 @@
1314
//! as unaligned writes are UB, and so we'd need to check at runtime if they're aligned
1415
//! (a runtime-determined `off`set is used, so we can't reasonably ensure this at compile-time).
1516
//!
16-
//! To more thoroughly check this, I ran the same benchmarks done in
17-
//! e3b5d4d044506f9e0e95e79b3de42fd94386cc61, which introduced the `case_set*` macros:
17+
//! We also want to avoid multiple switches when setting a group of buffers as
18+
//! the C implementation did, which was implemented in
19+
//! https://github.com/memorysafety/rav1d/pull/1293.
1820
//!
19-
//! ```sh
20-
//! cargo build --release && hyperfine './target/release/dav1d -i ./tests/large/chimera_8b_1080p.ivf -l 1000 -o /dev/null'
21-
//! ```
21+
//! # Benchmarks
2222
//!
23-
//! for 3 implementations:
24-
//! 1. the original `case_set*` macros translated directly to `unsafe` Rust `fn`s
25-
//! 2. the safe [`CaseSet`] implementation below using [`small_memset`] with its small powers of 2 optimization
26-
//! 3. a safe [`CaseSet`] implementation using [`slice::fill`]/`memset` only
27-
//!
28-
//! The [`small_memset`] version was ~1.27% faster than the `case_set*` one,
29-
//! and ~3.26% faster than the `memset` one.
30-
//! The `case_set*` macros were also faster than `memset` in C by a similar margin,
31-
//! meaning the `memset` option is the slowest in both C and Rust,
32-
//! and since it was replaced with `case_set*` in C, we shouldn't use it in Rust.
33-
//! Thus, the [`small_memset`] implementation seems optimal, as it:
34-
//! * is the fastest of the Rust implementations
35-
//! * is completely safe
36-
//! * employs the same small powers of 2 optimization the `case_set*` implementation did
37-
//! * is far simpler than the `case_set*` implementation, consisting of a `match` and array writes
23+
//! Comparing this implementation to the previous implementation of `CaseSet` we
24+
//! see an 8.2-10.5% speedup for a single buffer, a 5.9-7.0% speedup for
25+
//! multiple buffers, and a minor improvement to multiple [`DisjointMut`]
26+
//! buffers (which happened to be well-optimized in the previous
27+
//! implementation).
3828
//!
3929
//! [`BlockContext`]: crate::src::env::BlockContext
40-
use crate::src::disjoint_mut::AsMutPtr;
41-
use crate::src::disjoint_mut::DisjointMut;
42-
use std::iter::zip;
30+
//! [`DisjointMut`]: crate::src::disjoint_mut::DisjointMut
4331
44-
/// Perform a `memset` optimized for lengths that are small powers of 2.
32+
/// Fill small ranges of buffers with a value.
33+
///
34+
/// This is effectively a specialized version of [`slice::fill`] for small
35+
/// power-of-two sized ranges of buffers.
36+
///
37+
/// `$UP_TO` is the maximum length that will be optimized, with powers of two up
38+
/// to 64 supported. If the buffer length is not a power of two or is greater than
39+
/// `$UP_TO`, this macro will do nothing. See [`case_set_with_default!`] to fill
40+
/// buffers with non-conforming lengths if needed.
41+
///
42+
/// # Examples
43+
///
44+
/// ```
45+
/// # use rav1d::case_set;
46+
/// let mut buf = [0u8; 32];
47+
/// let len = 16;
48+
/// for offset in [0, 16] {
49+
/// case_set!(up_to = 32, len, offset, {
50+
/// set!(&mut buf, 1u8);
51+
/// });
52+
/// }
53+
/// ```
54+
///
55+
/// In the simplest case, `$len` is the length of the buffer range to fill
56+
/// starting from `$offset`. The `$body` block is executed with `len` and
57+
/// `offset` identifiers set to the given length and offset values. Within the
58+
/// body a `set!` macro is available and must be called to set each buffer range
59+
/// to a value. `set!` takes a buffer and a value and sets the range
60+
/// `buf[offset..][..len]` to the value.
61+
/// ```
62+
/// # macro_rules! set {
63+
/// # ($buf:expr, $val:expr) => {};
64+
/// # }
65+
/// set!(buf, value);
66+
/// ```
67+
///
68+
/// ## Naming parameters
69+
///
70+
/// The identifier for either or both of `len` and `offset` can be overridden by
71+
/// specifying `identifier=value` for those parameters:
72+
/// ```
73+
/// # use rav1d::case_set;
74+
/// let mut buf = [0u8; 32];
75+
/// let outer_len = 16;
76+
/// for outer_offset in [0, 16] {
77+
/// case_set!(
78+
/// up_to = 32,
79+
/// len=outer_len,
80+
/// offset=outer_offset,
81+
/// {
82+
/// set!(&mut buf, (offset+len) as u8);
83+
/// }
84+
/// );
85+
/// }
86+
/// ```
87+
///
88+
/// ## `DisjointMut` buffers
89+
///
90+
/// [`DisjointMut`] buffers can be used in basically the same way as normal
91+
/// buffers but using the `set_disjoint!` macro instead of `set!`.
92+
/// ```
93+
/// # use rav1d::case_set;
94+
/// # use rav1d::src::disjoint_mut::DisjointMut;
95+
/// let mut buf = DisjointMut::new([0u8; 32]);
96+
/// let len = 16;
97+
/// for offset in [0, 16] {
98+
/// case_set!(up_to = 32, len, offset, {
99+
/// set_disjoint!(&mut buf, 1u8);
100+
/// });
101+
/// }
102+
/// ```
103+
///
104+
/// ## Multiple buffer ranges
105+
///
106+
/// Multiple buffers with different lengths and offsets can be filled with the
107+
/// same body statements. In the following example, two buffers with different
108+
/// sizes are initialized by quarters.
109+
/// ```
110+
/// # use rav1d::case_set;
111+
/// let mut buf1 = [0u8; 32];
112+
/// let mut buf2 = [0u8; 64];
113+
/// for offset in [0, 8, 16, 24] {
114+
/// case_set!(
115+
/// up_to = 16,
116+
/// buf = [&mut buf1[..], &mut buf2[..]],
117+
/// len = [8, 16],
118+
/// offset = [offset, offset*2],
119+
/// {
120+
/// set!(buf, len as u8 >> 3);
121+
/// }
122+
/// );
123+
/// }
124+
/// ```
45125
///
46-
/// For power of 2 lengths `<= UP_TO`,
47-
/// the `memset` is done as an array write of that exactly (compile-time known) length.
48-
/// If the length is not a power of 2 or `> UP_TO`,
49-
/// then the `memset` is done by [`slice::fill`] (a `memset` call) if `WITH_DEFAULT` is `true`,
50-
/// or else skipped if `WITH_DEFAULT` is `false`.
126+
/// A more realistic example of filling multiple buffers with the same value is
127+
/// initializing different struct fields at the same time (from
128+
/// `src/decode.rs`):
129+
/// ```ignore
130+
/// case_set!(
131+
/// up_to = 32,
132+
/// ctx = [(&t.l, 1), (&f.a[t.a], 0)],
133+
/// len = [bh4, bw4],
134+
/// offset = [by4, bx4],
135+
/// {
136+
/// let (dir, dir_index) = ctx;
137+
/// set_disjoint!(dir.seg_pred, seg_pred.into());
138+
/// set_disjoint!(dir.skip_mode, b.skip_mode);
139+
/// set_disjoint!(dir.intra, 0);
140+
/// set_disjoint!(dir.skip, b.skip);
141+
/// set_disjoint!(dir.pal_sz, 0);
142+
/// }
143+
/// );
144+
/// ```
51145
///
52-
/// This optimizes for the common cases where `buf.len()` is a small power of 2,
53-
/// where the array write is optimized as few and large stores as possible.
54-
#[inline]
55-
pub fn small_memset<T: Clone + Copy, const UP_TO: usize, const WITH_DEFAULT: bool>(
56-
buf: &mut [T],
57-
val: T,
58-
) {
59-
fn as_array<T: Clone + Copy, const N: usize>(buf: &mut [T]) -> &mut [T; N] {
60-
buf.try_into().unwrap()
61-
}
62-
match buf.len() {
63-
01 if UP_TO >= 01 => *as_array(buf) = [val; 01],
64-
02 if UP_TO >= 02 => *as_array(buf) = [val; 02],
65-
04 if UP_TO >= 04 => *as_array(buf) = [val; 04],
66-
08 if UP_TO >= 08 => *as_array(buf) = [val; 08],
67-
16 if UP_TO >= 16 => *as_array(buf) = [val; 16],
68-
32 if UP_TO >= 32 => *as_array(buf) = [val; 32],
69-
64 if UP_TO >= 64 => *as_array(buf) = [val; 64],
70-
_ => {
71-
if WITH_DEFAULT {
72-
buf.fill(val)
146+
/// [`DisjointMut`]: crate::src::disjoint_mut::DisjointMut
147+
macro_rules! case_set {
148+
(up_to=$UP_TO:literal, $(@DEFAULT=$WITH_DEFAULT:literal,)? $ctx:ident=[$($ctx_expr:expr),* $(,)?], $len:ident=[$($len_expr:expr),* $(,)?], $offset:ident=[$($offset_expr:expr),* $(,)?], $body:block) => {
149+
let ctxs = [$($ctx_expr,)*];
150+
let lens = [$($len_expr,)*];
151+
let offsets = [$($offset_expr,)*];
152+
assert_eq!(ctxs.len(), lens.len());
153+
assert_eq!(ctxs.len(), offsets.len());
154+
for (i, ctx) in ctxs.into_iter().enumerate() {
155+
case_set!(up_to=$UP_TO, $(@DEFAULT=$WITH_DEFAULT,)? $ctx=ctx, $len=lens[i], $offset=offsets[i], $body);
156+
}
157+
};
158+
(up_to=$UP_TO:literal, $(@DEFAULT=$WITH_DEFAULT:literal,)? $len:ident, $offset:ident, $body:block) => {
159+
case_set!(up_to=$UP_TO, $(@DEFAULT=$WITH_DEFAULT,)? _ctx=(), $len=$len, $offset=$offset, $body);
160+
};
161+
(up_to=$UP_TO:literal, $(@DEFAULT=$WITH_DEFAULT:literal,)? $len:ident=$len_expr:expr, $offset:ident=$offset_expr:expr, $body:block) => {
162+
case_set!(up_to=$UP_TO, $(@DEFAULT=$WITH_DEFAULT,)? _ctx=(), $len=$len_expr, $offset=$offset_expr, $body);
163+
};
164+
(up_to=$UP_TO:literal, $(@DEFAULT=$WITH_DEFAULT:literal,)? $ctx:ident=$ctx_expr:expr, $len:ident=$len_expr:expr, $offset:ident=$offset_expr:expr, $body:block) => {
165+
#[allow(unused_mut)]
166+
let mut $ctx = $ctx_expr;
167+
let $len = $len_expr;
168+
let $offset = $offset_expr;
169+
{
170+
#[allow(unused_macros)]
171+
macro_rules! set {
172+
($buf:expr, $val:expr) => {{
173+
assert!($offset <= $buf.len() && $offset + $len <= $buf.len());
174+
}};
73175
}
176+
#[allow(unused_imports)]
177+
use set as set_disjoint;
178+
#[allow(unused)]
179+
$body
74180
}
75-
}
76-
}
77-
78-
pub struct CaseSetter<const UP_TO: usize, const WITH_DEFAULT: bool> {
79-
offset: usize,
80-
len: usize,
81-
}
82-
83-
impl<const UP_TO: usize, const WITH_DEFAULT: bool> CaseSetter<UP_TO, WITH_DEFAULT> {
84-
#[inline]
85-
pub fn set<T: Clone + Copy>(&self, buf: &mut [T], val: T) {
86-
small_memset::<T, UP_TO, WITH_DEFAULT>(&mut buf[self.offset..][..self.len], val);
87-
}
88-
89-
/// # Safety
90-
///
91-
/// Caller must ensure that no elements of the written range are concurrently
92-
/// borrowed (immutably or mutably) at all during the call to `set_disjoint`.
93-
#[inline]
94-
pub fn set_disjoint<T, V>(&self, buf: &DisjointMut<T>, val: V)
95-
where
96-
T: AsMutPtr<Target = V>,
97-
V: Clone + Copy,
98-
{
99-
let mut buf = buf.index_mut(self.offset..self.offset + self.len);
100-
small_memset::<V, UP_TO, WITH_DEFAULT>(&mut *buf, val);
101-
}
181+
macro_rules! exec_block {
182+
($N:literal, $block:block) => {
183+
{
184+
#[allow(unused_macros)]
185+
macro_rules! set {
186+
($buf:expr, $val:expr) => {
187+
// SAFETY: The offset and length are checked by the
188+
// assert outside of the match.
189+
let buf_range = unsafe {
190+
$buf.get_unchecked_mut($offset..$offset+$N)
191+
};
192+
*<&mut [_; $N]>::try_from(buf_range).unwrap() = [$val; $N];
193+
};
194+
}
195+
#[allow(unused_macros)]
196+
macro_rules! set_disjoint {
197+
($buf:expr, $val:expr) => {{
198+
// SAFETY: The offset and length are checked by the
199+
// assert outside of the match.
200+
let mut buf_range = unsafe {
201+
$buf.index_mut_unchecked(($offset.., ..$N))
202+
};
203+
*<&mut [_; $N]>::try_from(&mut *buf_range).unwrap() = [$val; $N];
204+
}};
205+
}
206+
$block
207+
}
208+
};
209+
}
210+
match $len {
211+
01 if $UP_TO >= 01 => exec_block!(01, $body),
212+
02 if $UP_TO >= 02 => exec_block!(02, $body),
213+
04 if $UP_TO >= 04 => exec_block!(04, $body),
214+
08 if $UP_TO >= 08 => exec_block!(08, $body),
215+
16 if $UP_TO >= 16 => exec_block!(16, $body),
216+
32 if $UP_TO >= 32 => exec_block!(32, $body),
217+
64 if $UP_TO >= 64 => exec_block!(64, $body),
218+
_ => {
219+
if $($WITH_DEFAULT ||)? false {
220+
#[allow(unused_macros)]
221+
macro_rules! set {
222+
($buf:expr, $val:expr) => {{
223+
// SAFETY: The offset and length are checked by the
224+
// assert outside of the match.
225+
let buf_range = unsafe {
226+
$buf.get_unchecked_mut($offset..$offset+$len)
227+
};
228+
buf_range.fill($val);
229+
}};
230+
}
231+
#[allow(unused_macros)]
232+
macro_rules! set_disjoint {
233+
($buf:expr, $val:expr) => {{
234+
// SAFETY: The offset and length are checked by the
235+
// assert outside of the match.
236+
let mut buf_range = unsafe {
237+
$buf.index_mut_unchecked(($offset.., ..$len))
238+
};
239+
buf_range.fill($val);
240+
}};
241+
}
242+
$body
243+
}
244+
}
245+
}
246+
};
102247
}
248+
pub(crate) use case_set;
103249

104-
/// The entrypoint to the [`CaseSet`] API.
250+
/// Fill small ranges of buffers with a value.
105251
///
106-
/// `UP_TO` and `WITH_DEFAULT` are made const generic parameters rather than have multiple `case_set*` `fn`s,
107-
/// and these are put in a separate `struct` so that these 2 generic parameters
108-
/// can be manually specified while the ones on the methods are inferred.
109-
pub struct CaseSet<const UP_TO: usize, const WITH_DEFAULT: bool>;
110-
111-
impl<const UP_TO: usize, const WITH_DEFAULT: bool> CaseSet<UP_TO, WITH_DEFAULT> {
112-
/// Perform one case set.
113-
///
114-
/// This API is generic over the element type (`T`) rather than hardcoding `u8`,
115-
/// as sometimes other types are used, though only `i8` is used currently.
116-
///
117-
/// The `len` and `offset` are supplied here and
118-
/// applied to each `buf` passed to [`CaseSetter::set`] in `set_ctx`.
119-
#[inline]
120-
pub fn one<T, F>(ctx: T, len: usize, offset: usize, mut set_ctx: F)
121-
where
122-
F: FnMut(&CaseSetter<UP_TO, WITH_DEFAULT>, T),
123-
{
124-
set_ctx(&CaseSetter { offset, len }, ctx);
125-
}
126-
127-
/// Perform many case sets in one call.
128-
///
129-
/// This allows specifying the `set_ctx` closure inline easily,
130-
/// and also allows you to group the same args together.
131-
///
132-
/// The `lens`, `offsets`, and `dirs` are zipped and passed to [`CaseSet::one`],
133-
/// where `dirs` can be an array of any type and whose elements are passed back to the `set_ctx` closure.
134-
#[inline]
135-
pub fn many<T, F, const N: usize>(
136-
dirs: [T; N],
137-
lens: [usize; N],
138-
offsets: [usize; N],
139-
mut set_ctx: F,
140-
) where
141-
F: FnMut(&CaseSetter<UP_TO, WITH_DEFAULT>, T),
142-
{
143-
for (dir, (len, offset)) in zip(dirs, zip(lens, offsets)) {
144-
Self::one(dir, len, offset, &mut set_ctx);
145-
}
146-
}
252+
/// `$UP_TO` is the maximum length that will be optimized, with powers of two up
253+
/// to 64 supported. If the buffer length is not a power of two or is greater than
254+
/// `$UP_TO`, this macro will still fill the buffer with a slower fallback.
255+
///
256+
/// See [`case_set!`] for examples and more documentation.
257+
macro_rules! case_set_with_default {
258+
(up_to=$UP_TO:literal, $($tt:tt)*) => {
259+
$crate::src::ctx::case_set!(up_to=$UP_TO, @DEFAULT=true, $($tt)*);
260+
};
147261
}
262+
pub(crate) use case_set_with_default;

0 commit comments

Comments
 (0)