|
1 | | -//! The [`CaseSet`] API below is a safe and simplified version of the `case_set*` macros in `ctx.h`. |
| 1 | +//! The [`case_set!`] macro is a safe and simplified version of the `case_set*` |
| 2 | +//! macros in `ctx.h`. |
2 | 3 | //! |
3 | 4 | //! The `case_set*` macros themselves replaced `memset`s in order to further optimize them |
4 | 5 | //! (in e3b5d4d044506f9e0e95e79b3de42fd94386cc61, |
|
13 | 14 | //! as unaligned writes are UB, and so we'd need to check at runtime if they're aligned |
14 | 15 | //! (a runtime-determined `off`set is used, so we can't reasonably ensure this at compile-time). |
15 | 16 | //! |
16 | | -//! To more thoroughly check this, I ran the same benchmarks done in |
17 | | -//! e3b5d4d044506f9e0e95e79b3de42fd94386cc61, which introduced the `case_set*` macros: |
| 17 | +//! Like the C implementation, we also want to avoid multiple switches when
| 18 | +//! setting a group of buffers, which was implemented in
| 19 | +//! <https://github.com/memorysafety/rav1d/pull/1293>.
18 | 20 | //! |
19 | | -//! ```sh |
20 | | -//! cargo build --release && hyperfine './target/release/dav1d -i ./tests/large/chimera_8b_1080p.ivf -l 1000 -o /dev/null' |
21 | | -//! ``` |
| 21 | +//! # Benchmarks |
22 | 22 | //! |
23 | | -//! for 3 implementations: |
24 | | -//! 1. the original `case_set*` macros translated directly to `unsafe` Rust `fn`s |
25 | | -//! 2. the safe [`CaseSet`] implementation below using [`small_memset`] with its small powers of 2 optimization |
26 | | -//! 3. a safe [`CaseSet`] implementation using [`slice::fill`]/`memset` only |
27 | | -//! |
28 | | -//! The [`small_memset`] version was ~1.27% faster than the `case_set*` one, |
29 | | -//! and ~3.26% faster than the `memset` one. |
30 | | -//! The `case_set*` macros were also faster than `memset` in C by a similar margin, |
31 | | -//! meaning the `memset` option is the slowest in both C and Rust, |
32 | | -//! and since it was replaced with `case_set*` in C, we shouldn't use it in Rust. |
33 | | -//! Thus, the [`small_memset`] implementation seems optimal, as it: |
34 | | -//! * is the fastest of the Rust implementations |
35 | | -//! * is completely safe |
36 | | -//! * employs the same small powers of 2 optimization the `case_set*` implementation did |
37 | | -//! * is far simpler than the `case_set*` implementation, consisting of a `match` and array writes |
| 23 | +//! Comparing this implementation to the previous implementation of `CaseSet` we |
| 24 | +//! see an 8.2-10.5% speedup for a single buffer, a 5.9-7.0% speedup for |
| 25 | +//! multiple buffers, and a minor improvement to multiple [`DisjointMut`] |
| 26 | +//! buffers (which happened to be well-optimized in the previous |
| 27 | +//! implementation). |
38 | 28 | //! |
39 | 29 | //! [`BlockContext`]: crate::src::env::BlockContext |
40 | | -use crate::src::disjoint_mut::AsMutPtr; |
41 | | -use crate::src::disjoint_mut::DisjointMut; |
42 | | -use std::iter::zip; |
| 30 | +//! [`DisjointMut`]: crate::src::disjoint_mut::DisjointMut |
43 | 31 |
|
44 | | -/// Perform a `memset` optimized for lengths that are small powers of 2. |
| 32 | +/// Fill small ranges of buffers with a value. |
| 33 | +/// |
| 34 | +/// This is effectively a specialized version of [`slice::fill`] for small
| 35 | +/// power-of-two sized ranges of buffers. |
| 36 | +/// |
| 37 | +/// `$UP_TO` is the maximum length that will be optimized, with powers of two up |
| 38 | +/// to 64 supported. If the buffer length is not a power of two or is greater
| 39 | +/// than `$UP_TO`, this macro will do nothing. See [`case_set_with_default!`] to
| 40 | +/// fill buffers with non-conforming lengths if needed.
| 41 | +/// |
| 42 | +/// # Examples |
| 43 | +/// |
| 44 | +/// ``` |
| 45 | +/// # use rav1d::case_set; |
| 46 | +/// let mut buf = [0u8; 32]; |
| 47 | +/// let len = 16; |
| 48 | +/// for offset in [0, 16] { |
| 49 | +/// case_set!(up_to = 32, len, offset, { |
| 50 | +/// set!(&mut buf, 1u8); |
| 51 | +/// }); |
| 52 | +/// } |
| 53 | +/// ``` |
| 54 | +/// |
| 55 | +/// In the simplest case, `$len` is the length of the buffer range to fill |
| 56 | +/// starting from `$offset`. The `$body` block is executed with `len` and |
| 57 | +/// `offset` identifiers set to the given length and offset values. Within the |
| 58 | +/// body a `set!` macro is available and must be called to set each buffer range |
| 59 | +/// to a value. `set!` takes a buffer and a value and sets the range |
| 60 | +/// `buf[offset..][..len]` to the value. |
| 61 | +/// ``` |
| 62 | +/// # macro_rules! set { |
| 63 | +/// # ($buf:expr, $val:expr) => {}; |
| 64 | +/// # } |
| 65 | +/// set!(buf, value); |
| 66 | +/// ``` |
| 67 | +/// |
| 68 | +/// ## Naming parameters |
| 69 | +/// |
| 70 | +/// The identifier for either or both of `len` and `offset` can be overridden by |
| 71 | +/// specifying `identifier=value` for those parameters:
| 72 | +/// ``` |
| 73 | +/// # use rav1d::case_set; |
| 74 | +/// let mut buf = [0u8; 32]; |
| 75 | +/// let outer_len = 16; |
| 76 | +/// for outer_offset in [0, 16] { |
| 77 | +/// case_set!( |
| 78 | +/// up_to = 32, |
| 79 | +/// len=outer_len, |
| 80 | +/// offset=outer_offset, |
| 81 | +/// { |
| 82 | +/// set!(&mut buf, (offset+len) as u8); |
| 83 | +/// } |
| 84 | +/// ); |
| 85 | +/// } |
| 86 | +/// ``` |
| 87 | +/// |
| 88 | +/// ## `DisjointMut` buffers |
| 89 | +/// |
| 90 | +/// [`DisjointMut`] buffers can be used in basically the same way as normal |
| 91 | +/// buffers but using the `set_disjoint!` macro instead of `set!`. |
| 92 | +/// ``` |
| 93 | +/// # use rav1d::case_set; |
| 94 | +/// # use rav1d::src::disjoint_mut::DisjointMut; |
| 95 | +/// let mut buf = DisjointMut::new([0u8; 32]); |
| 96 | +/// let len = 16; |
| 97 | +/// for offset in [0, 16] { |
| 98 | +/// case_set!(up_to = 32, len, offset, { |
| 99 | +/// set_disjoint!(&mut buf, 1u8); |
| 100 | +/// }); |
| 101 | +/// } |
| 102 | +/// ``` |
| 103 | +/// |
| 104 | +/// ## Multiple buffer ranges |
| 105 | +/// |
| 106 | +/// Multiple buffers with different lengths and offsets can be filled with the |
| 107 | +/// same body statements. In the following example, two buffers with different |
| 108 | +/// sizes are initialized by quarters. |
| 109 | +/// ``` |
| 110 | +/// # use rav1d::case_set; |
| 111 | +/// let mut buf1 = [0u8; 32]; |
| 112 | +/// let mut buf2 = [0u8; 64]; |
| 113 | +/// for offset in [0, 8, 16, 24] { |
| 114 | +/// case_set!( |
| 115 | +/// up_to = 16, |
| 116 | +/// buf = [&mut buf1[..], &mut buf2[..]], |
| 117 | +/// len = [8, 16], |
| 118 | +/// offset = [offset, offset*2], |
| 119 | +/// { |
| 120 | +/// set!(buf, len as u8 >> 3); |
| 121 | +/// } |
| 122 | +/// ); |
| 123 | +/// } |
| 124 | +/// ``` |
45 | 125 | /// |
46 | | -/// For power of 2 lengths `<= UP_TO`, |
47 | | -/// the `memset` is done as an array write of that exactly (compile-time known) length. |
48 | | -/// If the length is not a power of 2 or `> UP_TO`, |
49 | | -/// then the `memset` is done by [`slice::fill`] (a `memset` call) if `WITH_DEFAULT` is `true`, |
50 | | -/// or else skipped if `WITH_DEFAULT` is `false`. |
| 126 | +/// A more realistic example of filling multiple buffers with the same value is |
| 127 | +/// initializing different struct fields at the same time (from |
| 128 | +/// `src/decode.rs`): |
| 129 | +/// ```ignore |
| 130 | +/// case_set!( |
| 131 | +/// up_to = 32, |
| 132 | +/// ctx = [(&t.l, 1), (&f.a[t.a], 0)], |
| 133 | +/// len = [bh4, bw4], |
| 134 | +/// offset = [by4, bx4], |
| 135 | +/// { |
| 136 | +/// let (dir, dir_index) = ctx; |
| 137 | +/// set_disjoint!(dir.seg_pred, seg_pred.into()); |
| 138 | +/// set_disjoint!(dir.skip_mode, b.skip_mode); |
| 139 | +/// set_disjoint!(dir.intra, 0); |
| 140 | +/// set_disjoint!(dir.skip, b.skip); |
| 141 | +/// set_disjoint!(dir.pal_sz, 0); |
| 142 | +/// } |
| 143 | +/// ); |
| 144 | +/// ``` |
51 | 145 | /// |
52 | | -/// This optimizes for the common cases where `buf.len()` is a small power of 2, |
53 | | -/// where the array write is optimized as few and large stores as possible. |
54 | | -#[inline] |
55 | | -pub fn small_memset<T: Clone + Copy, const UP_TO: usize, const WITH_DEFAULT: bool>( |
56 | | - buf: &mut [T], |
57 | | - val: T, |
58 | | -) { |
59 | | - fn as_array<T: Clone + Copy, const N: usize>(buf: &mut [T]) -> &mut [T; N] { |
60 | | - buf.try_into().unwrap() |
61 | | - } |
62 | | - match buf.len() { |
63 | | - 01 if UP_TO >= 01 => *as_array(buf) = [val; 01], |
64 | | - 02 if UP_TO >= 02 => *as_array(buf) = [val; 02], |
65 | | - 04 if UP_TO >= 04 => *as_array(buf) = [val; 04], |
66 | | - 08 if UP_TO >= 08 => *as_array(buf) = [val; 08], |
67 | | - 16 if UP_TO >= 16 => *as_array(buf) = [val; 16], |
68 | | - 32 if UP_TO >= 32 => *as_array(buf) = [val; 32], |
69 | | - 64 if UP_TO >= 64 => *as_array(buf) = [val; 64], |
70 | | - _ => { |
71 | | - if WITH_DEFAULT { |
72 | | - buf.fill(val) |
| 146 | +/// [`DisjointMut`]: crate::src::disjoint_mut::DisjointMut |
| 147 | +macro_rules! case_set { |
| 148 | + (up_to=$UP_TO:literal, $(@DEFAULT=$WITH_DEFAULT:literal,)? $ctx:ident=[$($ctx_expr:expr),* $(,)?], $len:ident=[$($len_expr:expr),* $(,)?], $offset:ident=[$($offset_expr:expr),* $(,)?], $body:block) => { |
| 149 | + let ctxs = [$($ctx_expr,)*]; |
| 150 | + let lens = [$($len_expr,)*]; |
| 151 | + let offsets = [$($offset_expr,)*]; |
| 152 | + assert_eq!(ctxs.len(), lens.len()); |
| 153 | + assert_eq!(ctxs.len(), offsets.len()); |
| 154 | + for (i, ctx) in ctxs.into_iter().enumerate() { |
| 155 | + case_set!(up_to=$UP_TO, $(@DEFAULT=$WITH_DEFAULT,)? $ctx=ctx, $len=lens[i], $offset=offsets[i], $body); |
| 156 | + } |
| 157 | + }; |
| 158 | + (up_to=$UP_TO:literal, $(@DEFAULT=$WITH_DEFAULT:literal,)? $len:ident, $offset:ident, $body:block) => { |
| 159 | + case_set!(up_to=$UP_TO, $(@DEFAULT=$WITH_DEFAULT,)? _ctx=(), $len=$len, $offset=$offset, $body); |
| 160 | + }; |
| 161 | + (up_to=$UP_TO:literal, $(@DEFAULT=$WITH_DEFAULT:literal,)? $len:ident=$len_expr:expr, $offset:ident=$offset_expr:expr, $body:block) => { |
| 162 | + case_set!(up_to=$UP_TO, $(@DEFAULT=$WITH_DEFAULT,)? _ctx=(), $len=$len_expr, $offset=$offset_expr, $body); |
| 163 | + }; |
| 164 | + (up_to=$UP_TO:literal, $(@DEFAULT=$WITH_DEFAULT:literal,)? $ctx:ident=$ctx_expr:expr, $len:ident=$len_expr:expr, $offset:ident=$offset_expr:expr, $body:block) => { |
| 165 | + #[allow(unused_mut)] |
| 166 | + let mut $ctx = $ctx_expr; |
| 167 | + let $len = $len_expr; |
| 168 | + let $offset = $offset_expr; |
| 169 | + { |
| 170 | + #[allow(unused_macros)] |
| 171 | + macro_rules! set { |
| 172 | + ($buf:expr, $val:expr) => {{ |
| 173 | + assert!($offset <= $buf.len() && $offset + $len <= $buf.len()); |
| 174 | + }}; |
73 | 175 | } |
| 176 | + #[allow(unused_imports)] |
| 177 | + use set as set_disjoint; |
| 178 | + #[allow(unused)] |
| 179 | + $body |
74 | 180 | } |
75 | | - } |
76 | | -} |
77 | | - |
78 | | -pub struct CaseSetter<const UP_TO: usize, const WITH_DEFAULT: bool> { |
79 | | - offset: usize, |
80 | | - len: usize, |
81 | | -} |
82 | | - |
83 | | -impl<const UP_TO: usize, const WITH_DEFAULT: bool> CaseSetter<UP_TO, WITH_DEFAULT> { |
84 | | - #[inline] |
85 | | - pub fn set<T: Clone + Copy>(&self, buf: &mut [T], val: T) { |
86 | | - small_memset::<T, UP_TO, WITH_DEFAULT>(&mut buf[self.offset..][..self.len], val); |
87 | | - } |
88 | | - |
89 | | - /// # Safety |
90 | | - /// |
91 | | - /// Caller must ensure that no elements of the written range are concurrently |
92 | | - /// borrowed (immutably or mutably) at all during the call to `set_disjoint`. |
93 | | - #[inline] |
94 | | - pub fn set_disjoint<T, V>(&self, buf: &DisjointMut<T>, val: V) |
95 | | - where |
96 | | - T: AsMutPtr<Target = V>, |
97 | | - V: Clone + Copy, |
98 | | - { |
99 | | - let mut buf = buf.index_mut(self.offset..self.offset + self.len); |
100 | | - small_memset::<V, UP_TO, WITH_DEFAULT>(&mut *buf, val); |
101 | | - } |
| 181 | + macro_rules! exec_block { |
| 182 | + ($N:literal, $block:block) => { |
| 183 | + { |
| 184 | + #[allow(unused_macros)] |
| 185 | + macro_rules! set { |
| 186 | + ($buf:expr, $val:expr) => { |
| 187 | + // SAFETY: The offset and length are checked by the |
| 188 | + // assert outside of the match. |
| 189 | + let buf_range = unsafe { |
| 190 | + $buf.get_unchecked_mut($offset..$offset+$N) |
| 191 | + }; |
| 192 | + *<&mut [_; $N]>::try_from(buf_range).unwrap() = [$val; $N]; |
| 193 | + }; |
| 194 | + } |
| 195 | + #[allow(unused_macros)] |
| 196 | + macro_rules! set_disjoint { |
| 197 | + ($buf:expr, $val:expr) => {{ |
| 198 | + // SAFETY: The offset and length are checked by the |
| 199 | + // assert outside of the match. |
| 200 | + let mut buf_range = unsafe { |
| 201 | + $buf.index_mut_unchecked(($offset.., ..$N)) |
| 202 | + }; |
| 203 | + *<&mut [_; $N]>::try_from(&mut *buf_range).unwrap() = [$val; $N]; |
| 204 | + }}; |
| 205 | + } |
| 206 | + $block |
| 207 | + } |
| 208 | + }; |
| 209 | + } |
| 210 | + match $len { |
| 211 | + 01 if $UP_TO >= 01 => exec_block!(01, $body), |
| 212 | + 02 if $UP_TO >= 02 => exec_block!(02, $body), |
| 213 | + 04 if $UP_TO >= 04 => exec_block!(04, $body), |
| 214 | + 08 if $UP_TO >= 08 => exec_block!(08, $body), |
| 215 | + 16 if $UP_TO >= 16 => exec_block!(16, $body), |
| 216 | + 32 if $UP_TO >= 32 => exec_block!(32, $body), |
| 217 | + 64 if $UP_TO >= 64 => exec_block!(64, $body), |
| 218 | + _ => { |
| 219 | + if $($WITH_DEFAULT ||)? false { |
| 220 | + #[allow(unused_macros)] |
| 221 | + macro_rules! set { |
| 222 | + ($buf:expr, $val:expr) => {{ |
| 223 | + // SAFETY: The offset and length are checked by the |
| 224 | + // assert outside of the match. |
| 225 | + let buf_range = unsafe { |
| 226 | + $buf.get_unchecked_mut($offset..$offset+$len) |
| 227 | + }; |
| 228 | + buf_range.fill($val); |
| 229 | + }}; |
| 230 | + } |
| 231 | + #[allow(unused_macros)] |
| 232 | + macro_rules! set_disjoint { |
| 233 | + ($buf:expr, $val:expr) => {{ |
| 234 | + // SAFETY: The offset and length are checked by the |
| 235 | + // assert outside of the match. |
| 236 | + let mut buf_range = unsafe { |
| 237 | + $buf.index_mut_unchecked(($offset.., ..$len)) |
| 238 | + }; |
| 239 | + buf_range.fill($val); |
| 240 | + }}; |
| 241 | + } |
| 242 | + $body |
| 243 | + } |
| 244 | + } |
| 245 | + } |
| 246 | + }; |
102 | 247 | } |
| 248 | +pub(crate) use case_set; |
103 | 249 |
|
104 | | -/// The entrypoint to the [`CaseSet`] API. |
| 250 | +/// Fill small ranges of buffers with a value. |
105 | 251 | /// |
106 | | -/// `UP_TO` and `WITH_DEFAULT` are made const generic parameters rather than have multiple `case_set*` `fn`s, |
107 | | -/// and these are put in a separate `struct` so that these 2 generic parameters |
108 | | -/// can be manually specified while the ones on the methods are inferred. |
109 | | -pub struct CaseSet<const UP_TO: usize, const WITH_DEFAULT: bool>; |
110 | | - |
111 | | -impl<const UP_TO: usize, const WITH_DEFAULT: bool> CaseSet<UP_TO, WITH_DEFAULT> { |
112 | | - /// Perform one case set. |
113 | | - /// |
114 | | - /// This API is generic over the element type (`T`) rather than hardcoding `u8`, |
115 | | - /// as sometimes other types are used, though only `i8` is used currently. |
116 | | - /// |
117 | | - /// The `len` and `offset` are supplied here and |
118 | | - /// applied to each `buf` passed to [`CaseSetter::set`] in `set_ctx`. |
119 | | - #[inline] |
120 | | - pub fn one<T, F>(ctx: T, len: usize, offset: usize, mut set_ctx: F) |
121 | | - where |
122 | | - F: FnMut(&CaseSetter<UP_TO, WITH_DEFAULT>, T), |
123 | | - { |
124 | | - set_ctx(&CaseSetter { offset, len }, ctx); |
125 | | - } |
126 | | - |
127 | | - /// Perform many case sets in one call. |
128 | | - /// |
129 | | - /// This allows specifying the `set_ctx` closure inline easily, |
130 | | - /// and also allows you to group the same args together. |
131 | | - /// |
132 | | - /// The `lens`, `offsets`, and `dirs` are zipped and passed to [`CaseSet::one`], |
133 | | - /// where `dirs` can be an array of any type and whose elements are passed back to the `set_ctx` closure. |
134 | | - #[inline] |
135 | | - pub fn many<T, F, const N: usize>( |
136 | | - dirs: [T; N], |
137 | | - lens: [usize; N], |
138 | | - offsets: [usize; N], |
139 | | - mut set_ctx: F, |
140 | | - ) where |
141 | | - F: FnMut(&CaseSetter<UP_TO, WITH_DEFAULT>, T), |
142 | | - { |
143 | | - for (dir, (len, offset)) in zip(dirs, zip(lens, offsets)) { |
144 | | - Self::one(dir, len, offset, &mut set_ctx); |
145 | | - } |
146 | | - } |
| 252 | +/// `$UP_TO` is the maximum length that will be optimized, with powers of two up |
| 253 | +/// to 64 supported. If the buffer length is not a power of two or is greater
| 254 | +/// than `$UP_TO`, this macro will still fill the buffer with a slower fallback.
| 255 | +/// |
| 256 | +/// See [`case_set!`] for examples and more documentation. |
| 257 | +macro_rules! case_set_with_default { |
| 258 | + (up_to=$UP_TO:literal, $($tt:tt)*) => { |
| 259 | + $crate::src::ctx::case_set!(up_to=$UP_TO, @DEFAULT=true, $($tt)*); |
| 260 | + }; |
147 | 261 | } |
| 262 | +pub(crate) use case_set_with_default; |
0 commit comments