Commit f5021fb

simd::unfilter_paethN: Load 4 (or 8) bytes at a time (faster than 3 or 6).
This CL loads RGB data using 4-byte-wide loads (and RRGGBB data using 8-byte-wide loads), because:

* This is faster, as measured by the microbenchmarks below.
* It doesn't change the behavior: before and after these changes we were ignoring the 4th SIMD lane when processing RGB data (after this change the 4th SIMD lane contains data from the next pixel; before this change it contained a 0 value).
* This is safe as long as more than 4 bytes of input data remain (we have to fall back to a 3-byte-wide load for the last pixel).

Results of running the microbenchmarks on the author's machine:

```
$ bench --bench=unfilter --features=unstable,benchmarks -- --baseline=simd1 Paeth/bpp=[36]
...
unfilter/filter=Paeth/bpp=3
                        time:   [18.755 µs 18.761 µs 18.767 µs]
                        thrpt:  [624.44 MiB/s 624.65 MiB/s 624.83 MiB/s]
                 change:
                        time:   [-16.148% -15.964% -15.751%] (p = 0.00 < 0.05)
                        thrpt:  [+18.696% +18.997% +19.258%]
                        Performance has improved.
...
unfilter/filter=Paeth/bpp=6
                        time:   [18.991 µs 19.000 µs 19.009 µs]
                        thrpt:  [1.2041 GiB/s 1.2047 GiB/s 1.2052 GiB/s]
                 change:
                        time:   [-15.161% -15.074% -14.987%] (p = 0.00 < 0.05)
                        thrpt:  [+17.629% +17.750% +17.871%]
                        Performance has improved.
```
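The throughput figures above come from the crate's `unfilter` microbenchmark (the `bench --bench=unfilter` invocation shown, which also needs a `[[bench]]` target with `harness = false` and criterion as a dev-dependency). Below is a rough criterion sketch of how such a throughput measurement is typically set up. It is illustrative only: the group name, the 4096-pixel row length, the stand-in `unfilter_paeth3` body, and the assumption that the function is reachable from a bench target are not the crate's actual harness.

```rust
// A hedged sketch of a criterion throughput benchmark in the spirit of the
// results quoted above. Names, the row length, and the placeholder function
// body are assumptions for illustration; they are not the crate's real code.
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use std::hint::black_box;

// Placeholder for the function under test; the real `unfilter_paeth3` lives in
// the private `simd` module of `src/filter.rs`.
fn unfilter_paeth3(prev_row: &[u8], curr_row: &mut [u8]) {
    for (curr, prev) in curr_row.iter_mut().zip(prev_row) {
        // Not the real Paeth predictor - just something measurable to call.
        *curr = curr.wrapping_add(*prev);
    }
}

fn bench_unfilter_paeth(c: &mut Criterion) {
    const BPP: usize = 3;
    const ROW_LEN: usize = 4096 * BPP; // one synthetic RGB scanline

    let prev_row = vec![0x55u8; ROW_LEN];
    let mut curr_row = vec![0xAAu8; ROW_LEN];

    let mut group = c.benchmark_group("unfilter/filter=Paeth");
    // `Throughput::Bytes` makes criterion report MiB/s (or GiB/s) alongside
    // the raw timings, as in the output quoted in the commit message.
    group.throughput(Throughput::Bytes(ROW_LEN as u64));
    group.bench_function(format!("bpp={BPP}"), |b| {
        b.iter(|| unfilter_paeth3(black_box(&prev_row), black_box(&mut curr_row)))
    });
    group.finish();
}

criterion_group!(benches, bench_unfilter_paeth);
criterion_main!(benches);
```

The `--baseline=simd1` argument in the quoted command uses criterion's built-in baseline comparison; the absolute MiB/s and GiB/s numbers will of course differ across machines.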
1 parent d7798cd commit f5021fb

src/filter.rs

Lines changed: 44 additions & 10 deletions
```diff
@@ -96,17 +96,34 @@ mod simd {
     }
 
     /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
-    pub fn unfilter_paeth3(prev_row: &[u8], curr_row: &mut [u8]) {
+    pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
         debug_assert_eq!(prev_row.len(), curr_row.len());
         debug_assert_eq!(prev_row.len() % 3, 0);
 
         let mut state = PaethState::<4>::default();
-        for (prev, curr) in prev_row.chunks_exact(3).zip(curr_row.chunks_exact_mut(3)) {
-            let b = load3(prev);
-            let mut x = load3(curr);
+        while prev_row.len() >= 4 {
+            // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first
+            // byte from the next triple. This optimization technique mimics the algorithm found
+            // in
+            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
+            let b = u8x4::from_slice(prev_row);
+            let mut x = u8x4::from_slice(curr_row);
+
             paeth_step(&mut state, b, &mut x);
-            store3(x, curr);
+
+            // We can speculate that writing 4 bytes might be more efficient (just as with using
+            // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the
+            // first byte of the next pixel in the `curr_row`.
+            store3(x, curr_row);
+
+            prev_row = &prev_row[3..];
+            curr_row = &mut curr_row[3..];
         }
+        // Can't use `u8x4::from_slice` for the last `[u8;3]`.
+        let b = load3(prev_row);
+        let mut x = load3(curr_row);
+        paeth_step(&mut state, b, &mut x);
+        store3(x, curr_row);
     }
 
     fn load6(src: &[u8]) -> u8x8 {
@@ -118,17 +135,34 @@ mod simd {
     }
 
     /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
-    pub fn unfilter_paeth6(prev_row: &[u8], curr_row: &mut [u8]) {
+    pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
         debug_assert_eq!(prev_row.len(), curr_row.len());
         debug_assert_eq!(prev_row.len() % 6, 0);
 
         let mut state = PaethState::<8>::default();
-        for (prev, curr) in prev_row.chunks_exact(6).zip(curr_row.chunks_exact_mut(6)) {
-            let b = load6(prev);
-            let mut x = load6(curr);
+        while prev_row.len() >= 8 {
+            // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two
+            // bytes from the next pixel. This optimization technique mimics the algorithm found
+            // in
+            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
+            let b = u8x8::from_slice(prev_row);
+            let mut x = u8x8::from_slice(curr_row);
+
             paeth_step(&mut state, b, &mut x);
-            store6(x, curr);
+
+            // We can speculate that writing 8 bytes might be more efficient (just as with using
+            // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the
+            // first bytes of the next pixel in the `curr_row`.
+            store6(x, curr_row);
+
+            prev_row = &prev_row[6..];
+            curr_row = &mut curr_row[6..];
         }
+        // Can't use `u8x8::from_slice` for the last `[u8;6]`.
+        let b = load6(prev_row);
+        let mut x = load6(curr_row);
+        paeth_step(&mut state, b, &mut x);
+        store6(x, curr_row);
     }
 }
 
```
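For readers who want the pattern from this diff in isolation, here is a minimal, self-contained distillation: process 3-byte pixels with 4-byte-wide SIMD loads (ignoring the extra lane), and fall back to a narrow 3-byte load for the final pixel so the wide load never reads past the end of the row. This is a sketch only - it needs nightly Rust with `portable_simd`, the `running_sum3` body is a stand-in for the real `paeth_step`, and the `load3`/`store3` helpers shown are plausible shapes rather than necessarily the crate's own definitions (those live in the unchanged part of `src/filter.rs`).

```rust
// A minimal sketch (not the crate's code) of the wide-load / narrow-tail
// pattern introduced by this commit: 3-byte pixels are loaded 4 bytes at a
// time, the 4th lane is ignored, and the final pixel uses a narrow load so we
// never read past the end of the row. Requires nightly Rust.
#![feature(portable_simd)]
use std::simd::u8x4;

/// Narrow load for the last pixel: only 3 bytes remain, so pad the 4th lane with 0.
fn load3(src: &[u8]) -> u8x4 {
    u8x4::from_array([src[0], src[1], src[2], 0])
}

/// Store only the 3 meaningful lanes; the 4th lane must not clobber the next pixel.
fn store3(x: u8x4, dst: &mut [u8]) {
    dst[..3].copy_from_slice(&x.to_array()[..3]);
}

/// Adds the previous pixel to each pixel in-place (a placeholder for the real
/// `paeth_step`; the loop structure is what mirrors the diff above).
fn running_sum3(mut row: &mut [u8]) {
    debug_assert_eq!(row.len() % 3, 0);
    let mut prev = u8x4::splat(0);
    while row.len() >= 4 {
        // Wide load: this also reads one byte of the *next* pixel, which is
        // fine because only the first 3 lanes are used and stored.
        let mut x = u8x4::from_slice(row);
        x = x + prev;
        store3(x, row);
        prev = x;
        row = &mut row[3..];
    }
    // Last pixel: only 3 bytes are left, so a 4-byte load would read out of
    // bounds (and `from_slice` would panic). Fall back to the narrow path.
    let mut x = load3(row);
    x = x + prev;
    store3(x, row);
}

fn main() {
    let mut row = [1u8, 2, 3, 10, 20, 30, 100, 200, 50];
    running_sum3(&mut row);
    // Each pixel is now the running (wrapping) sum of the pixels before it.
    assert_eq!(row, [1, 2, 3, 11, 22, 33, 111, 222, 83]);
    println!("{row:?}");
}
```

The narrow tail is the safety condition called out in the commit message: `u8x4::from_slice` panics when fewer than 4 bytes remain, so the last pixel must take the 3-byte path.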
