Skip to content

Commit 6d1dbe1

Browse files
authored
Merge pull request #72 from awxkee/novtb
Box blur improvements
2 parents 6b85dd0 + 62bbce2 commit 6d1dbe1

33 files changed

+1943
-1728
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ image = { version = "0.25", optional = true, default-features = false }
2828
rustfft = { version = "6.3", optional = true }
2929
fast_transpose = { version = "0.2.5", optional = true }
3030
num-complex = "0.4"
31-
novtb = "^0.1.4"
31+
novtb = "^0.1.6"
3232

3333
[features]
3434
default = ["avx", "sse", "rdm", "neon"]

app/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ edition = "2021"
77
colorutils-rs = "0.7.0"
88
half = "2.4.1"
99
image = "0.25.5"
10-
libblur = { path = "../", features = ["image", "fft", "rdm", "neon", "sse", "avx", "nightly_avx512"], default-features = false }
10+
libblur = { path = "../", features = [], default-features = false }
1111
accelerate = { path = "accelerate" }
1212
rayon = "1.10.0"
1313
fast_transpose = "0.2.5"

app/src/main.rs

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -30,12 +30,13 @@
3030
mod merge;
3131
mod split;
3232

33+
use image::imageops::FilterType;
3334
use image::{EncodableLayout, GenericImageView, ImageReader};
3435
use libblur::{
3536
bilateral_filter, complex_gaussian_kernel, fast_bilateral_filter, fast_bilateral_filter_u16,
36-
filter_1d_complex, filter_1d_complex_fixed_point, filter_2d_rgba_fft, gaussian_blur,
37-
gaussian_kernel_1d, lens_kernel, sigma_size, AnisotropicRadius, BilateralBlurParams, BlurImage,
38-
BlurImageMut, BoxBlurParameters, CLTParameters, ConvolutionMode, EdgeMode, FastBlurChannels,
37+
filter_1d_complex, filter_1d_complex_fixed_point, gaussian_blur, gaussian_kernel_1d,
38+
lens_kernel, sigma_size, AnisotropicRadius, BilateralBlurParams, BlurImage, BlurImageMut,
39+
BoxBlurParameters, CLTParameters, ConvolutionMode, EdgeMode, FastBlurChannels,
3940
GaussianBlurParams, KernelShape, Scalar, ThreadingPolicy, TransferFunction,
4041
};
4142
use num_complex::Complex;
@@ -91,10 +92,10 @@ fn main() {
9192
let mut v_vec = src_bytes
9293
.to_vec()
9394
.iter()
94-
.map(|&x| x)
95+
// .map(|&x| x)
9596
// .map(|&x| (x as f32 / 255.))
96-
// .map(|&x| u16::from_ne_bytes([x, x]))
97-
.collect::<Vec<u8>>();
97+
.map(|&x| u16::from_ne_bytes([x, x]))
98+
.collect::<Vec<u16>>();
9899

99100
// let mut dst_image = BlurImageMut::borrow(
100101
// &mut v_vec,
@@ -103,6 +104,7 @@ fn main() {
103104
// FastBlurChannels::Channels4,
104105
// );
105106

107+
// let z0 = v_vec.iter().map(|&x| (x as f32 * (1. / 255.))).collect::<Vec<_>>();
106108
let cvt = BlurImage::borrow(
107109
&v_vec,
108110
dyn_image.width(),
@@ -122,7 +124,7 @@ fn main() {
122124
// let gaussian_kernel = gaussian_kernel_1d(31, sigma_size(31.)).iter().map(|&x| Complex::new(x, 0.0)).collect::<Vec<Complex<f32>>>();
123125
let gaussian_kernel = complex_gaussian_kernel(51., 0.75, 5.);
124126

125-
let mut dst_image = cvt.clone_as_mut();
127+
let mut dst_image = BlurImageMut::default(); //cvt.clone_as_mut();
126128

127129
// gaussian_blur(
128130
// &cvt,
@@ -155,16 +157,13 @@ fn main() {
155157

156158
// }
157159

158-
libblur::bilateral_filter(
160+
libblur::box_blur_u16(
159161
&cvt,
160162
&mut dst_image,
161-
BilateralBlurParams {
162-
kernel_size: 15,
163-
spatial_sigma: 5.,
164-
range_sigma: 5.,
163+
BoxBlurParameters {
164+
x_axis_kernel: 7,
165+
y_axis_kernel: 7,
165166
},
166-
EdgeMode::Clamp,
167-
Scalar::default(),
168167
ThreadingPolicy::Single,
169168
)
170169
.unwrap();
@@ -187,16 +186,17 @@ fn main() {
187186
// )
188187
// .unwrap();
189188

190-
// let j_dag = dst_image.to_immutable_ref();
189+
let j_dag = dst_image.to_immutable_ref();
190+
191191
// let gamma = j_dag.gamma8(TransferFunction::Srgb, true).unwrap();
192192

193193
dst_bytes = dst_image
194194
.data
195195
.borrow_mut()
196196
.iter()
197-
.map(|&x| x)
197+
// .map(|&x| x)
198198
// .map(|&x| (x * 255f32).round() as u8)
199-
// .map(|&x| (x >> 8) as u8)
199+
.map(|&x| (x >> 8) as u8)
200200
.collect::<Vec<u8>>();
201201

202202
// dst_bytes = dst_image.data.borrow().to_vec();

src/box_filter/avx/hrgba8.rs

Lines changed: 57 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,7 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
107107
let edge_count = (kernel_size / 2) + 1;
108108
let v_edge_count = _mm256_set1_epi32(edge_count as i32);
109109

110-
let v_weight = _mm256_set1_ps(1f32 / (radius * 2) as f32);
110+
let v_weight = _mm256_set1_ps(1f32 / (radius * 2 + 1) as f32);
111111

112112
let half_kernel = kernel_size / 2;
113113

@@ -140,7 +140,7 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
140140
}
141141

142142
unsafe {
143-
for x in 1..half_kernel as usize {
143+
for x in 1..=half_kernel as usize {
144144
let px = x.min(width as usize - 1) * CN;
145145

146146
let s_ptr_0 = src.as_ptr().add(y_src_shift + px);
@@ -161,7 +161,50 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
161161
}
162162

163163
for x in 0..width {
164-
// preload edge pixels
164+
let px = x as usize * CN;
165+
166+
unsafe {
167+
let scale_store_ps0 = _mm256_cvtepi32_ps(store_0);
168+
let scale_store_ps1 = _mm256_cvtepi32_ps(store_1);
169+
let scale_store_ps2 = _mm256_cvtepi32_ps(store_2);
170+
171+
let r0 = _mm256_mul_ps(scale_store_ps0, v_weight);
172+
let r1 = _mm256_mul_ps(scale_store_ps1, v_weight);
173+
let r2 = _mm256_mul_ps(scale_store_ps2, v_weight);
174+
175+
let scale_store0 = _mm256_cvtps_epi32(r0);
176+
let scale_store1 = _mm256_cvtps_epi32(r1);
177+
let scale_store2 = _mm256_cvtps_epi32(r2);
178+
179+
let px_160 = _mm256_packus_epi32(scale_store0, _mm256_setzero_si256());
180+
let px_161 = _mm256_packus_epi32(scale_store1, _mm256_setzero_si256());
181+
let px_162 = _mm256_packus_epi32(scale_store2, _mm256_setzero_si256());
182+
183+
let px_80 = _mm256_packus_epi16(px_160, _mm256_setzero_si256());
184+
let px_81 = _mm256_packus_epi16(px_161, _mm256_setzero_si256());
185+
let px_82 = _mm256_packus_epi16(px_162, _mm256_setzero_si256());
186+
187+
let bytes_offset_0 = y_dst_shift + px;
188+
let bytes_offset_1 = y_dst_shift + dst_stride as usize + px;
189+
let bytes_offset_2 = y_dst_shift + dst_stride as usize * 2 + px;
190+
let bytes_offset_3 = y_dst_shift + dst_stride as usize * 3 + px;
191+
let bytes_offset_4 = y_dst_shift + dst_stride as usize * 4 + px;
192+
let bytes_offset_5 = y_dst_shift + dst_stride as usize * 5 + px;
193+
194+
let dst_ptr_0 = unsafe_dst.slice.as_ptr().add(bytes_offset_0) as *mut u8;
195+
let dst_ptr_1 = unsafe_dst.slice.as_ptr().add(bytes_offset_1) as *mut u8;
196+
let dst_ptr_2 = unsafe_dst.slice.as_ptr().add(bytes_offset_2) as *mut u8;
197+
let dst_ptr_3 = unsafe_dst.slice.as_ptr().add(bytes_offset_3) as *mut u8;
198+
let dst_ptr_4 = unsafe_dst.slice.as_ptr().add(bytes_offset_4) as *mut u8;
199+
let dst_ptr_5 = unsafe_dst.slice.as_ptr().add(bytes_offset_5) as *mut u8;
200+
201+
write_u8::<CN>(dst_ptr_0, _mm256_castsi256_si128(px_80));
202+
write_u8::<CN>(dst_ptr_1, _mm256_extracti128_si256::<1>(px_80));
203+
write_u8::<CN>(dst_ptr_2, _mm256_castsi256_si128(px_81));
204+
write_u8::<CN>(dst_ptr_3, _mm256_extracti128_si256::<1>(px_81));
205+
write_u8::<CN>(dst_ptr_4, _mm256_castsi256_si128(px_82));
206+
write_u8::<CN>(dst_ptr_5, _mm256_extracti128_si256::<1>(px_82));
207+
}
165208

166209
// subtract previous
167210
unsafe {
@@ -196,7 +239,7 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
196239

197240
// add next
198241
unsafe {
199-
let next_x = (x + half_kernel).min(width - 1) as usize;
242+
let next_x = (x + half_kernel + 1).min(width - 1) as usize;
200243

201244
let next = next_x * CN;
202245

@@ -223,51 +266,6 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
223266
store_1 = _mm256_add_epi32(store_1, edge_colors_1);
224267
store_2 = _mm256_add_epi32(store_2, edge_colors_2);
225268
}
226-
227-
let px = x as usize * CN;
228-
229-
unsafe {
230-
let scale_store_ps0 = _mm256_cvtepi32_ps(store_0);
231-
let scale_store_ps1 = _mm256_cvtepi32_ps(store_1);
232-
let scale_store_ps2 = _mm256_cvtepi32_ps(store_2);
233-
234-
let r0 = _mm256_mul_ps(scale_store_ps0, v_weight);
235-
let r1 = _mm256_mul_ps(scale_store_ps1, v_weight);
236-
let r2 = _mm256_mul_ps(scale_store_ps2, v_weight);
237-
238-
let scale_store0 = _mm256_cvtps_epi32(r0);
239-
let scale_store1 = _mm256_cvtps_epi32(r1);
240-
let scale_store2 = _mm256_cvtps_epi32(r2);
241-
242-
let px_160 = _mm256_packus_epi32(scale_store0, _mm256_setzero_si256());
243-
let px_161 = _mm256_packus_epi32(scale_store1, _mm256_setzero_si256());
244-
let px_162 = _mm256_packus_epi32(scale_store2, _mm256_setzero_si256());
245-
246-
let px_80 = _mm256_packus_epi16(px_160, _mm256_setzero_si256());
247-
let px_81 = _mm256_packus_epi16(px_161, _mm256_setzero_si256());
248-
let px_82 = _mm256_packus_epi16(px_162, _mm256_setzero_si256());
249-
250-
let bytes_offset_0 = y_dst_shift + px;
251-
let bytes_offset_1 = y_dst_shift + dst_stride as usize + px;
252-
let bytes_offset_2 = y_dst_shift + dst_stride as usize * 2 + px;
253-
let bytes_offset_3 = y_dst_shift + dst_stride as usize * 3 + px;
254-
let bytes_offset_4 = y_dst_shift + dst_stride as usize * 4 + px;
255-
let bytes_offset_5 = y_dst_shift + dst_stride as usize * 5 + px;
256-
257-
let dst_ptr_0 = unsafe_dst.slice.as_ptr().add(bytes_offset_0) as *mut u8;
258-
let dst_ptr_1 = unsafe_dst.slice.as_ptr().add(bytes_offset_1) as *mut u8;
259-
let dst_ptr_2 = unsafe_dst.slice.as_ptr().add(bytes_offset_2) as *mut u8;
260-
let dst_ptr_3 = unsafe_dst.slice.as_ptr().add(bytes_offset_3) as *mut u8;
261-
let dst_ptr_4 = unsafe_dst.slice.as_ptr().add(bytes_offset_4) as *mut u8;
262-
let dst_ptr_5 = unsafe_dst.slice.as_ptr().add(bytes_offset_5) as *mut u8;
263-
264-
write_u8::<CN>(dst_ptr_0, _mm256_castsi256_si128(px_80));
265-
write_u8::<CN>(dst_ptr_1, _mm256_extracti128_si256::<1>(px_80));
266-
write_u8::<CN>(dst_ptr_2, _mm256_castsi256_si128(px_81));
267-
write_u8::<CN>(dst_ptr_3, _mm256_extracti128_si256::<1>(px_81));
268-
write_u8::<CN>(dst_ptr_4, _mm256_castsi256_si128(px_82));
269-
write_u8::<CN>(dst_ptr_5, _mm256_extracti128_si256::<1>(px_82));
270-
}
271269
}
272270

273271
yy += 6;
@@ -286,7 +284,7 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
286284
}
287285

288286
unsafe {
289-
for x in 1usize..half_kernel as usize {
287+
for x in 1usize..=half_kernel as usize {
290288
let px = x.min(width as usize - 1) * CN;
291289
let s_ptr = src.as_ptr().add(y_src_shift + px);
292290
let edge_colors = load_u8_s32_fast::<CN>(s_ptr);
@@ -295,7 +293,14 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
295293
}
296294

297295
for x in 0..width {
298-
// preload edge pixels
296+
let px = x as usize * CN;
297+
298+
unsafe {
299+
let r0 = _mm_mul_ps(_mm_cvtepi32_ps(store), _mm256_castps256_ps128(v_weight));
300+
let bytes_offset = y_dst_shift + px;
301+
let ptr = unsafe_dst.slice.as_ptr().add(bytes_offset) as *mut u8;
302+
store_u8_u32::<CN>(ptr, _mm_cvtps_epi32(r0));
303+
}
299304

300305
// subtract previous
301306
unsafe {
@@ -308,23 +313,14 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
308313

309314
// add next
310315
unsafe {
311-
let next_x = (x + half_kernel).min(width - 1) as usize;
316+
let next_x = (x + half_kernel + 1).min(width - 1) as usize;
312317

313318
let next = next_x * CN;
314319

315320
let s_ptr = src.as_ptr().add(y_src_shift + next);
316321
let edge_colors = load_u8_s32_fast::<CN>(s_ptr);
317322
store = _mm_add_epi32(store, edge_colors);
318323
}
319-
320-
let px = x as usize * CN;
321-
322-
unsafe {
323-
let r0 = _mm_mul_ps(_mm_cvtepi32_ps(store), _mm256_castps256_ps128(v_weight));
324-
let bytes_offset = y_dst_shift + px;
325-
let ptr = unsafe_dst.slice.as_ptr().add(bytes_offset) as *mut u8;
326-
store_u8_u32::<CN>(ptr, _mm_cvtps_epi32(r0));
327-
}
328324
}
329325
}
330326
}

0 commit comments

Comments
 (0)