@@ -107,7 +107,7 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
107
107
let edge_count = ( kernel_size / 2 ) + 1 ;
108
108
let v_edge_count = _mm256_set1_epi32 ( edge_count as i32 ) ;
109
109
110
- let v_weight = _mm256_set1_ps ( 1f32 / ( radius * 2 ) as f32 ) ;
110
+ let v_weight = _mm256_set1_ps ( 1f32 / ( radius * 2 + 1 ) as f32 ) ;
111
111
112
112
let half_kernel = kernel_size / 2 ;
113
113
@@ -140,7 +140,7 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
140
140
}
141
141
142
142
unsafe {
143
- for x in 1 ..half_kernel as usize {
143
+ for x in 1 ..= half_kernel as usize {
144
144
let px = x. min ( width as usize - 1 ) * CN ;
145
145
146
146
let s_ptr_0 = src. as_ptr ( ) . add ( y_src_shift + px) ;
@@ -161,7 +161,50 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
161
161
}
162
162
163
163
for x in 0 ..width {
164
- // preload edge pixels
164
+ let px = x as usize * CN ;
165
+
166
+ unsafe {
167
+ let scale_store_ps0 = _mm256_cvtepi32_ps ( store_0) ;
168
+ let scale_store_ps1 = _mm256_cvtepi32_ps ( store_1) ;
169
+ let scale_store_ps2 = _mm256_cvtepi32_ps ( store_2) ;
170
+
171
+ let r0 = _mm256_mul_ps ( scale_store_ps0, v_weight) ;
172
+ let r1 = _mm256_mul_ps ( scale_store_ps1, v_weight) ;
173
+ let r2 = _mm256_mul_ps ( scale_store_ps2, v_weight) ;
174
+
175
+ let scale_store0 = _mm256_cvtps_epi32 ( r0) ;
176
+ let scale_store1 = _mm256_cvtps_epi32 ( r1) ;
177
+ let scale_store2 = _mm256_cvtps_epi32 ( r2) ;
178
+
179
+ let px_160 = _mm256_packus_epi32 ( scale_store0, _mm256_setzero_si256 ( ) ) ;
180
+ let px_161 = _mm256_packus_epi32 ( scale_store1, _mm256_setzero_si256 ( ) ) ;
181
+ let px_162 = _mm256_packus_epi32 ( scale_store2, _mm256_setzero_si256 ( ) ) ;
182
+
183
+ let px_80 = _mm256_packus_epi16 ( px_160, _mm256_setzero_si256 ( ) ) ;
184
+ let px_81 = _mm256_packus_epi16 ( px_161, _mm256_setzero_si256 ( ) ) ;
185
+ let px_82 = _mm256_packus_epi16 ( px_162, _mm256_setzero_si256 ( ) ) ;
186
+
187
+ let bytes_offset_0 = y_dst_shift + px;
188
+ let bytes_offset_1 = y_dst_shift + dst_stride as usize + px;
189
+ let bytes_offset_2 = y_dst_shift + dst_stride as usize * 2 + px;
190
+ let bytes_offset_3 = y_dst_shift + dst_stride as usize * 3 + px;
191
+ let bytes_offset_4 = y_dst_shift + dst_stride as usize * 4 + px;
192
+ let bytes_offset_5 = y_dst_shift + dst_stride as usize * 5 + px;
193
+
194
+ let dst_ptr_0 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_0) as * mut u8 ;
195
+ let dst_ptr_1 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_1) as * mut u8 ;
196
+ let dst_ptr_2 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_2) as * mut u8 ;
197
+ let dst_ptr_3 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_3) as * mut u8 ;
198
+ let dst_ptr_4 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_4) as * mut u8 ;
199
+ let dst_ptr_5 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_5) as * mut u8 ;
200
+
201
+ write_u8 :: < CN > ( dst_ptr_0, _mm256_castsi256_si128 ( px_80) ) ;
202
+ write_u8 :: < CN > ( dst_ptr_1, _mm256_extracti128_si256 :: < 1 > ( px_80) ) ;
203
+ write_u8 :: < CN > ( dst_ptr_2, _mm256_castsi256_si128 ( px_81) ) ;
204
+ write_u8 :: < CN > ( dst_ptr_3, _mm256_extracti128_si256 :: < 1 > ( px_81) ) ;
205
+ write_u8 :: < CN > ( dst_ptr_4, _mm256_castsi256_si128 ( px_82) ) ;
206
+ write_u8 :: < CN > ( dst_ptr_5, _mm256_extracti128_si256 :: < 1 > ( px_82) ) ;
207
+ }
165
208
166
209
// subtract previous
167
210
unsafe {
@@ -196,7 +239,7 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
196
239
197
240
// add next
198
241
unsafe {
199
- let next_x = ( x + half_kernel) . min ( width - 1 ) as usize ;
242
+ let next_x = ( x + half_kernel + 1 ) . min ( width - 1 ) as usize ;
200
243
201
244
let next = next_x * CN ;
202
245
@@ -223,51 +266,6 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
223
266
store_1 = _mm256_add_epi32 ( store_1, edge_colors_1) ;
224
267
store_2 = _mm256_add_epi32 ( store_2, edge_colors_2) ;
225
268
}
226
-
227
- let px = x as usize * CN ;
228
-
229
- unsafe {
230
- let scale_store_ps0 = _mm256_cvtepi32_ps ( store_0) ;
231
- let scale_store_ps1 = _mm256_cvtepi32_ps ( store_1) ;
232
- let scale_store_ps2 = _mm256_cvtepi32_ps ( store_2) ;
233
-
234
- let r0 = _mm256_mul_ps ( scale_store_ps0, v_weight) ;
235
- let r1 = _mm256_mul_ps ( scale_store_ps1, v_weight) ;
236
- let r2 = _mm256_mul_ps ( scale_store_ps2, v_weight) ;
237
-
238
- let scale_store0 = _mm256_cvtps_epi32 ( r0) ;
239
- let scale_store1 = _mm256_cvtps_epi32 ( r1) ;
240
- let scale_store2 = _mm256_cvtps_epi32 ( r2) ;
241
-
242
- let px_160 = _mm256_packus_epi32 ( scale_store0, _mm256_setzero_si256 ( ) ) ;
243
- let px_161 = _mm256_packus_epi32 ( scale_store1, _mm256_setzero_si256 ( ) ) ;
244
- let px_162 = _mm256_packus_epi32 ( scale_store2, _mm256_setzero_si256 ( ) ) ;
245
-
246
- let px_80 = _mm256_packus_epi16 ( px_160, _mm256_setzero_si256 ( ) ) ;
247
- let px_81 = _mm256_packus_epi16 ( px_161, _mm256_setzero_si256 ( ) ) ;
248
- let px_82 = _mm256_packus_epi16 ( px_162, _mm256_setzero_si256 ( ) ) ;
249
-
250
- let bytes_offset_0 = y_dst_shift + px;
251
- let bytes_offset_1 = y_dst_shift + dst_stride as usize + px;
252
- let bytes_offset_2 = y_dst_shift + dst_stride as usize * 2 + px;
253
- let bytes_offset_3 = y_dst_shift + dst_stride as usize * 3 + px;
254
- let bytes_offset_4 = y_dst_shift + dst_stride as usize * 4 + px;
255
- let bytes_offset_5 = y_dst_shift + dst_stride as usize * 5 + px;
256
-
257
- let dst_ptr_0 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_0) as * mut u8 ;
258
- let dst_ptr_1 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_1) as * mut u8 ;
259
- let dst_ptr_2 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_2) as * mut u8 ;
260
- let dst_ptr_3 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_3) as * mut u8 ;
261
- let dst_ptr_4 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_4) as * mut u8 ;
262
- let dst_ptr_5 = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset_5) as * mut u8 ;
263
-
264
- write_u8 :: < CN > ( dst_ptr_0, _mm256_castsi256_si128 ( px_80) ) ;
265
- write_u8 :: < CN > ( dst_ptr_1, _mm256_extracti128_si256 :: < 1 > ( px_80) ) ;
266
- write_u8 :: < CN > ( dst_ptr_2, _mm256_castsi256_si128 ( px_81) ) ;
267
- write_u8 :: < CN > ( dst_ptr_3, _mm256_extracti128_si256 :: < 1 > ( px_81) ) ;
268
- write_u8 :: < CN > ( dst_ptr_4, _mm256_castsi256_si128 ( px_82) ) ;
269
- write_u8 :: < CN > ( dst_ptr_5, _mm256_extracti128_si256 :: < 1 > ( px_82) ) ;
270
- }
271
269
}
272
270
273
271
yy += 6 ;
@@ -286,7 +284,7 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
286
284
}
287
285
288
286
unsafe {
289
- for x in 1usize ..half_kernel as usize {
287
+ for x in 1usize ..= half_kernel as usize {
290
288
let px = x. min ( width as usize - 1 ) * CN ;
291
289
let s_ptr = src. as_ptr ( ) . add ( y_src_shift + px) ;
292
290
let edge_colors = load_u8_s32_fast :: < CN > ( s_ptr) ;
@@ -295,7 +293,14 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
295
293
}
296
294
297
295
for x in 0 ..width {
298
- // preload edge pixels
296
+ let px = x as usize * CN ;
297
+
298
+ unsafe {
299
+ let r0 = _mm_mul_ps ( _mm_cvtepi32_ps ( store) , _mm256_castps256_ps128 ( v_weight) ) ;
300
+ let bytes_offset = y_dst_shift + px;
301
+ let ptr = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset) as * mut u8 ;
302
+ store_u8_u32 :: < CN > ( ptr, _mm_cvtps_epi32 ( r0) ) ;
303
+ }
299
304
300
305
// subtract previous
301
306
unsafe {
@@ -308,23 +313,14 @@ unsafe fn box_blur_horizontal_pass_impl<const CN: usize>(
308
313
309
314
// add next
310
315
unsafe {
311
- let next_x = ( x + half_kernel) . min ( width - 1 ) as usize ;
316
+ let next_x = ( x + half_kernel + 1 ) . min ( width - 1 ) as usize ;
312
317
313
318
let next = next_x * CN ;
314
319
315
320
let s_ptr = src. as_ptr ( ) . add ( y_src_shift + next) ;
316
321
let edge_colors = load_u8_s32_fast :: < CN > ( s_ptr) ;
317
322
store = _mm_add_epi32 ( store, edge_colors) ;
318
323
}
319
-
320
- let px = x as usize * CN ;
321
-
322
- unsafe {
323
- let r0 = _mm_mul_ps ( _mm_cvtepi32_ps ( store) , _mm256_castps256_ps128 ( v_weight) ) ;
324
- let bytes_offset = y_dst_shift + px;
325
- let ptr = unsafe_dst. slice . as_ptr ( ) . add ( bytes_offset) as * mut u8 ;
326
- store_u8_u32 :: < CN > ( ptr, _mm_cvtps_epi32 ( r0) ) ;
327
- }
328
324
}
329
325
}
330
326
}
0 commit comments