@@ -412,23 +412,142 @@ private static Buffer2DRegion<byte> TrimTransparentPixels(Buffer2D<byte> buffer,
412412 int bottom = int . MaxValue ;
413413 int left = int . MaxValue ;
414414 int right = int . MinValue ;
415-
416- // Run through th buffer in a single pass. Use variables to track the min/max values.
417415 int minY = - 1 ;
418416 bool isTransparentRow = true ;
417+
418+ // Run through the buffer in a single pass. Use variables to track the min/max values.
419419 for ( int y = 0 ; y < buffer . Height ; y ++ )
420420 {
421421 isTransparentRow = true ;
422422 Span < byte > rowSpan = buffer . DangerousGetRowSpan ( y ) ;
423+ ref byte rowPtr = ref MemoryMarshal . GetReference ( rowSpan ) ;
424+ nint rowLength = ( nint ) ( uint ) rowSpan . Length ;
425+ nint x = 0 ;
426+
427+ #if NET7_0_OR_GREATER
428+ if ( Vector128 . IsHardwareAccelerated && rowLength >= Vector128 < byte > . Count )
429+ {
430+ Vector256 < byte > trimmableVec256 = Vector256 . Create ( trimmableIndex ) ;
431+
432+ if ( Vector256 . IsHardwareAccelerated && rowLength >= Vector256 < byte > . Count )
433+ {
434+ do
435+ {
436+ Vector256 < byte > vec = Vector256 . LoadUnsafe ( ref rowPtr , ( nuint ) x ) ;
437+ Vector256 < byte > notEquals = ~ Vector256 . Equals ( vec , trimmableVec256 ) ;
438+ uint mask = notEquals . ExtractMostSignificantBits ( ) ;
439+
440+ if ( mask != 0 )
441+ {
442+ isTransparentRow = false ;
443+ nint start = x + ( nint ) uint . TrailingZeroCount ( mask ) ;
444+ nint end = ( nint ) uint . LeadingZeroCount ( mask ) ;
445+
446+ // end is from the end, but we need the index from the beginning
447+ end = x + Vector256 < byte > . Count - 1 - end ;
448+
449+ left = Math . Min ( left , ( int ) start ) ;
450+ right = Math . Max ( right , ( int ) end ) ;
451+ }
452+
453+ x += Vector256 < byte > . Count ;
454+ }
455+ while ( x <= rowLength - Vector256 < byte > . Count ) ;
456+ }
457+
458+ Vector128 < byte > trimmableVec = Vector256 . IsHardwareAccelerated
459+ ? trimmableVec256 . GetLower ( )
460+ : Vector128 . Create ( trimmableIndex ) ;
461+
462+ while ( x <= rowLength - Vector128 < byte > . Count )
463+ {
464+ Vector128 < byte > vec = Vector128 . LoadUnsafe ( ref rowPtr , ( nuint ) x ) ;
465+ Vector128 < byte > notEquals = ~ Vector128 . Equals ( vec , trimmableVec ) ;
466+ uint mask = notEquals . ExtractMostSignificantBits ( ) ;
467+
468+ if ( mask != 0 )
469+ {
470+ isTransparentRow = false ;
471+ nint start = x + ( nint ) uint . TrailingZeroCount ( mask ) ;
472+ nint end = ( nint ) uint . LeadingZeroCount ( mask ) - Vector128 < byte > . Count ;
473+
474+ // end is from the end, but we need the index from the beginning
475+ end = x + Vector128 < byte > . Count - 1 - end ;
476+
477+ left = Math . Min ( left , ( int ) start ) ;
478+ right = Math . Max ( right , ( int ) end ) ;
479+ }
480+
481+ x += Vector128 < byte > . Count ;
482+ }
483+ }
484+ #else
485+ if ( Sse41 . IsSupported && rowLength >= Vector128 < byte > . Count )
486+ {
487+ Vector256 < byte > trimmableVec256 = Vector256 . Create ( trimmableIndex ) ;
488+
489+ if ( Avx2 . IsSupported && rowLength >= Vector256 < byte > . Count )
490+ {
491+ do
492+ {
493+ Vector256 < byte > vec = Unsafe . ReadUnaligned < Vector256 < byte > > ( ref Unsafe . Add ( ref rowPtr , x ) ) ;
494+ Vector256 < byte > notEquals = Avx2 . CompareEqual ( vec , trimmableVec256 ) ;
495+ notEquals = Avx2 . Xor ( notEquals , Vector256 < byte > . AllBitsSet ) ;
496+ int mask = Avx2 . MoveMask ( notEquals ) ;
497+
498+ if ( mask != 0 )
499+ {
500+ isTransparentRow = false ;
501+ nint start = x + ( nint ) ( uint ) BitOperations . TrailingZeroCount ( mask ) ;
502+ nint end = ( nint ) ( uint ) BitOperations . LeadingZeroCount ( ( uint ) mask ) ;
503+
504+ // end is from the end, but we need the index from the beginning
505+ end = x + Vector256 < byte > . Count - 1 - end ;
506+
507+ left = Math . Min ( left , ( int ) start ) ;
508+ right = Math . Max ( right , ( int ) end ) ;
509+ }
510+
511+ x += Vector256 < byte > . Count ;
512+ }
513+ while ( x <= rowLength - Vector256 < byte > . Count ) ;
514+ }
515+
516+ Vector128 < byte > trimmableVec = Sse41 . IsSupported
517+ ? trimmableVec256 . GetLower ( )
518+ : Vector128 . Create ( trimmableIndex ) ;
519+
520+ while ( x <= rowLength - Vector128 < byte > . Count )
521+ {
522+ Vector128 < byte > vec = Unsafe . ReadUnaligned < Vector128 < byte > > ( ref Unsafe . Add ( ref rowPtr , x ) ) ;
523+ Vector128 < byte > notEquals = Sse2 . CompareEqual ( vec , trimmableVec ) ;
524+ notEquals = Sse2 . Xor ( notEquals , Vector128 < byte > . AllBitsSet ) ;
525+ int mask = Sse2 . MoveMask ( notEquals ) ;
526+
527+ if ( mask != 0 )
528+ {
529+ isTransparentRow = false ;
530+ nint start = x + ( nint ) ( uint ) BitOperations . TrailingZeroCount ( mask ) ;
531+ nint end = ( nint ) ( uint ) BitOperations . LeadingZeroCount ( ( uint ) mask ) - Vector128 < byte > . Count ;
423532
424- // TODO: It may be possible to optimize this inner loop using SIMD.
425- for ( int x = 0 ; x < rowSpan . Length ; x ++ )
533+ // end is from the end, but we need the index from the beginning
534+ end = x + Vector128 < byte > . Count - 1 - end ;
535+
536+ left = Math . Min ( left , ( int ) start ) ;
537+ right = Math . Max ( right , ( int ) end ) ;
538+ }
539+
540+ x += Vector128 < byte > . Count ;
541+ }
542+ }
543+ #endif
544+ for ( ; x < rowLength ; ++ x )
426545 {
427- if ( rowSpan [ x ] != trimmableIndex )
546+ if ( Unsafe . Add ( ref rowPtr , x ) != trimmableIndex )
428547 {
429548 isTransparentRow = false ;
430- left = Math . Min ( left , x ) ;
431- right = Math . Max ( right , x ) ;
549+ left = Math . Min ( left , ( int ) x ) ;
550+ right = Math . Max ( right , ( int ) x ) ;
432551 }
433552 }
434553
0 commit comments