From 77ffeeabd623b6331e1fafc3142a8a3132e8ce6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Sun, 26 Mar 2023 17:03:29 +0200 Subject: [PATCH 1/2] Create vector constants inline and not via ROS --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 113 +++++++++--------- .../PixelConversion_PackFromRgbPlanes.cs | 3 +- 2 files changed, 56 insertions(+), 60 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index ce6f335a8f..8eebf7fb9e 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -13,33 +13,38 @@ internal static partial class SimdUtils { public static class HwIntrinsics { - public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + [MethodImpl(MethodImplOptions.AggressiveInlining)] // too much IL for JIT to inline, so give a hint + public static Vector256 PermuteMaskDeinterleave8x32() => Vector256.Create(0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsInt32(); - public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PermuteMaskEvenOdd8x32() => Vector256.Create(0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); - public static ReadOnlySpan PermuteMaskSwitchInnerDWords8x32 => new byte[] { 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0 }; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PermuteMaskSwitchInnerDWords8x32() => Vector256.Create(0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); - private static ReadOnlySpan MoveFirst24BytesToSeparateLanes => new byte[] { 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 MoveFirst24BytesToSeparateLanes() => Vector256.Create(0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); - internal static ReadOnlySpan ExtractRgb => new byte[] { 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF }; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 ExtractRgb() => Vector256.Create(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF); - private static ReadOnlySpan ShuffleMaskPad4Nx16 => new byte[] { 0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80 }; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 ShuffleMaskPad4Nx16() => Vector128.Create(0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80); - private static ReadOnlySpan ShuffleMaskSlice4Nx16 => new byte[] { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80 }; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 ShuffleMaskSlice4Nx16() => Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80); - private static ReadOnlySpan ShuffleMaskShiftAlpha => - new byte[] - { - 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15, - 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15 - }; +#pragma warning disable SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 ShuffleMaskShiftAlpha() => Vector256.Create((byte) + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15, + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); - public static ReadOnlySpan PermuteMaskShiftAlpha8x32 => - new byte[] - { - 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, - 5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 - }; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PermuteMaskShiftAlpha8x32() => Vector256.Create( + 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); +#pragma warning restore SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines /// /// Shuffle single-precision (32-bit) floating-point elements in @@ -189,7 +194,7 @@ public static void Shuffle4Slice3Reduce( { if (Ssse3.IsSupported) { - int remainder = source.Length % (Vector128.Count * 4); + int remainder = source.Length & (Vector128.Count * 4 - 1); // bit-hack for modulo int sourceCount = source.Length - remainder; int destCount = (int)((uint)sourceCount * 3 / 4); @@ -221,7 +226,7 @@ private static void Shuffle4( ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - nint n = (nint)(uint)(dest.Length / Vector256.Count); + nint n = (nint)((uint)dest.Length / (uint)Vector256.Count); nint m = Numerics.Modulo4(n); nint u = n - m; @@ -253,7 +258,7 @@ private static void Shuffle4( ref Vector128 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - nint n = (nint)(uint)dest.Length / Vector128.Count; + nint n = (nint)((uint)dest.Length / (uint)Vector128.Count); nint m = Numerics.Modulo4(n); nint u = n - m; @@ -306,7 +311,7 @@ private static void Shuffle4( ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - nint n = (nint)(uint)dest.Length / Vector256.Count; + nint n = (nint)((uint)dest.Length / (uint)Vector256.Count); nint m = Numerics.Modulo4(n); nint u = n - m; @@ -342,7 +347,7 @@ private static void Shuffle4( ref Vector128 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - nint n = (nint)(uint)dest.Length / Vector128.Count; + nint n = (nint)((uint)dest.Length / (uint)Vector128.Count); nint m = Numerics.Modulo4(n); nint u = n - m; @@ -375,10 +380,8 @@ private static void Shuffle3( { if (Ssse3.IsSupported) { - ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16); - Vector128 vmask = Unsafe.As>(ref vmaskBase); - ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16); - Vector128 vmasko = Unsafe.As>(ref vmaskoBase); + Vector128 vmask = ShuffleMaskPad4Nx16(); + Vector128 vmasko = ShuffleMaskSlice4Nx16(); Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12); Span bytes = stackalloc byte[Vector128.Count]; @@ -440,8 +443,7 @@ private static void Pad3Shuffle4( { if (Ssse3.IsSupported) { - ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16); - Vector128 vmask = Unsafe.As>(ref vmaskBase); + Vector128 vmask = ShuffleMaskPad4Nx16(); Vector128 vfill = Vector128.Create(0xff000000ff000000ul).AsByte(); Span bytes = stackalloc byte[Vector128.Count]; @@ -484,8 +486,7 @@ private static void Shuffle4Slice3( { if (Ssse3.IsSupported) { - ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16); - Vector128 vmasko = Unsafe.As>(ref vmaskoBase); + Vector128 vmasko = ShuffleMaskSlice4Nx16(); Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12); Span bytes = stackalloc byte[Vector128.Count]; @@ -542,9 +543,9 @@ private static void Shuffle4Slice3( /// The . [MethodImpl(InliningOptions.AlwaysInline)] public static Vector256 MultiplyAdd( - in Vector256 va, - in Vector256 vm0, - in Vector256 vm1) + Vector256 va, + Vector256 vm0, + Vector256 vm1) { if (Fma.IsSupported) { @@ -565,9 +566,9 @@ public static Vector256 MultiplyAdd( /// The . [MethodImpl(InliningOptions.ShortMethod)] public static Vector256 MultiplySubtract( - in Vector256 vs, - in Vector256 vm0, - in Vector256 vm1) + Vector256 vs, + Vector256 vm0, + Vector256 vm1) { if (Fma.IsSupported) { @@ -587,9 +588,9 @@ public static Vector256 MultiplySubtract( /// The . [MethodImpl(InliningOptions.ShortMethod)] public static Vector256 MultiplyAddNegated( - in Vector256 a, - in Vector256 b, - in Vector256 c) + Vector256 a, + Vector256 b, + Vector256 c) { if (Fma.IsSupported) { @@ -655,7 +656,7 @@ internal static unsafe void ByteToNormalizedFloat( ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - var scale = Vector256.Create(1 / (float)byte.MaxValue); + Vector256 scale = Vector256.Create(1 / (float)byte.MaxValue); for (nuint i = 0; i < n; i++) { @@ -688,7 +689,7 @@ internal static unsafe void ByteToNormalizedFloat( ref Vector128 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - var scale = Vector128.Create(1 / (float)byte.MaxValue); + Vector128 scale = Vector128.Create(1 / (float)byte.MaxValue); Vector128 zero = Vector128.Zero; for (nuint i = 0; i < n; i++) @@ -790,9 +791,8 @@ internal static void NormalizedFloatToByteSaturate( ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - var scale = Vector256.Create((float)byte.MaxValue); - ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32); - Vector256 mask = Unsafe.As>(ref maskBase); + Vector256 scale = Vector256.Create((float)byte.MaxValue); + Vector256 mask = PermuteMaskDeinterleave8x32(); for (nuint i = 0; i < n; i++) { @@ -829,7 +829,7 @@ internal static void NormalizedFloatToByteSaturate( ref Vector128 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - var scale = Vector128.Create((float)byte.MaxValue); + Vector128 scale = Vector128.Create((float)byte.MaxValue); for (nuint i = 0; i < n; i++) { @@ -866,14 +866,12 @@ internal static void PackFromRgbPlanesAvx2Reduce( nuint count = (uint)redChannel.Length / (uint)Vector256.Count; - ref byte control1Bytes = ref MemoryMarshal.GetReference(PermuteMaskEvenOdd8x32); - Vector256 control1 = Unsafe.As>(ref control1Bytes); + Vector256 control1 = PermuteMaskEvenOdd8x32(); - ref byte control2Bytes = ref MemoryMarshal.GetReference(PermuteMaskShiftAlpha8x32); - Vector256 control2 = Unsafe.As>(ref control2Bytes); - var a = Vector256.Create((byte)255); + Vector256 control2 = PermuteMaskShiftAlpha8x32(); + Vector256 a = Vector256.Create((byte)255); - Vector256 shuffleAlpha = Unsafe.As>(ref MemoryMarshal.GetReference(ShuffleMaskShiftAlpha)); + Vector256 shuffleAlpha = ShuffleMaskShiftAlpha(); for (nuint i = 0; i < count; i++) { @@ -937,9 +935,8 @@ internal static void PackFromRgbPlanesAvx2Reduce( ref Vector256 dBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); nuint count = (uint)redChannel.Length / (uint)Vector256.Count; - ref byte control1Bytes = ref MemoryMarshal.GetReference(PermuteMaskEvenOdd8x32); - Vector256 control1 = Unsafe.As>(ref control1Bytes); - var a = Vector256.Create((byte)255); + Vector256 control1 = PermuteMaskEvenOdd8x32(); + Vector256 a = Vector256.Create((byte)255); for (nuint i = 0; i < count; i++) { @@ -988,8 +985,8 @@ internal static void UnpackToRgbPlanesAvx2Reduce( ref Vector256 destGRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(greenChannel)); ref Vector256 destBRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(blueChannel)); - Vector256 extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); - Vector256 extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); + Vector256 extractToLanesMask = MoveFirst24BytesToSeparateLanes(); + Vector256 extractRgbMask = ExtractRgb(); Vector256 rgb, rg, bx; Vector256 r, g, b; diff --git a/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_PackFromRgbPlanes.cs b/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_PackFromRgbPlanes.cs index ddf192af87..a42c6c253c 100644 --- a/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_PackFromRgbPlanes.cs +++ b/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_PackFromRgbPlanes.cs @@ -207,8 +207,7 @@ public void Rgba32_Avx2_Float() nuint count = (uint)this.Count / (uint)Vector256.Count; - ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskEvenOdd8x32); - Vector256 vcontrol = Unsafe.As>(ref control); + Vector256 vcontrol = SimdUtils.HwIntrinsics.PermuteMaskEvenOdd8x32().AsInt32(); var va = Vector256.Create(1F); From cdf6eed33b7392e1abccea2da277b22df8498945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Sun, 26 Mar 2023 17:19:17 +0200 Subject: [PATCH 2/2] Fixed build --- src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 8eebf7fb9e..d4705fed74 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -194,7 +194,7 @@ public static void Shuffle4Slice3Reduce( { if (Ssse3.IsSupported) { - int remainder = source.Length & (Vector128.Count * 4 - 1); // bit-hack for modulo + int remainder = source.Length & ((Vector128.Count * 4) - 1); // bit-hack for modulo int sourceCount = source.Length - remainder; int destCount = (int)((uint)sourceCount * 3 / 4);