Closed
Description
Calling `Vector128.Shuffle` or `Vector256.Shuffle` produces different code than calling `Shuffle` directly on the corresponding hardware intrinsic class (`Ssse3.Shuffle` / `Avx2.Shuffle`).
Reproduction Steps
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace ConsoleApp7;
// Minimal codegen repro. Each ViaVector/Explicit pair forwards the same two arguments to a
// byte shuffle: once through the cross-platform Vector128/Vector256 helper and once through
// the x86 hardware intrinsic, so the JIT output of the two forms can be compared side by side.
//
// NOTE(review): the helper and the intrinsic may not be interchangeable for every index value.
// Vector128/Vector256.Shuffle is documented as returning zero for out-of-range indices, whereas
// pshufb only zeroes an element when the index byte's high bit is set and otherwise uses the low
// four bits; vpshufb on ymm also appears to shuffle within each 128-bit lane rather than across
// the whole vector. Confirm against the API/instruction docs before treating identical codegen
// as the expected outcome.
internal static class Class1
{
// 128-bit shuffle through the cross-platform helper; the listing under "Actual behavior"
// shows this compiled as an out-of-line call to Vector128.Shuffle.
static Vector128<byte> ViaVector(Vector128<byte> a, Vector128<byte> b) => Vector128.Shuffle(a, b);
// 128-bit shuffle through the SSSE3 intrinsic; compiled to a single vpshufb in the listing
// under "Expected behavior".
static Vector128<byte> Explicit(Vector128<byte> a, Vector128<byte> b) => Ssse3.Shuffle(a, b);
// 256-bit shuffle through the cross-platform helper; compiled as an out-of-line call to
// Vector256.Shuffle in the listing under "Actual behavior".
static Vector256<byte> ViaVector(Vector256<byte> a, Vector256<byte> b) => Vector256.Shuffle(a, b);
// 256-bit shuffle through the AVX2 intrinsic; compiled to a single vpshufb on ymm registers
// in the listing under "Expected behavior".
static Vector256<byte> Explicit(Vector256<byte> a, Vector256<byte> b) => Avx2.Shuffle(a, b);
}
Expected behavior
Expected `Vector128.Shuffle` / `Vector256.Shuffle` to produce the same code as the direct intrinsic calls:
; Assembly listing for method ConsoleApp7.Class1:Explicit(System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
;
; Reviewer notes (register roles as used by this listing):
;   rcx = address the 16-byte result is written to; also returned in rax
;   rdx = address of the first vector operand (a, the bytes being shuffled)
;   r8  = address of the second vector operand (b, the shuffle control bytes)
; Leaf routine: no stack adjustment and no calls.
G_M000_IG01: ;; offset=0x0000
G_M000_IG02: ;; offset=0x0000
; xmm0 = a
vmovups xmm0, xmmword ptr [rdx]
; xmm0 = a shuffled by b; b is consumed straight from memory as the control operand
vpshufb xmm0, xmm0, xmmword ptr [r8]
; write the result through rcx
vmovups xmmword ptr [rcx], xmm0
; hand the result address back to the caller
mov rax, rcx
G_M000_IG03: ;; offset=0x0010
ret
; Total bytes of code 17
; Assembly listing for method ConsoleApp7.Class1:Explicit(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
;
; Reviewer notes (register roles as used by this listing):
;   rcx = address the 32-byte result is written to; also returned in rax
;   rdx = address of the first vector operand (a, the bytes being shuffled)
;   r8  = address of the second vector operand (b, the shuffle control bytes)
; Leaf routine: no stack adjustment and no calls.
G_M000_IG01: ;; offset=0x0000
G_M000_IG02: ;; offset=0x0000
; ymm0 = a
vmovups ymm0, ymmword ptr [rdx]
; ymm0 = a shuffled by b; b is consumed straight from memory as the control operand
vpshufb ymm0, ymm0, ymmword ptr [r8]
; write the result through rcx
vmovups ymmword ptr [rcx], ymm0
; hand the result address back to the caller
mov rax, rcx
G_M000_IG03: ;; offset=0x0010
; clear the upper halves of the ymm registers before returning, since ymm0 was used above
vzeroupper
ret
; Total bytes of code 20
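Actual behavior
The `Vector128.Shuffle` / `Vector256.Shuffle` versions are not expanded inline: each `ViaVector` method copies both arguments to the stack, makes an out-of-line call to the `Shuffle` helper, and then copies the result back to the caller's buffer: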
; Assembly listing for method ConsoleApp7.Class1:ViaVector(System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
;
; Reviewer notes (register roles and stack slots as used by this listing):
;   rcx = address the 16-byte result is written to (kept in rbx across the call)
;   rdx = address of the first vector operand (a)
;   r8  = address of the second vector operand (b)
;   [rsp+0x20] = local copy of b, [rsp+0x30] = local copy of a, [rsp+0x40] = call result
;   The 32 bytes below 0x20 are presumably the Windows x64 shadow space for the callee - TODO confirm.
; Non-leaf: the shuffle is performed by an out-of-line call rather than a vpshufb.
G_M000_IG01: ;; offset=0x0000
; rbx is callee-saved; it is used below to keep the result address alive across the call
push rbx
sub rsp, 80
mov rbx, rcx
G_M000_IG02: ;; offset=0x0008
; copy a to its stack slot
vmovups xmm0, xmmword ptr [rdx]
vmovaps xmmword ptr [rsp+0x30], xmm0
; copy b to its stack slot
vmovups xmm0, xmmword ptr [r8]
vmovaps xmmword ptr [rsp+0x20], xmm0
; set up the call: rdx -> copy of a, r8 -> copy of b, rcx -> temporary result slot
lea rdx, [rsp+0x30]
lea r8, [rsp+0x20]
lea rcx, [rsp+0x40]
; indirect call to the Vector128.Shuffle method itself, i.e. the helper was not expanded inline
call [System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]):System.Runtime.Intrinsics.Vector128`1[ubyte]]
; copy the result from the temporary slot to the caller's buffer
vmovaps xmm0, xmmword ptr [rsp+0x40]
vmovups xmmword ptr [rbx], xmm0
; hand the result address back to the caller
mov rax, rbx
G_M000_IG03: ;; offset=0x003F
; release the frame and restore rbx
add rsp, 80
pop rbx
ret
; Total bytes of code 69
; Assembly listing for method ConsoleApp7.Class1:ViaVector(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
;
; Reviewer notes (register roles and stack slots as used by this listing):
;   rcx = address the 32-byte result is written to (kept in rbx across the call)
;   rdx = address of the first vector operand (a)
;   r8  = address of the second vector operand (b)
;   [rsp+0x20] = local copy of b, [rsp+0x40] = local copy of a, [rsp+0x60] = call result
;   The 32 bytes below 0x20 are presumably the Windows x64 shadow space for the callee - TODO confirm.
; Non-leaf: the shuffle is performed by an out-of-line call rather than a vpshufb.
G_M000_IG01: ;; offset=0x0000
; rbx is callee-saved; it is used below to keep the result address alive across the call
push rbx
sub rsp, 144
mov rbx, rcx
G_M000_IG02: ;; offset=0x000B
; copy a to its stack slot
vmovups ymm0, ymmword ptr [rdx]
vmovups ymmword ptr [rsp+0x40], ymm0
; copy b to its stack slot
vmovups ymm0, ymmword ptr [r8]
vmovups ymmword ptr [rsp+0x20], ymm0
; set up the call: rdx -> copy of a, r8 -> copy of b, rcx -> temporary result slot
lea rdx, [rsp+0x40]
lea r8, [rsp+0x20]
lea rcx, [rsp+0x60]
; indirect call to the Vector256.Shuffle method itself, i.e. the helper was not expanded inline
call [System.Runtime.Intrinsics.Vector256:Shuffle(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte]]
; copy the result from the temporary slot to the caller's buffer
vmovups ymm0, ymmword ptr [rsp+0x60]
vmovups ymmword ptr [rbx], ymm0
; hand the result address back to the caller
mov rax, rbx
G_M000_IG03: ;; offset=0x0042
; clear the upper halves of the ymm registers before returning, since ymm0 was used above
vzeroupper
; release the frame and restore rbx
add rsp, 144
pop rbx
ret
; Total bytes of code 78
Regression?
No response
Known Workarounds
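Call the hardware intrinsic classes directly (`Ssse3.Shuffle` for 128-bit, `Avx2.Shuffle` for 256-bit) instead of the cross-platform `Vector128.Shuffle` / `Vector256.Shuffle` helpers.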
Configuration
- dotnet version 9.0.100
- Ryzen 7 3700X
- Windows 10 Pro 10.0.19045 Build 19045
Other information
No response