Skip to content

Vector128 and Vector256 Shuffle produces unexpected asm #113582

Closed
@ovska

Description

@ovska

Description

Calling Vector128.Shuffle or Vector256.Shuffle produces different code than calling the corresponding hardware intrinsic (Ssse3.Shuffle or Avx2.Shuffle) directly.

Reproduction Steps

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace ConsoleApp7;

// Minimal reproduction. Each overload pair forwards the same two arguments to a byte
// shuffle: once through the cross-platform Vector128/Vector256 API (ViaVector) and once
// through the x86 hardware intrinsic (Explicit), so the generated code can be compared.
// NOTE(review): the two APIs are not documented as interchangeable for every index value
// (Vector128/256.Shuffle zero elements whose index is out of range, while pshufb only
// zeroes when the index's high bit is set) - confirm whether identical codegen is a valid
// expectation for non-constant indices.
internal static class Class1
{
    // 128-bit shuffle through the cross-platform API.
    static Vector128<byte> ViaVector(Vector128<byte> a, Vector128<byte> b) => Vector128.Shuffle(a, b);
    // 128-bit shuffle through the SSSE3 intrinsic, called directly.
    static Vector128<byte> Explicit(Vector128<byte> a, Vector128<byte> b) => Ssse3.Shuffle(a, b);

    // 256-bit shuffle through the cross-platform API.
    static Vector256<byte> ViaVector(Vector256<byte> a, Vector256<byte> b) => Vector256.Shuffle(a, b);
    // 256-bit shuffle through the AVX2 intrinsic, called directly.
    static Vector256<byte> Explicit(Vector256<byte> a, Vector256<byte> b) => Avx2.Shuffle(a, b);
}

Expected behavior

Expected Vector128.Shuffle and Vector256.Shuffle to produce the same output as the direct intrinsic calls:

; Assembly listing for method ConsoleApp7.Class1:Explicit(System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

; Explicit (Vector128): the whole method is one load, one vpshufb and one store;
; no stack frame is set up and no call is made.
; NOTE(review): rdx and r8 look like the addresses of the two vector arguments and rcx the
; address of the return slot - inferred from the loads/stores below; confirm against the
; Windows x64 calling convention.
G_M000_IG01:                ;; offset=0x0000
 
G_M000_IG02:                ;; offset=0x0000
       ; xmm0 = 16 bytes read from [rdx]
       vmovups  xmm0, xmmword ptr [rdx]
       ; shuffle the bytes of xmm0, taking the control bytes straight from memory at [r8]
       vpshufb  xmm0, xmm0, xmmword ptr [r8]
       ; write the result to [rcx] and hand the same address back in rax
       vmovups  xmmword ptr [rcx], xmm0
       mov      rax, rcx
 
G_M000_IG03:                ;; offset=0x0010
       ret      
 
; Total bytes of code 17

; Assembly listing for method ConsoleApp7.Class1:Explicit(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

; Explicit (Vector256): same shape as the 128-bit version, using ymm registers;
; no stack frame is set up and no call is made.
; NOTE(review): rdx and r8 look like the addresses of the two vector arguments and rcx the
; address of the return slot - inferred from the loads/stores below; confirm against the
; Windows x64 calling convention.
G_M000_IG01:                ;; offset=0x0000
 
G_M000_IG02:                ;; offset=0x0000
       ; ymm0 = 32 bytes read from [rdx]
       vmovups  ymm0, ymmword ptr [rdx]
       ; shuffle the bytes of ymm0, taking the control bytes straight from memory at [r8]
       vpshufb  ymm0, ymm0, ymmword ptr [r8]
       ; write the result to [rcx] and hand the same address back in rax
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx
 
G_M000_IG03:                ;; offset=0x0010
       ; clear the upper halves of the ymm registers before returning to the caller
       vzeroupper 
       ret      
 
; Total bytes of code 20

Actual behavior

; Assembly listing for method ConsoleApp7.Class1:ViaVector(System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

; ViaVector (Vector128): rather than shuffling inline, this code copies both inputs to
; stack slots and calls the Vector128.Shuffle method, then copies the result back out.
; NOTE(review): [rsp+0x00..0x1F] is never referenced here - presumably the 32-byte shadow
; space the Windows x64 convention reserves for the callee; confirm.
G_M000_IG01:                ;; offset=0x0000
       ; save rbx, reserve 80 bytes of stack, and keep the incoming rcx in rbx across the call
       push     rbx
       sub      rsp, 80
       mov      rbx, rcx
 
G_M000_IG02:                ;; offset=0x0008
       ; copy 16 bytes from [rdx] to [rsp+0x30] and 16 bytes from [r8] to [rsp+0x20]
       vmovups  xmm0, xmmword ptr [rdx]
       vmovaps  xmmword ptr [rsp+0x30], xmm0
       vmovups  xmm0, xmmword ptr [r8]
       vmovaps  xmmword ptr [rsp+0x20], xmm0
       ; pass the addresses of the two copies in rdx/r8 and of a third slot in rcx
       lea      rdx, [rsp+0x30]
       lea      r8, [rsp+0x20]
       lea      rcx, [rsp+0x40]
       call     [System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]):System.Runtime.Intrinsics.Vector128`1[ubyte]]
       ; copy the 16 bytes at [rsp+0x40] (the slot whose address was passed in rcx) to [rbx],
       ; then return the original rcx value in rax
       vmovaps  xmm0, xmmword ptr [rsp+0x40]
       vmovups  xmmword ptr [rbx], xmm0
       mov      rax, rbx
 
G_M000_IG03:                ;; offset=0x003F
       ; undo the prologue: release the 80 bytes and restore rbx
       add      rsp, 80
       pop      rbx
       ret      
 
; Total bytes of code 69

; Assembly listing for method ConsoleApp7.Class1:ViaVector(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

; ViaVector (Vector256): rather than shuffling inline, this code copies both inputs to
; stack slots and calls the Vector256.Shuffle method, then copies the result back out.
; NOTE(review): [rsp+0x00..0x1F] is never referenced here - presumably the 32-byte shadow
; space the Windows x64 convention reserves for the callee; confirm.
G_M000_IG01:                ;; offset=0x0000
       ; save rbx, reserve 144 bytes of stack, and keep the incoming rcx in rbx across the call
       push     rbx
       sub      rsp, 144
       mov      rbx, rcx
 
G_M000_IG02:                ;; offset=0x000B
       ; copy 32 bytes from [rdx] to [rsp+0x40] and 32 bytes from [r8] to [rsp+0x20]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rsp+0x40], ymm0
       vmovups  ymm0, ymmword ptr [r8]
       vmovups  ymmword ptr [rsp+0x20], ymm0
       ; pass the addresses of the two copies in rdx/r8 and of a third slot in rcx
       lea      rdx, [rsp+0x40]
       lea      r8, [rsp+0x20]
       lea      rcx, [rsp+0x60]
       call     [System.Runtime.Intrinsics.Vector256:Shuffle(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte]]
       ; copy the 32 bytes at [rsp+0x60] (the slot whose address was passed in rcx) to [rbx],
       ; then return the original rcx value in rax
       vmovups  ymm0, ymmword ptr [rsp+0x60]
       vmovups  ymmword ptr [rbx], ymm0
       mov      rax, rbx
 
G_M000_IG03:                ;; offset=0x0042
       ; clear the upper halves of the ymm registers, then undo the prologue
       vzeroupper 
       add      rsp, 144
       pop      rbx
       ret      
 
; Total bytes of code 78

Regression?

No response

Known Workarounds

Use the hardware intrinsic classes directly (Ssse3.Shuffle / Avx2.Shuffle) instead of the cross-platform Vector128.Shuffle / Vector256.Shuffle methods.

Configuration

  • dotnet version 9.0.100
  • Ryzen 7 3700X
  • Windows 10 Pro 10.0.19045 Build 19045

Other information

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    area-CodeGen-coreclr — CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions