Skip to content

Optimize ~VectorXXX<T> with vpternlogd #90075

Open
@MineCake147E

Description

@MineCake147E

Description

For example,

// Minimal illustration for the issue: returns the bitwise complement of a 512-bit vector
// using the `~` operator, whose generated code is discussed below.
// NOTE(review): the body performs a single `~`, yet OperationsPerInvoke is 2048 —
// presumably abbreviated from a larger benchmark; confirm before reusing as-is.
[Benchmark(OperationsPerInvoke = 2048)]
public Vector512<ushort> Not(Vector512<ushort> value) => ~value;

The current codegen looks like this:

vpternlogd  zmm0,zmm0,zmm0,0FFh  
vpxord      zmm0,zmm0,zmmword ptr [rcx]

It would be better to emit something like this instead:

vpternlogd zmm0,zmm0,zmmword ptr [rcx],55h

Configuration

BenchmarkDotNet v0.13.7, Windows 11 (10.0.22621.1992/22H2/2022Update/SunValley2)
Intel Xeon w5-2455X, 1 CPU, 24 logical and 12 physical cores
.NET SDK 8.0.100-preview.6.23330.14
  [Host]     : .NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
  DefaultJob : .NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2

Regression?

No

Data

Benchmark code:

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;

namespace Avx512Benchmarks
{
    /// <summary>
    /// BenchmarkDotNet micro-benchmarks comparing two ways of negating a
    /// <c>Vector512&lt;ushort&gt;</c>: the <c>~</c> operator ("Standard") and an explicit
    /// <c>Avx512F.TernaryLogic</c> call with control byte <c>0x55</c> ("TernaryLogic55").
    /// Each approach has a "Latency" variant (negations feed one another) and a
    /// "Throughput" variant (eight independent accumulators).
    /// </summary>
    /// <remarks>
    /// Every loop runs <c>OperationsPerInvoke / 8</c> iterations with eight negations per
    /// iteration, i.e. <c>OperationsPerInvoke</c> negations per invocation.
    /// NOTE(review): no <c>Avx512F.IsSupported</c> guard is visible here; presumably the
    /// benchmarks are only meant to be run on AVX-512 capable hardware — confirm.
    /// </remarks>
    [SimpleJob(runtimeMoniker: RuntimeMoniker.HostProcess)]
    [DisassemblyDiagnoser(maxDepth: int.MaxValue)]
    public class Avx512UnaryNegationBenchmarks
    {
        // Number of negations executed per benchmark invocation. It is also passed to
        // [Benchmark(OperationsPerInvoke = ...)] on every method below.
        private const int OperationsPerInvoke = 8192;

        /// <summary>
        /// Negates with <c>~</c>, where each statement consumes the value written by the
        /// statement immediately before it (zmm2 -> zmm0 -> zmm1 -> zmm2 -> ...).
        /// </summary>
        /// <returns>The three working vectors, returned so their values are observed.</returns>
        /// <remarks>
        /// NOTE(review): eight statements rotate over three variables, so the rotation does
        /// not close across iterations: the first statement of the next iteration reads
        /// <c>zmm2</c>, last written by the sixth statement, and the results of the seventh
        /// and eighth statements are overwritten before being read (except on the final
        /// iteration). The loop-carried chain therefore appears to be six negations per
        /// iteration rather than eight — confirm whether the reported per-operation latency
        /// should be adjusted.
        /// </remarks>
        [Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
        public (Vector512<ushort>, Vector512<ushort>, Vector512<ushort>) UnaryNegationStandardLatency()
        {
            Vector512<ushort> zmm0 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm1 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm2 = Vector512<ushort>.Zero;
            for (int i = 0; i < OperationsPerInvoke / 8; i++)
            {
                zmm0 = ~zmm2;
                zmm1 = ~zmm0;
                zmm2 = ~zmm1;
                zmm0 = ~zmm2;
                zmm1 = ~zmm0;
                zmm2 = ~zmm1;
                zmm0 = ~zmm2;
                zmm1 = ~zmm0;
            }
            return (zmm0, zmm1, zmm2);
        }

        /// <summary>
        /// Negates eight independent vectors with <c>~</c> once per iteration; no statement
        /// in the loop body reads a value written by another statement in the same iteration.
        /// </summary>
        /// <returns>
        /// Four vectors, each the XOR of two of the eight accumulators, so that all eight
        /// contribute to the returned value.
        /// </returns>
        [Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
        public (Vector512<ushort>, Vector512<ushort>, Vector512<ushort>, Vector512<ushort>) UnaryNegationStandardThroughput()
        {
            Vector512<ushort> zmm0 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm1 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm2 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm3 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm4 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm5 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm6 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm7 = Vector512<ushort>.Zero;
            for (int i = 0; i < OperationsPerInvoke / 8; i++)
            {
                zmm0 = ~zmm0;
                zmm1 = ~zmm1;
                zmm2 = ~zmm2;
                zmm3 = ~zmm3;
                zmm4 = ~zmm4;
                zmm5 = ~zmm5;
                zmm6 = ~zmm6;
                zmm7 = ~zmm7;
            }
            // Fold the eight accumulators into four so every one of them reaches the result.
            return (zmm0 ^ zmm4, zmm1 ^ zmm5, zmm2 ^ zmm6, zmm3 ^ zmm7);
        }

        /// <summary>
        /// Same dependency pattern as <see cref="UnaryNegationStandardLatency"/>, but each
        /// negation is written as <c>Avx512F.TernaryLogic(dst, src, src, 0x55)</c>.
        /// </summary>
        /// <returns>The three working vectors, returned so their values are observed.</returns>
        /// <remarks>
        /// The destination variable's previous value is also passed as the first argument,
        /// so each call reads two variables: the one it overwrites and the one it negates.
        /// NOTE(review): as in the Standard latency benchmark, the rotation does not close
        /// across iterations — the first statement of the next iteration reads <c>zmm2</c>
        /// (written by the sixth statement) and <c>zmm0</c> (written by the seventh), not the
        /// result of the eighth. Confirm whether all eight operations should be counted as
        /// serially dependent.
        /// </remarks>
        [Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
        public (Vector512<ushort>, Vector512<ushort>, Vector512<ushort>) UnaryNegationTernaryLogic55Latency()
        {
            Vector512<ushort> zmm0 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm1 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm2 = Vector512<ushort>.Zero;
            // Control byte proposed by the issue as a replacement for `~`; in the
            // ternary-logic truth-table encoding 0x55 should select the complement of the
            // third operand — TODO confirm against the instruction reference.
            const int Control = 0x55;
            for (int i = 0; i < OperationsPerInvoke / 8; i++)
            {
                zmm0 = Avx512F.TernaryLogic(zmm0, zmm2, zmm2, Control);
                zmm1 = Avx512F.TernaryLogic(zmm1, zmm0, zmm0, Control);
                zmm2 = Avx512F.TernaryLogic(zmm2, zmm1, zmm1, Control);
                zmm0 = Avx512F.TernaryLogic(zmm0, zmm2, zmm2, Control);
                zmm1 = Avx512F.TernaryLogic(zmm1, zmm0, zmm0, Control);
                zmm2 = Avx512F.TernaryLogic(zmm2, zmm1, zmm1, Control);
                zmm0 = Avx512F.TernaryLogic(zmm0, zmm2, zmm2, Control);
                zmm1 = Avx512F.TernaryLogic(zmm1, zmm0, zmm0, Control);
            }
            return (zmm0, zmm1, zmm2);
        }

        /// <summary>
        /// Same independent-accumulator pattern as
        /// <see cref="UnaryNegationStandardThroughput"/>, but each negation is written as
        /// <c>Avx512F.TernaryLogic(x, x, x, 0x55)</c> with the same vector in all three
        /// operand positions.
        /// </summary>
        /// <returns>
        /// Four vectors, each the XOR of two of the eight accumulators, so that all eight
        /// contribute to the returned value.
        /// </returns>
        [Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
        public (Vector512<ushort>, Vector512<ushort>, Vector512<ushort>, Vector512<ushort>) UnaryNegationTernaryLogic55Throughput()
        {
            Vector512<ushort> zmm0 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm1 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm2 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm3 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm4 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm5 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm6 = Vector512<ushort>.Zero;
            Vector512<ushort> zmm7 = Vector512<ushort>.Zero;
            // Same control byte as in the latency variant (the issue's proposed NOT encoding).
            const int Control = 0x55;
            for (int i = 0; i < OperationsPerInvoke / 8; i++)
            {
                zmm0 = Avx512F.TernaryLogic(zmm0, zmm0, zmm0, Control);
                zmm1 = Avx512F.TernaryLogic(zmm1, zmm1, zmm1, Control);
                zmm2 = Avx512F.TernaryLogic(zmm2, zmm2, zmm2, Control);
                zmm3 = Avx512F.TernaryLogic(zmm3, zmm3, zmm3, Control);
                zmm4 = Avx512F.TernaryLogic(zmm4, zmm4, zmm4, Control);
                zmm5 = Avx512F.TernaryLogic(zmm5, zmm5, zmm5, Control);
                zmm6 = Avx512F.TernaryLogic(zmm6, zmm6, zmm6, Control);
                zmm7 = Avx512F.TernaryLogic(zmm7, zmm7, zmm7, Control);
            }
            // Fold the eight accumulators into four so every one of them reaches the result.
            return (zmm0 ^ zmm4, zmm1 ^ zmm5, zmm2 ^ zmm6, zmm3 ^ zmm7);
        }
    }
}

Result:


BenchmarkDotNet v0.13.7, Windows 11 (10.0.22621.1992/22H2/2022Update/SunValley2)
Intel Xeon w5-2455X, 1 CPU, 24 logical and 12 physical cores
.NET SDK 8.0.100-preview.6.23330.14
  [Host]     : .NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
  DefaultJob : .NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2


| Method | Mean | Error | StdDev | Code Size |
|---|---:|---:|---:|---:|
| UnaryNegationStandardLatency | 0.3068 ns | 0.0043 ns | 0.0040 ns | 149 B |
| UnaryNegationStandardThroughput | 0.2997 ns | 0.0025 ns | 0.0020 ns | 250 B |
| UnaryNegationTernaryLogic55Latency | 0.2299 ns | 0.0017 ns | 0.0016 ns | 101 B |
| UnaryNegationTernaryLogic55Throughput | 0.1316 ns | 0.0013 ns | 0.0012 ns | 190 B |

Codegen:

.NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2

; Avx512Benchmarks.Avx512UnaryNegationBenchmarks.UnaryNegationStandardLatency()
       vzeroupper
       vxorps    ymm0,ymm0,ymm0
       xor       eax,eax
M00_L00:
       vpternlogd zmm1,zmm1,zmm1,0FF
       vpxord    zmm0,zmm0,zmm1
       vpternlogd zmm1,zmm1,zmm1,0FF
       vpxord    zmm0,zmm0,zmm1
       vpternlogd zmm1,zmm1,zmm1,0FF
       vpxord    zmm0,zmm0,zmm1
       vpternlogd zmm1,zmm1,zmm1,0FF
       vpxord    zmm0,zmm0,zmm1
       vpternlogd zmm1,zmm1,zmm1,0FF
       vpxord    zmm0,zmm0,zmm1
       vpternlogd zmm1,zmm1,zmm1,0FF
       vpxord    zmm0,zmm0,zmm1
       vpternlogd zmm1,zmm1,zmm1,0FF
       vpxord    zmm1,zmm0,zmm1
       vpternlogd zmm2,zmm2,zmm2,0FF
       vpxord    zmm2,zmm1,zmm2
       inc       eax
       cmp       eax,400
       jl        short M00_L00
       vmovups   [rdx],zmm1
       vmovups   [rdx+40],zmm2
       vmovups   [rdx+80],zmm0
       mov       rax,rdx
       vzeroupper
       ret
; Total bytes of code 149

.NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2

; Avx512Benchmarks.Avx512UnaryNegationBenchmarks.UnaryNegationStandardThroughput()
       sub       rsp,38
       vzeroupper
       vmovaps   [rsp+20],xmm6
       vmovaps   [rsp+10],xmm7
       vmovaps   [rsp],xmm8
       vxorps    ymm0,ymm0,ymm0
       vxorps    ymm1,ymm1,ymm1
       vxorps    ymm2,ymm2,ymm2
       vxorps    ymm3,ymm3,ymm3
       vxorps    ymm4,ymm4,ymm4
       vxorps    ymm5,ymm5,ymm5
       vxorps    ymm6,ymm6,ymm6
       vxorps    ymm7,ymm7,ymm7
       xor       eax,eax
M00_L00:
       vpternlogd zmm8,zmm8,zmm8,0FF
       vpxord    zmm0,zmm0,zmm8
       vpternlogd zmm8,zmm8,zmm8,0FF
       vpxord    zmm1,zmm1,zmm8
       vpternlogd zmm8,zmm8,zmm8,0FF
       vpxord    zmm2,zmm2,zmm8
       vpternlogd zmm8,zmm8,zmm8,0FF
       vpxord    zmm3,zmm3,zmm8
       vpternlogd zmm8,zmm8,zmm8,0FF
       vpxord    zmm4,zmm4,zmm8
       vpternlogd zmm8,zmm8,zmm8,0FF
       vpxord    zmm5,zmm5,zmm8
       vpternlogd zmm8,zmm8,zmm8,0FF
       vpxord    zmm6,zmm6,zmm8
       vpternlogd zmm8,zmm8,zmm8,0FF
       vpxord    zmm7,zmm7,zmm8
       inc       eax
       cmp       eax,400
       jl        short M00_L00
       vpxord    zmm0,zmm0,zmm4
       vpxord    zmm1,zmm1,zmm5
       vpxord    zmm2,zmm2,zmm6
       vpxord    zmm3,zmm3,zmm7
       vmovups   [rdx],zmm0
       vmovups   [rdx+40],zmm1
       vmovups   [rdx+80],zmm2
       vmovups   [rdx+0C0],zmm3
       mov       rax,rdx
       vmovaps   xmm6,[rsp+20]
       vmovaps   xmm7,[rsp+10]
       vmovaps   xmm8,[rsp]
       vzeroupper
       add       rsp,38
       ret
; Total bytes of code 250

.NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2

; Avx512Benchmarks.Avx512UnaryNegationBenchmarks.UnaryNegationTernaryLogic55Latency()
       vzeroupper
       vxorps    ymm0,ymm0,ymm0
       xor       eax,eax
M00_L00:
       vpternlogd zmm1,zmm1,zmm0,55
       vpternlogd zmm2,zmm2,zmm1,55
       vpternlogd zmm0,zmm0,zmm2,55
       vpternlogd zmm1,zmm1,zmm0,55
       vpternlogd zmm2,zmm2,zmm1,55
       vpternlogd zmm0,zmm0,zmm2,55
       vpternlogd zmm1,zmm1,zmm0,55
       vpternlogd zmm2,zmm2,zmm1,55
       inc       eax
       cmp       eax,400
       jl        short M00_L00
       vmovups   [rdx],zmm1
       vmovups   [rdx+40],zmm2
       vmovups   [rdx+80],zmm0
       mov       rax,rdx
       vzeroupper
       ret
; Total bytes of code 101

.NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2

; Avx512Benchmarks.Avx512UnaryNegationBenchmarks.UnaryNegationTernaryLogic55Throughput()
       sub       rsp,28
       vzeroupper
       vmovaps   [rsp+10],xmm6
       vmovaps   [rsp],xmm7
       vxorps    ymm0,ymm0,ymm0
       vxorps    ymm1,ymm1,ymm1
       vxorps    ymm2,ymm2,ymm2
       vxorps    ymm3,ymm3,ymm3
       vxorps    ymm4,ymm4,ymm4
       vxorps    ymm5,ymm5,ymm5
       vxorps    ymm6,ymm6,ymm6
       vxorps    ymm7,ymm7,ymm7
       xor       eax,eax
M00_L00:
       vpternlogd zmm0,zmm0,zmm0,55
       vpternlogd zmm1,zmm1,zmm1,55
       vpternlogd zmm2,zmm2,zmm2,55
       vpternlogd zmm3,zmm3,zmm3,55
       vpternlogd zmm4,zmm4,zmm4,55
       vpternlogd zmm5,zmm5,zmm5,55
       vpternlogd zmm6,zmm6,zmm6,55
       vpternlogd zmm7,zmm7,zmm7,55
       inc       eax
       cmp       eax,400
       jl        short M00_L00
       vpxord    zmm0,zmm0,zmm4
       vpxord    zmm1,zmm1,zmm5
       vpxord    zmm2,zmm2,zmm6
       vpxord    zmm3,zmm3,zmm7
       vmovups   [rdx],zmm0
       vmovups   [rdx+40],zmm1
       vmovups   [rdx+80],zmm2
       vmovups   [rdx+0C0],zmm3
       mov       rax,rdx
       vmovaps   xmm6,[rsp+10]
       vmovaps   xmm7,[rsp]
       vzeroupper
       add       rsp,28
       ret
; Total bytes of code 190

Analysis

Metadata

Metadata

Assignees

No one assigned

    Labels

    area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
    avx512: Related to the AVX-512 architecture
    tenet-performance: Performance related issue

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions