Open
Description
Description
For example,
// Example: bitwise NOT of a 512-bit vector written with the ~ operator.
[Benchmark(OperationsPerInvoke = 2048)]
public Vector512<ushort> Not(Vector512<ushort> value) => ~value;
The current codegen looks like this:
vpternlogd zmm0,zmm0,zmm0,0FFh
vpxord zmm0,zmm0,zmmword ptr [rcx]
It would be better to emit a single instruction instead (control byte 55h yields the bitwise NOT of the third operand), something like:
vpternlogd zmm0,zmm0,zmmword ptr [rcx],55h
Configuration
BenchmarkDotNet v0.13.7, Windows 11 (10.0.22621.1992/22H2/2022Update/SunValley2)
Intel Xeon w5-2455X, 1 CPU, 24 logical and 12 physical cores
.NET SDK 8.0.100-preview.6.23330.14
[Host] : .NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
DefaultJob : .NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
Regression?
No
Data
Benchmark code:
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
namespace Avx512Benchmarks
{
/// <summary>
/// Benchmarks that compare two ways of computing the bitwise NOT of a
/// <see cref="Vector512{T}"/> of <c>ushort</c>: the <c>~</c> operator (the "Standard" methods)
/// and <c>Avx512F.TernaryLogic</c> with control byte 0x55 (the "TernaryLogic55" methods).
/// Each approach has a "Latency" variant (one chain across three variables) and a
/// "Throughput" variant (eight variables updated independently).
/// </summary>
// Run the job in the host process's runtime and collect disassembly for the benchmarks.
[SimpleJob(runtimeMoniker: RuntimeMoniker.HostProcess)]
[DisassemblyDiagnoser(maxDepth: int.MaxValue)]
public class Avx512UnaryNegationBenchmarks
{
// Number of NOT operations each benchmark method performs per call. It is passed to
// [Benchmark(OperationsPerInvoke = ...)], and every loop below runs
// OperationsPerInvoke / 8 iterations of 8 operations each.
private const int OperationsPerInvoke = 8192;
/// <summary>
/// Applies <c>~</c> in a chain across three variables: within the loop body, each
/// statement's operand is the variable assigned by the statement before it.
/// </summary>
/// <returns>The three vectors as they stand after the loop.</returns>
[Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
public (Vector512<ushort>, Vector512<ushort>, Vector512<ushort>) UnaryNegationStandardLatency()
{
Vector512<ushort> zmm0 = Vector512<ushort>.Zero;
Vector512<ushort> zmm1 = Vector512<ushort>.Zero;
Vector512<ushort> zmm2 = Vector512<ushort>.Zero;
for (int i = 0; i < OperationsPerInvoke / 8; i++)
{
// NOTE(review): the body ends by assigning zmm1, but the next iteration starts from
// zmm2, which was last assigned two statements earlier. At source level only six of
// the eight statements per iteration lie on the loop-carried dependency chain, so the
// per-operation figure may understate latency - confirm whether this is intended.
zmm0 = ~zmm2;
zmm1 = ~zmm0;
zmm2 = ~zmm1;
zmm0 = ~zmm2;
zmm1 = ~zmm0;
zmm2 = ~zmm1;
zmm0 = ~zmm2;
zmm1 = ~zmm0;
}
return (zmm0, zmm1, zmm2);
}
/// <summary>
/// Applies <c>~</c> to eight variables, each of which depends only on its own previous
/// value, so the eight updates inside one iteration do not depend on one another.
/// </summary>
/// <returns>Four vectors, each the XOR of two of the eight accumulators.</returns>
[Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
public (Vector512<ushort>, Vector512<ushort>, Vector512<ushort>, Vector512<ushort>) UnaryNegationStandardThroughput()
{
Vector512<ushort> zmm0 = Vector512<ushort>.Zero;
Vector512<ushort> zmm1 = Vector512<ushort>.Zero;
Vector512<ushort> zmm2 = Vector512<ushort>.Zero;
Vector512<ushort> zmm3 = Vector512<ushort>.Zero;
Vector512<ushort> zmm4 = Vector512<ushort>.Zero;
Vector512<ushort> zmm5 = Vector512<ushort>.Zero;
Vector512<ushort> zmm6 = Vector512<ushort>.Zero;
Vector512<ushort> zmm7 = Vector512<ushort>.Zero;
for (int i = 0; i < OperationsPerInvoke / 8; i++)
{
zmm0 = ~zmm0;
zmm1 = ~zmm1;
zmm2 = ~zmm2;
zmm3 = ~zmm3;
zmm4 = ~zmm4;
zmm5 = ~zmm5;
zmm6 = ~zmm6;
zmm7 = ~zmm7;
}
// Fold the eight accumulators into four return values so that every accumulator
// feeds the result (presumably to keep all eight chains from being treated as dead
// code - TODO confirm).
return (zmm0 ^ zmm4, zmm1 ^ zmm5, zmm2 ^ zmm6, zmm3 ^ zmm7);
}
/// <summary>
/// Same three-variable chain as <see cref="UnaryNegationStandardLatency"/>, but each NOT
/// is written as <c>Avx512F.TernaryLogic</c> with control byte 0x55.
/// </summary>
/// <returns>The three vectors as they stand after the loop.</returns>
[Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
public (Vector512<ushort>, Vector512<ushort>, Vector512<ushort>) UnaryNegationTernaryLogic55Latency()
{
Vector512<ushort> zmm0 = Vector512<ushort>.Zero;
Vector512<ushort> zmm1 = Vector512<ushort>.Zero;
Vector512<ushort> zmm2 = Vector512<ushort>.Zero;
// 0x55 is the truth-table byte whose result is the bitwise NOT of the third operand;
// the first and second operands do not affect the result.
const int Control = 0x55;
for (int i = 0; i < OperationsPerInvoke / 8; i++)
{
// Each call passes the variable being assigned as the first argument and the
// previous statement's result as the second and third arguments.
// NOTE(review): as in UnaryNegationStandardLatency, the next iteration starts from
// zmm2 rather than from the last-assigned zmm1. The first argument also feeds the
// variable's old value into the call even though control 0x55 ignores it - verify
// that both effects are intended for a latency measurement.
zmm0 = Avx512F.TernaryLogic(zmm0, zmm2, zmm2, Control);
zmm1 = Avx512F.TernaryLogic(zmm1, zmm0, zmm0, Control);
zmm2 = Avx512F.TernaryLogic(zmm2, zmm1, zmm1, Control);
zmm0 = Avx512F.TernaryLogic(zmm0, zmm2, zmm2, Control);
zmm1 = Avx512F.TernaryLogic(zmm1, zmm0, zmm0, Control);
zmm2 = Avx512F.TernaryLogic(zmm2, zmm1, zmm1, Control);
zmm0 = Avx512F.TernaryLogic(zmm0, zmm2, zmm2, Control);
zmm1 = Avx512F.TernaryLogic(zmm1, zmm0, zmm0, Control);
}
return (zmm0, zmm1, zmm2);
}
/// <summary>
/// Same eight independent accumulators as <see cref="UnaryNegationStandardThroughput"/>,
/// but each NOT is written as <c>Avx512F.TernaryLogic</c> with control byte 0x55 and the
/// same variable passed for all three operands.
/// </summary>
/// <returns>Four vectors, each the XOR of two of the eight accumulators.</returns>
[Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
public (Vector512<ushort>, Vector512<ushort>, Vector512<ushort>, Vector512<ushort>) UnaryNegationTernaryLogic55Throughput()
{
Vector512<ushort> zmm0 = Vector512<ushort>.Zero;
Vector512<ushort> zmm1 = Vector512<ushort>.Zero;
Vector512<ushort> zmm2 = Vector512<ushort>.Zero;
Vector512<ushort> zmm3 = Vector512<ushort>.Zero;
Vector512<ushort> zmm4 = Vector512<ushort>.Zero;
Vector512<ushort> zmm5 = Vector512<ushort>.Zero;
Vector512<ushort> zmm6 = Vector512<ushort>.Zero;
Vector512<ushort> zmm7 = Vector512<ushort>.Zero;
// 0x55 is the truth-table byte whose result is the bitwise NOT of the third operand.
const int Control = 0x55;
for (int i = 0; i < OperationsPerInvoke / 8; i++)
{
zmm0 = Avx512F.TernaryLogic(zmm0, zmm0, zmm0, Control);
zmm1 = Avx512F.TernaryLogic(zmm1, zmm1, zmm1, Control);
zmm2 = Avx512F.TernaryLogic(zmm2, zmm2, zmm2, Control);
zmm3 = Avx512F.TernaryLogic(zmm3, zmm3, zmm3, Control);
zmm4 = Avx512F.TernaryLogic(zmm4, zmm4, zmm4, Control);
zmm5 = Avx512F.TernaryLogic(zmm5, zmm5, zmm5, Control);
zmm6 = Avx512F.TernaryLogic(zmm6, zmm6, zmm6, Control);
zmm7 = Avx512F.TernaryLogic(zmm7, zmm7, zmm7, Control);
}
// Fold the eight accumulators into four return values, mirroring the Standard
// throughput benchmark.
return (zmm0 ^ zmm4, zmm1 ^ zmm5, zmm2 ^ zmm6, zmm3 ^ zmm7);
}
}
}
Result:
BenchmarkDotNet v0.13.7, Windows 11 (10.0.22621.1992/22H2/2022Update/SunValley2)
Intel Xeon w5-2455X, 1 CPU, 24 logical and 12 physical cores
.NET SDK 8.0.100-preview.6.23330.14
[Host] : .NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
DefaultJob : .NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
Method | Mean | Error | StdDev | Code Size |
---|---|---|---|---|
UnaryNegationStandardLatency | 0.3068 ns | 0.0043 ns | 0.0040 ns | 149 B |
UnaryNegationStandardThroughput | 0.2997 ns | 0.0025 ns | 0.0020 ns | 250 B |
UnaryNegationTernaryLogic55Latency | 0.2299 ns | 0.0017 ns | 0.0016 ns | 101 B |
UnaryNegationTernaryLogic55Throughput | 0.1316 ns | 0.0013 ns | 0.0012 ns | 190 B |
Codegen:
.NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
; Avx512Benchmarks.Avx512UnaryNegationBenchmarks.UnaryNegationStandardLatency()
vzeroupper
vxorps ymm0,ymm0,ymm0
xor eax,eax
M00_L00:
vpternlogd zmm1,zmm1,zmm1,0FF
vpxord zmm0,zmm0,zmm1
vpternlogd zmm1,zmm1,zmm1,0FF
vpxord zmm0,zmm0,zmm1
vpternlogd zmm1,zmm1,zmm1,0FF
vpxord zmm0,zmm0,zmm1
vpternlogd zmm1,zmm1,zmm1,0FF
vpxord zmm0,zmm0,zmm1
vpternlogd zmm1,zmm1,zmm1,0FF
vpxord zmm0,zmm0,zmm1
vpternlogd zmm1,zmm1,zmm1,0FF
vpxord zmm0,zmm0,zmm1
vpternlogd zmm1,zmm1,zmm1,0FF
vpxord zmm1,zmm0,zmm1
vpternlogd zmm2,zmm2,zmm2,0FF
vpxord zmm2,zmm1,zmm2
inc eax
cmp eax,400
jl short M00_L00
vmovups [rdx],zmm1
vmovups [rdx+40],zmm2
vmovups [rdx+80],zmm0
mov rax,rdx
vzeroupper
ret
; Total bytes of code 149
.NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
; Avx512Benchmarks.Avx512UnaryNegationBenchmarks.UnaryNegationStandardThroughput()
sub rsp,38
vzeroupper
vmovaps [rsp+20],xmm6
vmovaps [rsp+10],xmm7
vmovaps [rsp],xmm8
vxorps ymm0,ymm0,ymm0
vxorps ymm1,ymm1,ymm1
vxorps ymm2,ymm2,ymm2
vxorps ymm3,ymm3,ymm3
vxorps ymm4,ymm4,ymm4
vxorps ymm5,ymm5,ymm5
vxorps ymm6,ymm6,ymm6
vxorps ymm7,ymm7,ymm7
xor eax,eax
M00_L00:
vpternlogd zmm8,zmm8,zmm8,0FF
vpxord zmm0,zmm0,zmm8
vpternlogd zmm8,zmm8,zmm8,0FF
vpxord zmm1,zmm1,zmm8
vpternlogd zmm8,zmm8,zmm8,0FF
vpxord zmm2,zmm2,zmm8
vpternlogd zmm8,zmm8,zmm8,0FF
vpxord zmm3,zmm3,zmm8
vpternlogd zmm8,zmm8,zmm8,0FF
vpxord zmm4,zmm4,zmm8
vpternlogd zmm8,zmm8,zmm8,0FF
vpxord zmm5,zmm5,zmm8
vpternlogd zmm8,zmm8,zmm8,0FF
vpxord zmm6,zmm6,zmm8
vpternlogd zmm8,zmm8,zmm8,0FF
vpxord zmm7,zmm7,zmm8
inc eax
cmp eax,400
jl short M00_L00
vpxord zmm0,zmm0,zmm4
vpxord zmm1,zmm1,zmm5
vpxord zmm2,zmm2,zmm6
vpxord zmm3,zmm3,zmm7
vmovups [rdx],zmm0
vmovups [rdx+40],zmm1
vmovups [rdx+80],zmm2
vmovups [rdx+0C0],zmm3
mov rax,rdx
vmovaps xmm6,[rsp+20]
vmovaps xmm7,[rsp+10]
vmovaps xmm8,[rsp]
vzeroupper
add rsp,38
ret
; Total bytes of code 250
.NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
; Avx512Benchmarks.Avx512UnaryNegationBenchmarks.UnaryNegationTernaryLogic55Latency()
vzeroupper
vxorps ymm0,ymm0,ymm0
xor eax,eax
M00_L00:
vpternlogd zmm1,zmm1,zmm0,55
vpternlogd zmm2,zmm2,zmm1,55
vpternlogd zmm0,zmm0,zmm2,55
vpternlogd zmm1,zmm1,zmm0,55
vpternlogd zmm2,zmm2,zmm1,55
vpternlogd zmm0,zmm0,zmm2,55
vpternlogd zmm1,zmm1,zmm0,55
vpternlogd zmm2,zmm2,zmm1,55
inc eax
cmp eax,400
jl short M00_L00
vmovups [rdx],zmm1
vmovups [rdx+40],zmm2
vmovups [rdx+80],zmm0
mov rax,rdx
vzeroupper
ret
; Total bytes of code 101
.NET 8.0.0 (8.0.23.32907), X64 RyuJIT AVX2
; Avx512Benchmarks.Avx512UnaryNegationBenchmarks.UnaryNegationTernaryLogic55Throughput()
sub rsp,28
vzeroupper
vmovaps [rsp+10],xmm6
vmovaps [rsp],xmm7
vxorps ymm0,ymm0,ymm0
vxorps ymm1,ymm1,ymm1
vxorps ymm2,ymm2,ymm2
vxorps ymm3,ymm3,ymm3
vxorps ymm4,ymm4,ymm4
vxorps ymm5,ymm5,ymm5
vxorps ymm6,ymm6,ymm6
vxorps ymm7,ymm7,ymm7
xor eax,eax
M00_L00:
vpternlogd zmm0,zmm0,zmm0,55
vpternlogd zmm1,zmm1,zmm1,55
vpternlogd zmm2,zmm2,zmm2,55
vpternlogd zmm3,zmm3,zmm3,55
vpternlogd zmm4,zmm4,zmm4,55
vpternlogd zmm5,zmm5,zmm5,55
vpternlogd zmm6,zmm6,zmm6,55
vpternlogd zmm7,zmm7,zmm7,55
inc eax
cmp eax,400
jl short M00_L00
vpxord zmm0,zmm0,zmm4
vpxord zmm1,zmm1,zmm5
vpxord zmm2,zmm2,zmm6
vpxord zmm3,zmm3,zmm7
vmovups [rdx],zmm0
vmovups [rdx+40],zmm1
vmovups [rdx+80],zmm2
vmovups [rdx+0C0],zmm3
mov rax,rdx
vmovaps xmm6,[rsp+10]
vmovaps xmm7,[rsp]
vzeroupper
add rsp,28
ret
; Total bytes of code 190