[x86-64 BMI1] AND-NOT's should be preferred to OR-NOT's because we have andn/vpandn instructions #108731


Closed
Validark opened this issue Sep 15, 2024 · 3 comments · Fixed by #109215


@Validark

Zig code:

export fn foo(w: u64, x: u64, y: u64, z: u64) u64 {
    var s = w;
    s &= ~(x & y);
    s &= ~(z & ~y);
    return s;
}

export fn fooVec(w: @Vector(16, u8), x: @Vector(16, u8), y: @Vector(16, u8), z: @Vector(16, u8)) @Vector(16, u8) {
    var s = w;
    s &= ~(x & y);
    s &= ~(z & ~y);
    return s;
}

Compiled for Znver3:

foo:
        not     rcx
        and     rsi, rdx
        andn    rax, rsi, rdi
        or      rcx, rdx
        and     rax, rcx
        ret

fooVec:
        vpand   xmm1, xmm2, xmm1
        vpcmpeqd        xmm4, xmm4, xmm4
        vpandn  xmm0, xmm1, xmm0
        vpxor   xmm1, xmm3, xmm4
        vpor    xmm1, xmm1, xmm2
        vpand   xmm0, xmm0, xmm1
        ret

I expected and+andn+andn+andn for the scalar version and vpand+vpandn+vpandn+vpandn for the vector version. However, LLVM turns s &= ~(z & ~y); into s &= ~z | y; (De Morgan's law). That makes sense on machines without andn (and doesn't matter on machines that have both andn and orn, or just vpternlogq), but with x86-64 BMI1 we only get andn. We never got an orn, so this transform costs us instructions instead of saving any.
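
For what it's worth, the two forms are the same Boolean function; here is a minimal Zig sketch (not from the issue, names invented for illustration) spot-checking the De Morgan identity the optimizer applies:

```
const std = @import("std");

// Sketch only: check that ~(z & ~y) == ~z | y for a few 64-bit patterns.
test "De Morgan: ~(z & ~y) == ~z | y" {
    const samples = [_]u64{
        0,
        0xFFFF_FFFF_FFFF_FFFF,
        0xAAAA_AAAA_AAAA_AAAA,
        0x0123_4567_89AB_CDEF,
        0xDEAD_BEEF_DEAD_BEEF,
    };
    for (samples) |z| {
        for (samples) |y| {
            // Original form: z & ~y is one andn, and masking s with its
            // complement is a second andn.
            const original = ~(z & ~y);
            // Rewritten form: without an orn instruction this needs a
            // separate not plus an or.
            const rewritten = ~z | y;
            try std.testing.expectEqual(original, rewritten);
        }
    }
}
```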

LLVM (optimized):

define dso_local i64 @foo(i64 %0, i64 %1, i64 %2, i64 %3) local_unnamed_addr {
Entry:
  %4 = and i64 %2, %1
  %5 = xor i64 %4, -1
  %6 = and i64 %5, %0
  %.not = xor i64 %3, -1
  %7 = or i64 %.not, %2
  %8 = and i64 %6, %7
  ret i64 %8
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

define dso_local <16 x i8> @fooVec(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3) local_unnamed_addr {
Entry:
  %4 = and <16 x i8> %2, %1
  %5 = xor <16 x i8> %4, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %6 = and <16 x i8> %5, %0
  %.not = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %7 = or <16 x i8> %.not, %2
  %8 = and <16 x i8> %6, %7
  ret <16 x i8> %8
}

LLVM Godbolt link

@llvmbot
Member

llvmbot commented Sep 15, 2024

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)


@RKSimon
Collaborator

RKSimon commented Sep 15, 2024

I've tagged this as x86, but hopefully we can handle this generically.

@RKSimon RKSimon self-assigned this Sep 18, 2024
@Saldivarcher
Contributor

@RKSimon I just noticed you self-assigned this, but I took a stab at it last night. The PR is #109215.

@RKSimon RKSimon assigned Saldivarcher and unassigned RKSimon Sep 19, 2024
RKSimon pushed a commit that referenced this issue Oct 12, 2024
…9215)

When `andn` is available, we should avoid switching `s &= ~(z & ~y);` into `s &= ~z | y;`.

This patch turns this assembly from:
```
foo:
        not     rcx
        and     rsi, rdx
        andn    rax, rsi, rdi
        or      rcx, rdx
        and     rax, rcx
        ret
```
into:
```
foo:
        and     rsi, rdx
        andn    rcx, rdx, rcx
        andn    rax, rsi, rdi
        andn    rax, rcx, rax
        ret
```
Fixes #108731
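
For reference, here is a rough Zig model (not part of the patch) of how the improved andn sequence composes, assuming the System V argument order (w in rdi, x in rsi, y in rdx, z in rcx) and that `andn dst, a, b` computes `~a & b`:

```
// Sketch only: mirrors the four instructions in the improved sequence above.
fn fooLowered(w: u64, x: u64, y: u64, z: u64) u64 {
    const xy = x & y; //       and  rsi, rdx
    const z_an_y = ~y & z; //  andn rcx, rdx, rcx   -> z & ~y
    const t = ~xy & w; //      andn rax, rsi, rdi   -> w & ~(x & y)
    return ~z_an_y & t; //     andn rax, rcx, rax
}
```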
@EugeneZelenko EugeneZelenko removed the llvm:SelectionDAG label Oct 12, 2024
DanielCChen pushed a commit to DanielCChen/llvm-project that referenced this issue Oct 16, 2024