[x86-64 BMI1] AND-NOT's should be preferred to OR-NOT's because we have andn/vpandn instructions #108731


Closed
Validark opened this issue Sep 15, 2024 · 3 comments · Fixed by #109215


@Validark

Zig code:

export fn foo(w: u64, x: u64, y: u64, z: u64) u64 {
    var s = w;
    s &= ~(x & y);
    s &= ~(z & ~y);
    return s;
}

export fn fooVec(w: @Vector(16, u8), x: @Vector(16, u8), y: @Vector(16, u8), z: @Vector(16, u8)) @Vector(16, u8) {
    var s = w;
    s &= ~(x & y);
    s &= ~(z & ~y);
    return s;
}

Compiled for Znver3:

foo:
        not     rcx
        and     rsi, rdx
        andn    rax, rsi, rdi
        or      rcx, rdx
        and     rax, rcx
        ret

fooVec:
        vpand   xmm1, xmm2, xmm1
        vpcmpeqd        xmm4, xmm4, xmm4
        vpandn  xmm0, xmm1, xmm0
        vpxor   xmm1, xmm3, xmm4
        vpor    xmm1, xmm1, xmm2
        vpand   xmm0, xmm0, xmm1
        ret

I expected and+andn+andn+andn for the scalar version and vpand+vpandn+vpandn+vpandn for the vector version. However, LLVM turns s &= ~(z & ~y); into s &= ~z | y; (De Morgan's law). That makes sense on machines without andn (and doesn't matter on machines that have both andn and orn, or just vpternlogq), but with x86-64 BMI1 we only get andn. We never got an orn, so this transform costs us instructions instead of saving any.
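
For what it's worth, the two forms are the same Boolean function; here is a minimal Zig sketch (not from the issue, names invented for illustration) spot-checking the De Morgan identity the optimizer applies:

```
const std = @import("std");

// Sketch only: check that ~(z & ~y) == ~z | y for a few 64-bit patterns.
test "De Morgan: ~(z & ~y) == ~z | y" {
    const samples = [_]u64{
        0,
        0xFFFF_FFFF_FFFF_FFFF,
        0xAAAA_AAAA_AAAA_AAAA,
        0x0123_4567_89AB_CDEF,
        0xDEAD_BEEF_DEAD_BEEF,
    };
    for (samples) |z| {
        for (samples) |y| {
            // Original form: z & ~y is one andn, and masking s with its
            // complement is a second andn.
            const original = ~(z & ~y);
            // Rewritten form: without an orn instruction this needs a
            // separate not plus an or.
            const rewritten = ~z | y;
            try std.testing.expectEqual(original, rewritten);
        }
    }
}
```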

LLVM (optimized):

define dso_local i64 @foo(i64 %0, i64 %1, i64 %2, i64 %3) local_unnamed_addr {
Entry:
  %4 = and i64 %2, %1
  %5 = xor i64 %4, -1
  %6 = and i64 %5, %0
  %.not = xor i64 %3, -1
  %7 = or i64 %.not, %2
  %8 = and i64 %6, %7
  ret i64 %8
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

define dso_local <16 x i8> @fooVec(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3) local_unnamed_addr {
Entry:
  %4 = and <16 x i8> %2, %1
  %5 = xor <16 x i8> %4, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %6 = and <16 x i8> %5, %0
  %.not = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %7 = or <16 x i8> %.not, %2
  %8 = and <16 x i8> %6, %7
  ret <16 x i8> %8
}

LLVM Godbolt link

@llvmbot
Member

llvmbot commented Sep 15, 2024

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)


@RKSimon
Collaborator

RKSimon commented Sep 15, 2024

I've tagged this as x86, but hopefully we can handle this generically.

@RKSimon RKSimon self-assigned this Sep 18, 2024
@Saldivarcher
Contributor

@RKSimon I just noticed you self-assigned this, but I took a stab at it last night. The PR is #109215.

@RKSimon RKSimon assigned Saldivarcher and unassigned RKSimon Sep 19, 2024
RKSimon pushed a commit that referenced this issue Oct 12, 2024
…9215)

When `andn` is available, we should avoid switching `s &= ~(z & ~y);` into `s &= ~z | y;`.

This patch turns this assembly from:
```
foo:
        not     rcx
        and     rsi, rdx
        andn    rax, rsi, rdi
        or      rcx, rdx
        and     rax, rcx
        ret
```
into:
```
foo:
        and     rsi, rdx
        andn    rcx, rdx, rcx
        andn    rax, rsi, rdi
        andn    rax, rcx, rax
        ret
```
Fixes #108731
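
For reference, here is a rough Zig model (not part of the patch) of how the improved andn sequence composes, assuming the System V argument order (w in rdi, x in rsi, y in rdx, z in rcx) and that `andn dst, a, b` computes `~a & b`:

```
// Sketch only: mirrors the four instructions in the improved sequence above.
fn fooLowered(w: u64, x: u64, y: u64, z: u64) u64 {
    const xy = x & y; //       and  rsi, rdx
    const z_an_y = ~y & z; //  andn rcx, rdx, rcx   -> z & ~y
    const t = ~xy & w; //      andn rax, rsi, rdi   -> w & ~(x & y)
    return ~z_an_y & t; //     andn rax, rcx, rax
}
```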
@EugeneZelenko EugeneZelenko removed the llvm:SelectionDAG label Oct 12, 2024
DanielCChen pushed a commit to DanielCChen/llvm-project that referenced this issue Oct 16, 2024