[DAG] Expand vXi1 add/sub overflow operations as xor/and #69191
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Similar to what we already do for add/sub + saturation variants.

Fixes #69080

Full diff: https://github.com/llvm/llvm-project/pull/69191.diff

7 Files Affected:
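For intuition: with 1-bit operands the truncated sum and the truncated difference both equal x ^ y, an unsigned add overflows exactly when both bits are set (x & y), and an unsigned sub overflows exactly when x is clear and y is set (~x & y); for signed i1 (values 0 and -1) the same bits flag overflow. A minimal standalone check of these identities, illustrative only and not part of the patch (the Alive2 link below is the formal proof):

// Exhaustive check of the i1 add/sub overflow identities used by this patch.
#include <cassert>

int main() {
  for (int x = 0; x <= 1; ++x) {
    for (int y = 0; y <= 1; ++y) {
      assert(((x + y) & 1) == (x ^ y));   // addo result: truncated sum is xor
      assert(((x + y) >> 1) == (x & y));  // uaddo overflow: carry out is and
      assert(((x - y) & 1) == (x ^ y));   // subo result: truncated diff is xor
      assert((x < y) == ((x ^ 1) & y));   // usubo overflow: borrow out is ~x & y
    }
  }
  return 0;
}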
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9a37627e36b9ffa..8f79fecc7ccca28 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9854,6 +9854,27 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue ZeroOverFlow = getConstant(0, DL, VTList.VTs[1]);
return getNode(ISD::MERGE_VALUES, DL, VTList, {N1, ZeroOverFlow}, Flags);
}
+
+ if (VTList.VTs[0].isVector() &&
+ VTList.VTs[0].getVectorElementType() == MVT::i1 &&
+ VTList.VTs[1].getVectorElementType() == MVT::i1) {
+ SDValue F1 = getFreeze(N1);
+ SDValue F2 = getFreeze(N2);
+ // {vXi1,vXi1} (u/s)addo(vXi1 x, vXi1 y) -> {xor(x,y), and(x,y)}
+ if (Opcode == ISD::UADDO || Opcode == ISD::SADDO)
+ return getNode(ISD::MERGE_VALUES, DL, VTList,
+ {getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
+ getNode(ISD::AND, DL, VTList.VTs[1], F1, F2)},
+ Flags);
+ // {vXi1,vXi1} (u/s)subo(vXi1 x, vXi1 y) -> {xor(x,y), and(~x,y)}
+ if (Opcode == ISD::USUBO || Opcode == ISD::SSUBO) {
+ SDValue NotF1 = getNOT(DL, F1, VTList.VTs[0]);
+ return getNode(ISD::MERGE_VALUES, DL, VTList,
+ {getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
+ getNode(ISD::AND, DL, VTList.VTs[1], NotF1, F2)},
+ Flags);
+ }
+ }
break;
}
case ISD::SMUL_LOHI:
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index 6ad880020cc664a..00609b0df9b4e15 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -245,21 +245,17 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.4h, #1
+; CHECK-NEXT: eor v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: adrp x8, .LCPI10_0
+; CHECK-NEXT: shl v2.4h, v2.4h, #15
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: cmlt v1.4h, v2.4h, #0
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT: shl v0.4s, v0.4s, #31
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: fmov d1, d0
-; CHECK-NEXT: shl v2.4h, v0.4h, #15
-; CHECK-NEXT: cmlt v2.4h, v2.4h, #0
-; CHECK-NEXT: bic v1.4h, #2
-; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
-; CHECK-NEXT: and v1.8b, v2.8b, v1.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: addv h1, v1.4h
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/X86/pr69080.ll b/llvm/test/CodeGen/X86/pr69080.ll
new file mode 100644
index 000000000000000..1b27adcb1ae7ce8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr69080.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX
+
+define { <4 x i1>, <4 x i1> } @uaddo(<4 x i1> %a) {
+; SSE-LABEL: uaddo:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uaddo:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %f = call { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
+ ret { <4 x i1>, <4 x i1> } %f
+}
+declare { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+
+define { <4 x i1>, <4 x i1> } @saddo(<4 x i1> %a) {
+; SSE-LABEL: saddo:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: saddo:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %f = call { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
+ ret { <4 x i1>, <4 x i1> } %f
+}
+declare { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 7631367ba5d667c..eae9b969211f66c 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -976,34 +976,24 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: saddo_v4i1:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $31, %xmm1
-; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pslld $31, %xmm1
-; SSE-NEXT: movmskps %xmm1, %eax
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: saddo_v4i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX-NEXT: vmovmskps %xmm2, %eax
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpslld $31, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
@@ -1011,11 +1001,10 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k2
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k2}
-; AVX512-NEXT: kxorw %k0, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: kshiftlw $12, %k2, %k0
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index d634457069c0daf..f8cf543cb9fab30 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -985,34 +985,24 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v4i1:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $31, %xmm1
-; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
-; SSE-NEXT: psubd %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pslld $31, %xmm1
-; SSE-NEXT: movmskps %xmm1, %eax
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: ssubo_v4i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX-NEXT: vmovmskps %xmm2, %eax
+; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpslld $31, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
@@ -1022,11 +1012,11 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k1
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kshiftlw $12, %k1, %k0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 653c3a996915114..950e943bd902013 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1075,49 +1075,26 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: uaddo_v4i1:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
-; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: movb %al, (%rdi)
-; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: uaddo_v4i1:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: movb %al, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uaddo_v4i1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: movb %al, (%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: uaddo_v4i1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX-NEXT: vmovmskps %xmm2, %eax
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: movb %al, (%rdi)
+; AVX-NEXT: retq
;
; AVX512-LABEL: uaddo_v4i1:
; AVX512: # %bb.0:
@@ -1125,11 +1102,11 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k1
-; AVX512-NEXT: kandnw %k0, %k1, %k2
+; AVX512-NEXT: kxorw %k1, %k0, %k2
+; AVX512-NEXT: kandw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kshiftlw $12, %k1, %k0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k2, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index a58c3dd0d530734..7de972770d8da4f 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1122,49 +1122,26 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: usubo_v4i1:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: psubd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
-; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: movb %al, (%rdi)
-; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: usubo_v4i1:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: movb %al, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: usubo_v4i1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: movb %al, (%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: usubo_v4i1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX-NEXT: vmovmskps %xmm2, %eax
+; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: movb %al, (%rdi)
+; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v4i1:
; AVX512: # %bb.0:
@@ -1172,11 +1149,11 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k1
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kshiftlw $12, %k1, %k0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
if (VTList.VTs[0].isVector() &&
    VTList.VTs[0].getVectorElementType() == MVT::i1 &&
    VTList.VTs[1].getVectorElementType() == MVT::i1) {
Why limit this to vectors only?
Mainly because all other add/sub/sat cases did
Would you prefer if I generalized this for scalars as well?
I think it would make sense to switch all of them to work on scalars as well, but that can be a separate change...
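Presumably that generalization would just key the guard off the scalar type; a hypothetical sketch, not part of this PR (EVT::getScalarType() returns the element type for vectors and the type itself for scalars, so it would accept i1 as well as vXi1):

// Hypothetical relaxed guard for a follow-up patch (not in this PR):
if (VTList.VTs[0].getScalarType() == MVT::i1 &&
    VTList.VTs[1].getScalarType() == MVT::i1) {
  // ... same xor/and expansion as above ...
}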
LGTM
|
||
if (VTList.VTs[0].isVector() && | ||
VTList.VTs[0].getVectorElementType() == MVT::i1 && | ||
VTList.VTs[1].getVectorElementType() == MVT::i1) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would make sense to switch all of them to work on scalars as well, but that can be a separate change...
Similar to what we already do for add/sub + saturation variants.
Scalar support will be added in a future patch covering the other variants at the same time.
Alive2: https://alive2.llvm.org/ce/z/rBDrNE
Fixes #69080