From f0450d0a2c614618813ee8d0b269d9f987c001d8 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Tue, 2 Jul 2024 18:03:57 +0100 Subject: [PATCH 1/2] [AArch64] All bits of an exact right shift are demanded When building a vector which contains zero elements, the AArch64 ISel replaces those elements with `undef`, if they are right shifted out. However, these elements need to stay zero if the right shift is exact, or otherwise we will be introducing undefined behavior. Change-Id: I8d7eb1964ebbe88a90568be7805a29a72edd89e1 --- .../Target/AArch64/AArch64ISelLowering.cpp | 4 ++++ .../AArch64/ashr-exact-demanded-bits.ll | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/ashr-exact-demanded-bits.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e0c3cc5eddb82..c7b2a21e6ed58 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22142,6 +22142,10 @@ static SDValue performVectorShiftCombine(SDNode *N, if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm) return Op.getOperand(0); + // If the right shift is exact, the shifted out bits matter. 
+ if (N->getOpcode() == AArch64ISD::VASHR && N->getFlags().hasExact()) + return SDValue(); + APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm); APInt DemandedMask = ~ShiftedOutBits; diff --git a/llvm/test/CodeGen/AArch64/ashr-exact-demanded-bits.ll b/llvm/test/CodeGen/AArch64/ashr-exact-demanded-bits.ll new file mode 100644 index 0000000000000..9f877dffe1ab5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ashr-exact-demanded-bits.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s +target triple = "aarch64-linux" + +define <2 x float> @f(i8 %0, i8 %1) { +; CHECK-LABEL: f: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov v0.b[3], w0 +; CHECK-NEXT: mov v0.b[7], w1 +; CHECK-NEXT: scvtf v0.2s, v0.2s, #24 +; CHECK-NEXT: ret + %3 = insertelement <2 x i8> poison, i8 %0, i64 0 + %4 = insertelement <2 x i8> %3, i8 %1, i64 1 + %5 = shufflevector <2 x i8> %4, <2 x i8> <i8 0, i8 poison>, <8 x i32> <i32 2, i32 2, i32 2, i32 0, i32 2, i32 2, i32 2, i32 1> + %6 = bitcast <8 x i8> %5 to <2 x i32> + %7 = ashr exact <2 x i32> %6, <i32 24, i32 24> + %8 = sitofp <2 x i32> %7 to <2 x float> + ret <2 x float> %8 +} From 647c4c1877dda5a1ccb25e9a9748374ae1d505ab Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Wed, 3 Jul 2024 13:07:16 +0100 Subject: [PATCH 2/2] [fixup] Do the same for logical shift right too Change-Id: I5ac39bf06434cb4029ec8482bd54b2f756bb6af8 --- .../Target/AArch64/AArch64ISelLowering.cpp | 4 +-- .../AArch64/ashr-exact-demanded-bits.ll | 20 ----------- .../AArch64/shr-exact-demanded-bits.ll | 35 +++++++++++++++++++ 3 files changed, 37 insertions(+), 22 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/ashr-exact-demanded-bits.ll create mode 100644 llvm/test/CodeGen/AArch64/shr-exact-demanded-bits.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c7b2a21e6ed58..341cf51173ccc 100644 --- 
a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22142,8 +22142,8 @@ static SDValue performVectorShiftCombine(SDNode *N, if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm) return Op.getOperand(0); - // If the right shift is exact, the shifted out bits matter. - if (N->getOpcode() == AArch64ISD::VASHR && N->getFlags().hasExact()) + // If the shift is exact, the shifted out bits matter. + if (N->getFlags().hasExact()) return SDValue(); APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm); diff --git a/llvm/test/CodeGen/AArch64/ashr-exact-demanded-bits.ll b/llvm/test/CodeGen/AArch64/ashr-exact-demanded-bits.ll deleted file mode 100644 index 9f877dffe1ab5..0000000000000 --- a/llvm/test/CodeGen/AArch64/ashr-exact-demanded-bits.ll +++ /dev/null @@ -1,20 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s | FileCheck %s -target triple = "aarch64-linux" - -define <2 x float> @f(i8 %0, i8 %1) { -; CHECK-LABEL: f: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.b[3], w0 -; CHECK-NEXT: mov v0.b[7], w1 -; CHECK-NEXT: scvtf v0.2s, v0.2s, #24 -; CHECK-NEXT: ret - %3 = insertelement <2 x i8> poison, i8 %0, i64 0 - %4 = insertelement <2 x i8> %3, i8 %1, i64 1 - %5 = shufflevector <2 x i8> %4, <2 x i8> <i8 0, i8 poison>, <8 x i32> <i32 2, i32 2, i32 2, i32 0, i32 2, i32 2, i32 2, i32 1> - %6 = bitcast <8 x i8> %5 to <2 x i32> - %7 = ashr exact <2 x i32> %6, <i32 24, i32 24> - %8 = sitofp <2 x i32> %7 to <2 x float> - ret <2 x float> %8 -} diff --git a/llvm/test/CodeGen/AArch64/shr-exact-demanded-bits.ll b/llvm/test/CodeGen/AArch64/shr-exact-demanded-bits.ll new file mode 100644 index 0000000000000..9698626aea655 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/shr-exact-demanded-bits.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s +target triple = "aarch64-linux" + +define <2 x 
i32> @f(i8 %0, i8 %1) { +; CHECK-LABEL: f: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov v0.b[3], w0 +; CHECK-NEXT: mov v0.b[7], w1 +; CHECK-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-NEXT: ret + %3 = insertelement <2 x i8> poison, i8 %0, i64 0 + %4 = insertelement <2 x i8> %3, i8 %1, i64 1 + %5 = shufflevector <2 x i8> %4, <2 x i8> <i8 0, i8 poison>, <8 x i32> <i32 2, i32 2, i32 2, i32 0, i32 2, i32 2, i32 2, i32 1> + %6 = bitcast <8 x i8> %5 to <2 x i32> + %7 = ashr exact <2 x i32> %6, <i32 24, i32 24> + ret <2 x i32> %7 +} + +define <2 x i32> @g(i8 %0, i8 %1) { +; CHECK-LABEL: g: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov v0.b[3], w0 +; CHECK-NEXT: mov v0.b[7], w1 +; CHECK-NEXT: ushr v0.2s, v0.2s, #24 +; CHECK-NEXT: ret + %3 = insertelement <2 x i8> poison, i8 %0, i64 0 + %4 = insertelement <2 x i8> %3, i8 %1, i64 1 + %5 = shufflevector <2 x i8> %4, <2 x i8> <i8 0, i8 poison>, <8 x i32> <i32 2, i32 2, i32 2, i32 0, i32 2, i32 2, i32 2, i32 1> + %6 = bitcast <8 x i8> %5 to <2 x i32> + %7 = lshr exact <2 x i32> %6, <i32 24, i32 24> + ret <2 x i32> %7 +}