[llvm][SPIRV] Expose fast popcnt support for SPIR-V targets#109845
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-spir-v Author: Alex Voicu (AlexVlx) ChangesThis adds the TTI predicate for conveying the availability of fast Full diff: https://github.com/llvm/llvm-project/pull/109845.diff 4 Files Affected:
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
index 2fbb4381da2637..7a64aa81218a89 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
@@ -24,6 +24,7 @@
namespace llvm {
class SPIRVTTIImpl : public BasicTTIImplBase<SPIRVTTIImpl> {
using BaseT = BasicTTIImplBase<SPIRVTTIImpl>;
+ using TTI = TargetTransformInfo;
friend BaseT;
@@ -37,6 +38,15 @@ class SPIRVTTIImpl : public BasicTTIImplBase<SPIRVTTIImpl> {
explicit SPIRVTTIImpl(const SPIRVTargetMachine *TM, const Function &F)
: BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
+
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
+ // SPIR-V natively supports OpBitcount, per 3.53.14 in the spec, as such it
+ // is reasonable to assume the Op is fast / preferable to the expanded loop.
+ // Furthermore, this prevents information being lost if transforms are
+ // applied to SPIR-V before lowering to a concrete target.
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ return TTI::PSK_FastHardware;
+ }
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll b/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll
index 5f49d6c05ae4ed..f030d188b890ef 100644
--- a/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll
+++ b/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll
@@ -1,4 +1,5 @@
; RUN: opt -passes=loop-idiom -mtriple=amdgcn-- -S < %s | FileCheck %s
+; RUN: opt -passes=loop-idiom -mtriple=spirv64-amd-amdhsa -S < %s | FileCheck %s
; Mostly copied from x86 version.
diff --git a/llvm/test/Transforms/LoopIdiom/SPIRV/lit.local.cfg b/llvm/test/Transforms/LoopIdiom/SPIRV/lit.local.cfg
new file mode 100644
index 00000000000000..78dd74cd6dc634
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/SPIRV/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "SPIRV" in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/LoopIdiom/SPIRV/popcnt.ll b/llvm/test/Transforms/LoopIdiom/SPIRV/popcnt.ll
new file mode 100644
index 00000000000000..661e40dd96ddf8
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/SPIRV/popcnt.ll
@@ -0,0 +1,128 @@
+; RUN: opt -passes=loop-idiom -mtriple=spirv32-- -S < %s | FileCheck %s
+; RUN: opt -passes=loop-idiom -mtriple=spirv64-- -S < %s | FileCheck %s
+
+; Mostly copied from x86 version.
+
+;To recognize this pattern:
+;int popcount(unsigned long long a) {
+; int c = 0;
+; while (a) {
+; c++;
+; a &= a - 1;
+; }
+; return c;
+;}
+;
+
+; CHECK-LABEL: @popcount_i64
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount_i64(i64 %a) nounwind uwtable readnone ssp {
+entry:
+ %tobool3 = icmp eq i64 %a, 0
+ br i1 %tobool3, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+ %inc = add nsw i32 %c.05, 1
+ %sub = add i64 %a.addr.04, -1
+ %and = and i64 %sub, %a.addr.04
+ %tobool = icmp eq i64 %and, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+ ret i32 %c.0.lcssa
+}
+
+; CHECK-LABEL: @popcount_i32
+; CHECK: entry
+; CHECK: llvm.ctpop.i32
+; CHECK: ret
+define i32 @popcount_i32(i32 %a) nounwind uwtable readnone ssp {
+entry:
+ %tobool3 = icmp eq i32 %a, 0
+ br i1 %tobool3, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %a.addr.04 = phi i32 [ %and, %while.body ], [ %a, %entry ]
+ %inc = add nsw i32 %c.05, 1
+ %sub = add i32 %a.addr.04, -1
+ %and = and i32 %sub, %a.addr.04
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+ ret i32 %c.0.lcssa
+}
+
+; CHECK-LABEL: @popcount_i128
+; CHECK: entry
+; CHECK: llvm.ctpop.i128
+; CHECK: ret
+define i32 @popcount_i128(i128 %a) nounwind uwtable readnone ssp {
+entry:
+ %tobool3 = icmp eq i128 %a, 0
+ br i1 %tobool3, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %a.addr.04 = phi i128 [ %and, %while.body ], [ %a, %entry ]
+ %inc = add nsw i32 %c.05, 1
+ %sub = add i128 %a.addr.04, -1
+ %and = and i128 %sub, %a.addr.04
+ %tobool = icmp eq i128 %and, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+ ret i32 %c.0.lcssa
+}
+
+; To recognize this pattern:
+;int popcount(unsigned long long a, int mydata1, int mydata2) {
+; int c = 0;
+; while (a) {
+; c++;
+; a &= a - 1;
+; mydata1 *= c;
+; mydata2 *= (int)a;
+; }
+; return c + mydata1 + mydata2;
+;}
+
+; CHECK-LABEL: @popcount2
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount2(i64 %a, i32 %mydata1, i32 %mydata2) nounwind uwtable readnone ssp {
+entry:
+ %tobool9 = icmp eq i64 %a, 0
+ br i1 %tobool9, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.013 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %mydata2.addr.012 = phi i32 [ %mul1, %while.body ], [ %mydata2, %entry ]
+ %mydata1.addr.011 = phi i32 [ %mul, %while.body ], [ %mydata1, %entry ]
+ %a.addr.010 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+ %inc = add nsw i32 %c.013, 1
+ %sub = add i64 %a.addr.010, -1
+ %and = and i64 %sub, %a.addr.010
+ %mul = mul nsw i32 %inc, %mydata1.addr.011
+ %conv = trunc i64 %and to i32
+ %mul1 = mul nsw i32 %conv, %mydata2.addr.012
+ %tobool = icmp eq i64 %and, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+ %mydata2.addr.0.lcssa = phi i32 [ %mydata2, %entry ], [ %mul1, %while.body ]
+ %mydata1.addr.0.lcssa = phi i32 [ %mydata1, %entry ], [ %mul, %while.body ]
+ %add = add i32 %mydata2.addr.0.lcssa, %mydata1.addr.0.lcssa
+ %add2 = add i32 %add, %c.0.lcssa
+ ret i32 %add2
+}
|
| : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)), | ||
| TLI(ST->getTargetLowering()) {} | ||
|
|
||
| TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { |
There was a problem hiding this comment.
TTI probably should have a better default checking if the operation is legal. It's also not great that TargetLowering also has isCtpopFast
arsenm
left a comment
There was a problem hiding this comment.
LGTM, but really we should fix the defaults in the BasicTTIImpl
|
The first concern coming to my mind is that we would need to have a test case with LIT CHECK's of the output SPIR-V to see/validate how ctpop and G_CTPOP are being lowered. We have test/CodeGen/SPIRV/llvm-intrinsics/ctpop.ll, but I wonder would it make sense to add a full-blown test case from unoptimized llvm ir to spir-v, what do you think? |
I think this makes sense, can never have too many tests; we can probably mirror the |
I'll pick your brains over this because I hadn't thought about this at all before you mentioned it:) |
AlexVlx
left a comment
There was a problem hiding this comment.
The first concern coming to my mind is that we would need to have a test case with LIT CHECK's of the output SPIR-V to see/validate how ctpop and G_CTPOP are being lowered. We have test/CodeGen/SPIRV/llvm-intrinsics/ctpop.ll, but I wonder would it make sense to add a full-blown test case from unoptimized llvm ir to spir-v, what do you think?
Please see llvm/test/CodeGen/SPIRV/optimizations/recognize-popcnt-loop.ll for an attempt to do this.
| @@ -0,0 +1,114 @@ | |||
| ; RUN: opt -O3 -mtriple=spirv32-- %s -o - | llc -O3 -mtriple=spirv32-- -o - | FileCheck %s | |||
| ; RUN: %if spirv-tools %{ opt -O3 -mtriple=spirv32-- %s -o - | llc -O3 -mtriple=spirv32-- -o - -filetype=obj | spirv-val %} | |||
There was a problem hiding this comment.
As a nit, it'd be useful to run this as llc -verify-machineinstrs ....
There was a problem hiding this comment.
This is actually a great idea, because it tickles something that looks like a notable bug - that insertion is incorrect, we should skip over PHIs if the successor is a PHI. However, doing that doesn't quite solve it, as vreg lifetimes get messed up and verification still fails. I'll need to spend a bit more time to understand the transform. This can be verified, independently from, this patch, by using -verify-machineinstrs on e.g. /llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll. So this doesn't make things worse - we could merge it and deal with the issue separately, I reckon.
There was a problem hiding this comment.
that insertion is incorrect
yes, this has already been addressed in #109660 (
llvm-project/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
Lines 344 to 366 in 01c7776
vreg lifetimes get messed up and verification still fails
I didn't see the case to be sure, but I'd guess that it's the same problem as we're discussing currently within #110019
|
LGTM overall, however, did you check failed Github action logs? I noticed that fails are related to this PR: |
I have looked at the failure, it's rather odd; the log would suggest that the host's |
Please don't. The codegen test should only cover codegen of the intrinsic. Adding in the extra steps of pattern matching the intrinsic in a far removed optimization pass does not improve coverage |
This makes sense as well, thank you. I'm ok with any of options. |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/51/builds/4464 Here is the relevant piece of the build log for the reference |
Adds the following patches AMDGPU: Remove wavefrontsize64 feature from dummy target llvm#117410 [LLVM][NFC] Use used's element type if available llvm#116804 [llvm][AMDGPU] Fold llvm.amdgcn.wavefrontsize early llvm#114481 [clang][Driver][HIP] Add support for mixing AMDGCNSPIRV & concrete offload-archs. llvm#113509 [clang][llvm][SPIR-V] Explicitly encode native integer widths for SPIR-V llvm#110695 [llvm][opt][Transforms] Replacement calloc should match replaced malloc llvm#110524 [clang][HIP] Don't use the OpenCLKernel CC when targeting AMDGCNSPIRV llvm#110447 [cuda][HIP] constant should imply constant llvm#110182 [llvm][SPIRV] Expose fast popcnt support for SPIR-V targets llvm#109845 [clang][CodeGen][SPIR-V] Fix incorrect SYCL usage, implement missing interface llvm#109415 [SPIRV][RFC] Rework / extend support for memory scopes llvm#106429 [clang][CodeGen][SPIR-V][AMDGPU] Tweak AMDGCNSPIRV ABI to allow for the correct handling of aggregates passed to kernels / functions. llvm#102776 Change-Id: I2b9ab54aba1c9345b9b0eb84409e6ed6c3cdb6cd
This adds the TTI predicate for conveying the availability of fast
popcnt, which subsequently allows passes likeLoopIdiomRecognizeto collapse known popcount patterns. Since SPIR-V natively exposesOpBitcount, it seems preferable to compress the resulting code, and retain the information, even if a concrete target might have to expand back into a loop structure.