-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall #126736
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall #126736
Changes from all commits
a052059
6409a4b
59ecf00
809320f
b561986
808e5c2
756d7a4
763b40a
48a9ed8
71e0307
07c8df2
3e77fd8
e693285
77ba4b4
fbed3bc
07af8f7
b70871c
23dcffe
13ae5b8
e6bb52c
c15584d
c178f9f
7a5495d
d8b4ae1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -150,6 +150,11 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( | |
"with -Os/-Oz"), | ||
cl::init(true), cl::Hidden); | ||
|
||
static cl::opt<bool> ForceMemsetPatternIntrinsic( | ||
"loop-idiom-force-memset-pattern-intrinsic", | ||
cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), | ||
cl::Hidden); | ||
|
||
namespace { | ||
|
||
class LoopIdiomRecognize { | ||
|
@@ -323,10 +328,15 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { | |
L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs; | ||
|
||
HasMemset = TLI->has(LibFunc_memset); | ||
// TODO: Unconditionally enable use of the memset pattern intrinsic (or at | ||
// least, opt-in via target hook) once we are confident it will never result | ||
// in worse codegen than without. For now, use it only when the target | ||
// supports memset_pattern16 libcall (or unless this is overridden by | ||
// command line option). | ||
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); | ||
HasMemcpy = TLI->has(LibFunc_memcpy); | ||
|
||
if (HasMemset || HasMemsetPattern || HasMemcpy) | ||
if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy) | ||
if (SE->hasLoopInvariantBackedgeTakenCount(L)) | ||
return runOnCountableLoop(); | ||
|
||
|
@@ -411,14 +421,12 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { | |
if (Size > 16) | ||
return nullptr; | ||
|
||
// If the constant is exactly 16 bytes, just use it. | ||
if (Size == 16) | ||
return C; | ||
// For now, don't handle types that aren't int, floats, or pointers. | ||
if (!isa<ConstantInt>(C) && !isa<ConstantFP>(C) && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better check the type for these, otherwise it's going to implicitly allow vector splats in the future... |
||
!isa<PointerType>(C->getType())) | ||
return nullptr; | ||
|
||
// Otherwise, we'll use an array of the constants. | ||
unsigned ArraySize = 16 / Size; | ||
ArrayType *AT = ArrayType::get(V->getType(), ArraySize); | ||
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); | ||
return C; | ||
} | ||
|
||
LoopIdiomRecognize::LegalStoreKind | ||
|
@@ -479,7 +487,8 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { | |
// It looks like we can use SplatValue. | ||
return LegalStoreKind::Memset; | ||
} | ||
if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset && | ||
if (!UnorderedAtomic && (HasMemsetPattern || ForceMemsetPatternIntrinsic) && | ||
!DisableLIRP::Memset && | ||
// Don't create memset_pattern16s with address spaces. | ||
StorePtr->getType()->getPointerAddressSpace() == 0 && | ||
getMemSetPatternValue(StoredVal, DL)) { | ||
|
@@ -1061,50 +1070,81 @@ bool LoopIdiomRecognize::processLoopStridedStore( | |
return Changed; | ||
|
||
// Okay, everything looks good, insert the memset. | ||
Value *SplatValue = isBytewiseValue(StoredVal, *DL); | ||
Constant *PatternValue = nullptr; | ||
if (!SplatValue) | ||
PatternValue = getMemSetPatternValue(StoredVal, DL); | ||
|
||
// MemsetArg is the number of bytes for the memset libcall, and the number | ||
// of pattern repetitions if the memset.pattern intrinsic is being used. | ||
Value *MemsetArg; | ||
std::optional<int64_t> BytesWritten; | ||
|
||
if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) { | ||
const SCEV *TripCountS = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a bunch of complexity in this section of the change that I'm not getting. Since we need the NumBytes (in the original code) for the AAInfo, why not just leave that alone? Isn't count (for memset.pattern intrinsic) just the trip count of the loop by definition? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still think this might be reworked, but we're down to little enough code here I'm okay with that being a followup. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the problem is that there may be multiple stores in one loop iteration? |
||
SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop); | ||
if (!Expander.isSafeToExpand(TripCountS)) | ||
return Changed; | ||
const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV); | ||
if (!ConstStoreSize) | ||
return Changed; | ||
Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy, | ||
Preheader->getTerminator()); | ||
uint64_t PatternRepsPerTrip = | ||
(ConstStoreSize->getValue()->getZExtValue() * 8) / | ||
DL->getTypeSizeInBits(PatternValue->getType()); | ||
// If ConstStoreSize is not equal to the width of PatternValue, then | ||
// MemsetArg is TripCount * (ConstStoreSize/PatternValueWidth). Else | ||
// MemSetArg is just TripCount. | ||
MemsetArg = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this just weird way of compute NumBytes? |
||
PatternRepsPerTrip == 1 | ||
? TripCount | ||
: Builder.CreateMul(TripCount, | ||
Builder.getIntN(IntIdxTy->getIntegerBitWidth(), | ||
PatternRepsPerTrip)); | ||
if (auto *CI = dyn_cast<ConstantInt>(TripCount)) | ||
BytesWritten = | ||
CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue(); | ||
|
||
const SCEV *NumBytesS = | ||
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); | ||
|
||
// TODO: ideally we should still be able to generate memset if SCEV expander | ||
// is taught to generate the dependencies at the latest point. | ||
if (!Expander.isSafeToExpand(NumBytesS)) | ||
return Changed; | ||
} else { | ||
const SCEV *NumBytesS = | ||
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); | ||
|
||
Value *NumBytes = | ||
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); | ||
// TODO: ideally we should still be able to generate memset if SCEV expander | ||
// is taught to generate the dependencies at the latest point. | ||
if (!Expander.isSafeToExpand(NumBytesS)) | ||
return Changed; | ||
MemsetArg = | ||
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); | ||
if (auto *CI = dyn_cast<ConstantInt>(MemsetArg)) | ||
BytesWritten = CI->getZExtValue(); | ||
} | ||
assert(MemsetArg && "MemsetArg should have been set"); | ||
|
||
AAMDNodes AATags = TheStore->getAAMetadata(); | ||
for (Instruction *Store : Stores) | ||
AATags = AATags.merge(Store->getAAMetadata()); | ||
if (auto CI = dyn_cast<ConstantInt>(NumBytes)) | ||
AATags = AATags.extendTo(CI->getZExtValue()); | ||
if (BytesWritten) | ||
AATags = AATags.extendTo(BytesWritten.value()); | ||
else | ||
AATags = AATags.extendTo(-1); | ||
|
||
CallInst *NewCall; | ||
if (Value *SplatValue = isBytewiseValue(StoredVal, *DL)) { | ||
NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, | ||
if (SplatValue) { | ||
NewCall = Builder.CreateMemSet(BasePtr, SplatValue, MemsetArg, | ||
MaybeAlign(StoreAlignment), | ||
/*isVolatile=*/false, AATags); | ||
} else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) { | ||
// Everything is emitted in default address space | ||
Type *Int8PtrTy = DestInt8PtrTy; | ||
|
||
StringRef FuncName = "memset_pattern16"; | ||
FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, | ||
Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); | ||
inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); | ||
|
||
// Otherwise we should form a memset_pattern16. PatternValue is known to be | ||
// an constant array of 16-bytes. Plop the value into a mergable global. | ||
Constant *PatternValue = getMemSetPatternValue(StoredVal, DL); | ||
assert(PatternValue && "Expected pattern value."); | ||
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, | ||
GlobalValue::PrivateLinkage, | ||
PatternValue, ".memset_pattern"); | ||
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. | ||
GV->setAlignment(Align(16)); | ||
NewCall = Builder.CreateCall(MSP, {BasePtr, GV, NumBytes}); | ||
} else if (ForceMemsetPatternIntrinsic || | ||
isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) { | ||
assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size"); | ||
|
||
NewCall = Builder.CreateIntrinsic( | ||
Intrinsic::experimental_memset_pattern, | ||
{DestInt8PtrTy, PatternValue->getType(), IntIdxTy}, | ||
{BasePtr, PatternValue, MemsetArg, | ||
ConstantInt::getFalse(M->getContext())}); | ||
if (StoreAlignment) | ||
cast<MemSetPatternInst>(NewCall)->setDestAlignment(*StoreAlignment); | ||
NewCall->setAAMetadata(AATags); | ||
} else { | ||
// Neither a memset, nor memset_pattern16 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals | ||
; RUN: opt -passes=loop-idiom -mtriple=riscv64 < %s -S | FileCheck %s | ||
; RUN: opt -passes=loop-idiom -mtriple=riscv64 -loop-idiom-force-memset-pattern-intrinsic < %s -S \ | ||
; RUN: | FileCheck -check-prefix=CHECK-INTRIN %s | ||
|
||
define dso_local void @double_memset(ptr nocapture %p) { | ||
; CHECK-LABEL: @double_memset( | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK: for.cond.cleanup: | ||
; CHECK-NEXT: ret void | ||
; CHECK: for.body: | ||
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] | ||
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 [[I_07]] | ||
; CHECK-NEXT: store double 3.141590e+00, ptr [[PTR1]], align 1 | ||
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 | ||
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 | ||
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] | ||
; | ||
; CHECK-INTRIN-LABEL: @double_memset( | ||
; CHECK-INTRIN-NEXT: entry: | ||
; CHECK-INTRIN-NEXT: call void @llvm.experimental.memset.pattern.p0.f64.i64(ptr align 1 [[P:%.*]], double 3.141590e+00, i64 16, i1 false) | ||
; CHECK-INTRIN-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK-INTRIN: for.cond.cleanup: | ||
; CHECK-INTRIN-NEXT: ret void | ||
; CHECK-INTRIN: for.body: | ||
; CHECK-INTRIN-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] | ||
; CHECK-INTRIN-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]] | ||
; CHECK-INTRIN-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 | ||
; CHECK-INTRIN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 | ||
; CHECK-INTRIN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] | ||
; | ||
entry: | ||
br label %for.body | ||
|
||
for.cond.cleanup: | ||
ret void | ||
|
||
for.body: | ||
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] | ||
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07 | ||
store double 3.14159e+00, ptr %ptr1, align 1 | ||
%inc = add nuw nsw i64 %i.07, 1 | ||
%exitcond.not = icmp eq i64 %inc, 16 | ||
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body | ||
} | ||
;. | ||
; CHECK-INTRIN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } | ||
;. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,8 +7,6 @@ target triple = "x86_64-apple-darwin10.0.0" | |
;. | ||
; CHECK: @G = global i32 5 | ||
; CHECK: @g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16 | ||
; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 1, i32 1, i32 1, i32 1], align 16 | ||
; CHECK: @.memset_pattern.1 = private unnamed_addr constant [2 x ptr] [ptr @G, ptr @G], align 16 | ||
;. | ||
define void @test1(ptr %Base, i64 %Size) nounwind ssp { | ||
; CHECK-LABEL: @test1( | ||
|
@@ -533,7 +531,7 @@ for.end13: ; preds = %for.inc10 | |
define void @test11_pattern(ptr nocapture %P) nounwind ssp { | ||
; CHECK-LABEL: @test11_pattern( | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: call void @memset_pattern16(ptr [[P:%.*]], ptr @.memset_pattern, i64 40000) | ||
; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr align 4 [[P:%.*]], i32 1, i64 10000, i1 false) | ||
; CHECK-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK: for.body: | ||
; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] | ||
|
@@ -596,7 +594,7 @@ for.end: ; preds = %for.body | |
define void @test13_pattern(ptr nocapture %P) nounwind ssp { | ||
; CHECK-LABEL: @test13_pattern( | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: call void @memset_pattern16(ptr [[P:%.*]], ptr @.memset_pattern.1, i64 80000) | ||
; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.p0.i64(ptr align 4 [[P:%.*]], ptr @G, i64 10000, i1 false) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to note, you dropped the pointer to int conversion from the prior version of the patch, and thus until a reworked #138559 lands we have an aliasing imprecision here. I'm explicitly okay with that being true for the landed patch, and will follow up on my patch post-commit. |
||
; CHECK-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK: for.body: | ||
; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would mildly suggest dropping this cl::opt from the final patch. We probably should have a generic means to force enable a TLI function, but having one only for memset.pattern seems slightly odd. Not a strongly held view, will leave to your discretion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do have a patch somewhere that adds an opt flag to do this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#145808