-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall #126736
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
asb
wants to merge
16
commits into
llvm:main
Choose a base branch
from
asb:2025q1-loopidiom-recognize-switch-to-memset-pattern-intrinsic
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+160
−83
Open
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall #126736
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
a052059
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather …
asb 6409a4b
Merge remote-tracking branch 'origin/main' into 2025q1-loopidiom-reco…
asb 59ecf00
Merge remote-tracking branch 'origin/main' into 2025q1-loopidiom-reco…
asb 809320f
Fix bit/bytes confusion in setting alignment
asb b561986
Merge remote-tracking branch 'origin/main' into 2025q1-loopidiom-reco…
asb 808e5c2
Excise unneeded logic for creating a globalvariable (which was still …
asb 756d7a4
Merge remote-tracking branch 'origin/main' into 2025q1-loopidiom-reco…
asb 763b40a
Remove unnecessary initialisation of std::optional
asb 48a9ed8
auto *CI and remove comment that's made redundant
asb 71e0307
Merge remote-tracking branch 'origin/main' into 2025q1-loopidiom-reco…
asb 07c8df2
Set alignment for dest ptr of emitted memset.pattern
asb 3e77fd8
Tweak description of ForceMemsetPatternIntrinsic
asb e693285
Merge remote-tracking branch 'origin/main' into 2025q1-loopidiom-reco…
asb 77ba4b4
Drop outdated comment
asb fbed3bc
Tweak comment as per Craig's suggestion
asb 07af8f7
Fix incorrectly resolved merge conflict
asb File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -149,6 +149,11 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( | |
"with -Os/-Oz"), | ||
cl::init(true), cl::Hidden); | ||
|
||
static cl::opt<bool> ForceMemsetPatternIntrinsic( | ||
"loop-idiom-force-memset-pattern-intrinsic", | ||
cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), | ||
cl::Hidden); | ||
|
||
namespace { | ||
|
||
class LoopIdiomRecognize { | ||
|
@@ -322,10 +327,15 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { | |
L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs; | ||
|
||
HasMemset = TLI->has(LibFunc_memset); | ||
// TODO: Unconditionally enable use of the memset pattern intrinsic (or at | ||
// least, opt-in via target hook) once we are confident it will never result | ||
// in worse codegen than without. For now, use it only when the target | ||
// supports memset_pattern16 libcall (or unless this is overridden by | ||
// command line option). | ||
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); | ||
HasMemcpy = TLI->has(LibFunc_memcpy); | ||
|
||
if (HasMemset || HasMemsetPattern || HasMemcpy) | ||
if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy) | ||
if (SE->hasLoopInvariantBackedgeTakenCount(L)) | ||
return runOnCountableLoop(); | ||
|
||
|
@@ -411,14 +421,12 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { | |
if (Size > 16) | ||
return nullptr; | ||
|
||
// If the constant is exactly 16 bytes, just use it. | ||
if (Size == 16) | ||
return C; | ||
// For now, don't handle types that aren't int, floats, or pointers. | ||
if (!isa<ConstantInt>(C) && !isa<ConstantFP>(C) && | ||
!isa<PointerType>(C->getType())) | ||
return nullptr; | ||
|
||
// Otherwise, we'll use an array of the constants. | ||
unsigned ArraySize = 16 / Size; | ||
ArrayType *AT = ArrayType::get(V->getType(), ArraySize); | ||
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); | ||
return C; | ||
} | ||
|
||
LoopIdiomRecognize::LegalStoreKind | ||
|
@@ -482,8 +490,9 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { | |
// It looks like we can use SplatValue. | ||
return LegalStoreKind::Memset; | ||
} | ||
if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset && | ||
// Don't create memset_pattern16s with address spaces. | ||
if (!UnorderedAtomic && (HasMemsetPattern || ForceMemsetPatternIntrinsic) && | ||
!DisableLIRP::Memset && | ||
// Don't create memset.pattern intrinsic calls with address spaces. | ||
StorePtr->getType()->getPointerAddressSpace() == 0 && | ||
getMemSetPatternValue(StoredVal, DL)) { | ||
// It looks like we can use PatternValue! | ||
|
@@ -1083,53 +1092,92 @@ bool LoopIdiomRecognize::processLoopStridedStore( | |
return Changed; | ||
|
||
// Okay, everything looks good, insert the memset. | ||
// MemsetArg is the number of bytes for the memset libcall, and the number | ||
// of pattern repetitions if the memset.pattern intrinsic is being used. | ||
Value *MemsetArg; | ||
std::optional<int64_t> BytesWritten; | ||
|
||
if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) { | ||
const SCEV *TripCountS = | ||
SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop); | ||
if (!Expander.isSafeToExpand(TripCountS)) | ||
return Changed; | ||
const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV); | ||
if (!ConstStoreSize) | ||
return Changed; | ||
Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy, | ||
Preheader->getTerminator()); | ||
uint64_t PatternRepsPerTrip = | ||
(ConstStoreSize->getValue()->getZExtValue() * 8) / | ||
DL->getTypeSizeInBits(PatternValue->getType()); | ||
// If ConstStoreSize is not equal to the width of PatternValue, then | ||
// MemsetArg is TripCount * (ConstStoreSize/PatternValueWidth). Else | ||
// MemSetArg is just TripCount. | ||
MemsetArg = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this just weird way of compute NumBytes? |
||
PatternRepsPerTrip == 1 | ||
? TripCount | ||
: Builder.CreateMul(TripCount, | ||
Builder.getIntN(IntIdxTy->getIntegerBitWidth(), | ||
PatternRepsPerTrip)); | ||
if (auto *CI = dyn_cast<ConstantInt>(TripCount)) | ||
BytesWritten = | ||
CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue(); | ||
} else { | ||
const SCEV *NumBytesS = | ||
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); | ||
|
||
const SCEV *NumBytesS = | ||
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); | ||
|
||
// TODO: ideally we should still be able to generate memset if SCEV expander | ||
// is taught to generate the dependencies at the latest point. | ||
if (!Expander.isSafeToExpand(NumBytesS)) | ||
return Changed; | ||
|
||
Value *NumBytes = | ||
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); | ||
// TODO: ideally we should still be able to generate memset if SCEV expander | ||
// is taught to generate the dependencies at the latest point. | ||
if (!Expander.isSafeToExpand(NumBytesS)) | ||
return Changed; | ||
MemsetArg = | ||
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); | ||
if (auto *CI = dyn_cast<ConstantInt>(MemsetArg)) | ||
BytesWritten = CI->getZExtValue(); | ||
} | ||
assert(MemsetArg && "MemsetArg should have been set"); | ||
|
||
if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) | ||
if (!SplatValue && !(ForceMemsetPatternIntrinsic || | ||
isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))) | ||
return Changed; | ||
|
||
AAMDNodes AATags = TheStore->getAAMetadata(); | ||
for (Instruction *Store : Stores) | ||
AATags = AATags.merge(Store->getAAMetadata()); | ||
if (auto CI = dyn_cast<ConstantInt>(NumBytes)) | ||
AATags = AATags.extendTo(CI->getZExtValue()); | ||
if (BytesWritten) | ||
AATags = AATags.extendTo(BytesWritten.value()); | ||
else | ||
AATags = AATags.extendTo(-1); | ||
|
||
CallInst *NewCall; | ||
if (SplatValue) { | ||
NewCall = Builder.CreateMemSet( | ||
BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), | ||
BasePtr, SplatValue, MemsetArg, MaybeAlign(StoreAlignment), | ||
/*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); | ||
} else { | ||
assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)); | ||
// Everything is emitted in default address space | ||
Type *Int8PtrTy = DestInt8PtrTy; | ||
|
||
StringRef FuncName = "memset_pattern16"; | ||
FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, | ||
Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); | ||
inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); | ||
|
||
// Otherwise we should form a memset_pattern16. PatternValue is known to be | ||
// an constant array of 16-bytes. Plop the value into a mergable global. | ||
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, | ||
GlobalValue::PrivateLinkage, | ||
PatternValue, ".memset_pattern"); | ||
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. | ||
GV->setAlignment(Align(16)); | ||
Value *PatternPtr = GV; | ||
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); | ||
assert(ForceMemsetPatternIntrinsic || | ||
isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)); | ||
assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size"); | ||
|
||
Value *PatternArg; | ||
IntegerType *PatternArgTy = | ||
Builder.getIntNTy(DL->getTypeSizeInBits(PatternValue->getType())); | ||
|
||
if (isa<ConstantInt>(PatternValue)) | ||
PatternArg = PatternValue; | ||
else if (isa<ConstantFP>(PatternValue)) | ||
PatternArg = Builder.CreateBitCast(PatternValue, PatternArgTy); | ||
else if (isa<PointerType>(PatternValue->getType())) | ||
PatternArg = Builder.CreatePtrToInt(PatternValue, PatternArgTy); | ||
else | ||
report_fatal_error("Unexpected PatternValue type"); | ||
|
||
NewCall = Builder.CreateIntrinsic(Intrinsic::experimental_memset_pattern, | ||
{DestInt8PtrTy, PatternArgTy, IntIdxTy}, | ||
{BasePtr, PatternArg, MemsetArg, | ||
ConstantInt::getFalse(M->getContext())}); | ||
if (StoreAlignment) | ||
cast<MemSetPatternInst>(NewCall)->setDestAlignment(*StoreAlignment); | ||
|
||
// Set the TBAA info if present. | ||
if (AATags.TBAA) | ||
|
@@ -1419,7 +1467,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( | |
AAMDNodes AATags = TheLoad->getAAMetadata(); | ||
AAMDNodes StoreAATags = TheStore->getAAMetadata(); | ||
AATags = AATags.merge(StoreAATags); | ||
if (auto CI = dyn_cast<ConstantInt>(NumBytes)) | ||
if (auto *CI = dyn_cast<ConstantInt>(NumBytes)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. precommit this please |
||
AATags = AATags.extendTo(CI->getZExtValue()); | ||
else | ||
AATags = AATags.extendTo(-1); | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals | ||
; RUN: opt -passes=loop-idiom -mtriple=riscv64 < %s -S | FileCheck %s | ||
; RUN: opt -passes=loop-idiom -mtriple=riscv64 -loop-idiom-force-memset-pattern-intrinsic < %s -S \ | ||
; RUN: | FileCheck -check-prefix=CHECK-INTRIN %s | ||
|
||
define dso_local void @double_memset(ptr nocapture %p) { | ||
; CHECK-LABEL: @double_memset( | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK: for.cond.cleanup: | ||
; CHECK-NEXT: ret void | ||
; CHECK: for.body: | ||
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] | ||
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 [[I_07]] | ||
; CHECK-NEXT: store double 3.141590e+00, ptr [[PTR1]], align 1 | ||
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 | ||
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 | ||
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] | ||
; | ||
; CHECK-INTRIN-LABEL: @double_memset( | ||
; CHECK-INTRIN-NEXT: entry: | ||
; CHECK-INTRIN-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr align 1 [[P:%.*]], i64 4614256650576692846, i64 16, i1 false) | ||
; CHECK-INTRIN-NEXT: br label [[FOR_BODY:%.*]] | ||
; CHECK-INTRIN: for.cond.cleanup: | ||
; CHECK-INTRIN-NEXT: ret void | ||
; CHECK-INTRIN: for.body: | ||
; CHECK-INTRIN-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] | ||
; CHECK-INTRIN-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]] | ||
; CHECK-INTRIN-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 | ||
; CHECK-INTRIN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 | ||
; CHECK-INTRIN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] | ||
; | ||
entry: | ||
br label %for.body | ||
|
||
for.cond.cleanup: | ||
ret void | ||
|
||
for.body: | ||
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] | ||
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07 | ||
store double 3.14159e+00, ptr %ptr1, align 1 | ||
%inc = add nuw nsw i64 %i.07, 1 | ||
%exitcond.not = icmp eq i64 %inc, 16 | ||
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body | ||
} | ||
;. | ||
; CHECK-INTRIN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } | ||
;. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's a bunch of complexity in this section of the change that I'm not getting. Since we need the NumBytes (in the original code) for the AAInfo, why not just leave that alone? Isn't count (for memset.pattern intrinsic) just the trip count of the loop by definition?